In [8]:
# Libraries used throughout the notebook: the stdlib `warnings` module,
# then numpy and pandas.
import warnings

import numpy as np
import pandas as pd

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Display options: show up to 1000 columns, never truncate cell contents,
# and render floats with thousands separators and no decimals.
pd.set_option('display.max_columns', 1000)
# `None` means "no truncation". The old sentinel -1 was deprecated in
# pandas 1.0 and is rejected with a ValueError by later releases.
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format = '{:,.0f}'.format

In [None]:
# Read the JSON export into a DataFrame; `lines=True` tells pandas the file
# holds one JSON object per line.
file = 'disresort3.json'
df = pd.read_json(path_or_buf=file, orient='columns', lines=True)

In [None]:
# Keep only the columns needed for the sentiment analysis.
df = df.loc[:, [
    'text',
    'id',
    'lang',
    'created_at',
    'user',
    'source',
    'retweeted_status',
    'extended_tweet',
    'entities',
]]

In [None]:
# Flatten selected fields of the nested `retweeted_status` dictionary into
# top-level `rt_*` columns. Rows whose `retweeted_status` is not a dict
# get NaN in every `rt_*` column.
# Maps new column name -> key inside the `retweeted_status` dict; the
# insertion order is the order in which the columns are added to `df`.
retweet_fields = {
    'rt_created_at': 'created_at',
    'rt_id': 'id',
    'rt_text': 'text',
    'rt_source': 'source',
    'rt_user': 'user',
    'rt_retweet_count': 'retweet_count',
    'rt_favorite_count': 'favorite_count',
    'rt_lang': 'lang',
}

for rt_column, rt_key in retweet_fields.items():
    # `.get` yields None when the key is absent from the dict.
    df[rt_column] = [
        status.get(rt_key) if isinstance(status, dict) else np.nan
        for status in df['retweeted_status']
    ]

In [None]:
# Flatten selected fields of the nested `rt_user` dictionary (the `user`
# entry of the retweeted status) into top-level `rt_user_*` columns.
# Rows whose `rt_user` is not a dict get NaN in every `rt_user_*` column.
# Maps new column name -> key inside the `rt_user` dict; the insertion
# order is the order in which the columns are added to `df`.
rt_user_fields = {
    'rt_user_id': 'id',
    'rt_user_name': 'name',
    'rt_user_screen_name': 'screen_name',
    'rt_user_location': 'location',
    'rt_user_follower_count': 'followers_count',
    'rt_user_friends_count': 'friends_count',
    'rt_user_listed_count': 'listed_count',
    'rt_user_favorites_count': 'favourites_count',
    'rt_user_statuses_count': 'statuses_count',
    'rt_user_created_at': 'created_at',
    'rt_user_description': 'description',
}

for rt_user_column, rt_user_key in rt_user_fields.items():
    # `.get` yields None when the key is absent from the dict.
    df[rt_user_column] = [
        rt_user.get(rt_user_key) if isinstance(rt_user, dict) else np.nan
        for rt_user in df['rt_user']
    ]

In [None]:
# Pull the full text and the entities out of the nested `extended_tweet`
# dictionary; rows where `extended_tweet` is not a dict get NaN.
df['ex_tw_full_text'] = [
    np.nan if type(tweet) != dict else tweet.get('full_text')
    for tweet in df['extended_tweet']
]
df['ex_tw_entities'] = [
    np.nan if type(tweet) != dict else tweet.get('entities')
    for tweet in df['extended_tweet']
]

In [None]:
# Flatten selected fields of the nested `user` dictionary into top-level
# `user_*` columns.
# Rows whose `user` is not a dict get NaN instead of raising a TypeError,
# matching how the `retweeted_status` / `extended_tweet` cells treat
# missing nested data.
# Maps new column name -> key inside the `user` dict; the insertion order
# is the order in which the columns are added to `df`.
user_fields = {
    'user_id': 'id',
    'user_screen_name': 'screen_name',
    'user_location': 'location',
    'user_description': 'description',
    'user_fol_count': 'followers_count',
    'user_fr_count': 'friends_count',
    'user_fav_count': 'favourites_count',
    'user_status_count': 'statuses_count',
    'user_created_at': 'created_at',
    'user_listed_count': 'listed_count',
}

for user_column, user_key in user_fields.items():
    # A plain list is enough here; pandas infers the column dtype itself,
    # so the previous np.array(...) wrapper was redundant.
    df[user_column] = [
        user.get(user_key) if isinstance(user, dict) else np.nan
        for user in df['user']
    ]
In [None]:
# Clean up the source columns: each value is parsed as HTML and reduced to
# its visible text.
from bs4 import BeautifulSoup

# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed (and so bs4 does not have to
# guess one). Non-string values (e.g. NaN) are passed through as NaN
# instead of being handed to BeautifulSoup.
df['source'] = [
    BeautifulSoup(text, 'html.parser').get_text() if isinstance(text, str)
    else np.nan
    for text in df['source']
]

# Clean rt_source.
# The previous guard `text != np.nan` was always True (NaN never compares
# equal to anything), and missing values had first been replaced by '',
# so rows without a retweet ended up with '' rather than NaN. Testing for
# `str` keeps those rows as NaN.
df['rt_source'] = [
    BeautifulSoup(text, 'html.parser').get_text() if isinstance(text, str)
    else np.nan
    for text in df['rt_source']
]

In [None]:
# Convert the timestamp columns to pandas datetimes.
# NOTE(review): `user_created_at` is not converted here, unlike
# `rt_user_created_at` - confirm whether that is intentional.
for timestamp_column in ('created_at', 'rt_created_at', 'rt_user_created_at'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

In [None]:
# Extract #hashtags and @mentions from text, rt_text and ex_tw_full_text
# into list-valued `<column>_hashtags` / `<column>_mentions` columns.
#
# `\w+` keeps only the tag / handle itself, so trailing punctuation is no
# longer captured (e.g. "RT @user: ..." now yields "@user", not "@user:").
# The `(?<!\w)` lookbehind skips a '#' or '@' that sits in the middle of a
# word, such as the '@' inside an e-mail address.
hashtag_pattern = r'(?<!\w)#\w+'
mention_pattern = r'(?<!\w)@\w+'
text_columns = ('text', 'rt_text', 'ex_tw_full_text')

# #hashtags
for text_column in text_columns:
    df[text_column + '_hashtags'] = df[text_column].str.findall(hashtag_pattern)

# @mentions
for text_column in text_columns:
    df[text_column + '_mentions'] = df[text_column].str.findall(mention_pattern)

In [None]:
# Remove the unnecessary columns (the raw nested dictionaries).
df = df.drop(columns=[
    'user',
    'rt_user',
    'retweeted_status',
    'extended_tweet',
    'entities',
    'ex_tw_entities',
])

In [None]:
# Save the cleaned data as a CSV file, without writing the row index.
df.to_csv(path_or_buf='cleaned_sample_dis.csv', index=False)