###### Import all modules used to clean the data
* pandas
* numpy
* warnings
* bs4
* `pd.set_option` display settings, so wide frames and long text cells are shown without truncation

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup

# Display configuration for inspecting wide frames interactively.
# Show up to 1000 columns before pandas starts eliding them.
pd.set_option('display.max_columns', 1000)
# None disables truncation of cell text. The legacy sentinel -1 used previously
# is deprecated and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Render floats with thousands separators and no decimal places.
pd.options.display.float_format = '{:,.0f}'.format

###### Load the JSON file


In [2]:
# Path of the line-delimited JSON input.
file = 'disresort3.json'
# lines=True makes pandas parse each line of the file as a separate JSON record.
df = pd.read_json(
    file,
    orient='columns',
    lines=True,
)

###### Keep only the columns needed for sentiment analysis

In [3]:
# Columns carried forward; every other column of the loaded frame is discarded.
keep_columns = [
    'text',
    'id',
    'lang',
    'created_at',
    'user',
    'source',
    'retweeted_status',
    'extended_tweet',
    'entities',
]
df = df[keep_columns]

###### Accessing retweeted status nested dictionary 

In [4]:
# Flatten the nested `retweeted_status` dict into top-level `rt_*` columns.
# Rows whose `retweeted_status` is not a dict get NaN in every `rt_*` column.
# Mapping: new column name -> key looked up inside `retweeted_status`.
rt_fields = {
    'rt_created_at': 'created_at',
    'rt_id': 'id',
    'rt_text': 'text',
    'rt_source': 'source',
    'rt_user': 'user',
    'rt_retweet_count': 'retweet_count',
    'rt_favorite_count': 'favorite_count',
    'rt_lang': 'lang',
}
for rt_column, rt_key in rt_fields.items():
    rt_values = [status.get(rt_key) if isinstance(status, dict) else np.nan
                 for status in df['retweeted_status']]
    if rt_key == 'id':
        # A list of ints mixed with NaN is inferred as float64, which cannot
        # represent integers above 2**53 exactly. Keeping the column as object
        # dtype preserves every digit of the id.
        df[rt_column] = pd.Series(rt_values, index=df.index, dtype=object)
    else:
        df[rt_column] = rt_values

In [5]:
# Flatten the nested `rt_user` dict into top-level `rt_user_*` columns.
# Rows whose `rt_user` is not a dict get NaN in every `rt_user_*` column.
# Mapping: new column name -> key looked up inside `rt_user`.
rt_user_fields = {
    'rt_user_id': 'id',
    'rt_user_name': 'name',
    'rt_user_screen_name': 'screen_name',
    'rt_user_location': 'location',
    'rt_user_follower_count': 'followers_count',
    'rt_user_friends_count': 'friends_count',
    'rt_user_listed_count': 'listed_count',
    'rt_user_favorites_count': 'favourites_count',
    'rt_user_statuses_count': 'statuses_count',
    'rt_user_created_at': 'created_at',
    'rt_user_description': 'description',
}
for rt_user_column, rt_user_key in rt_user_fields.items():
    rt_user_values = [account.get(rt_user_key) if isinstance(account, dict) else np.nan
                      for account in df['rt_user']]
    if rt_user_key == 'id':
        # A list of ints mixed with NaN is inferred as float64, which cannot
        # represent integers above 2**53 exactly. Keeping the column as object
        # dtype preserves every digit of the id.
        df[rt_user_column] = pd.Series(rt_user_values, index=df.index, dtype=object)
    else:
        df[rt_user_column] = rt_user_values

###### Accessing Extended Tweets Nested Dictionary 

In [6]:
# Pull `full_text` and `entities` out of the nested `extended_tweet` dict.
# Rows whose `extended_tweet` is not a dict get NaN.
extended_fields = (
    ('ex_tw_full_text', 'full_text'),
    ('ex_tw_entities', 'entities'),
)
for ex_column, ex_key in extended_fields:
    df[ex_column] = [
        extended.get(ex_key) if type(extended) is dict else np.nan
        for extended in df['extended_tweet']
    ]

###### Accessing User Nested Dictionary

In [7]:
# Copy selected keys of the nested `user` dict into top-level `user_*` columns.
# Rows whose `user` is not a dict get NaN.
# Each pair is (new column name, key looked up inside `user`).
user_fields = (
    ('user_id', 'id'),
    ('user_screen_name', 'screen_name'),
    ('user_location', 'location'),
    ('user_description', 'description'),
    ('user_fol_count', 'followers_count'),
    ('user_fr_count', 'friends_count'),
    ('user_fav_count', 'favourites_count'),
    ('user_status_count', 'statuses_count'),
    ('user_created_at', 'created_at'),
    ('user_listed_count', 'listed_count'),
)
for user_column, user_key in user_fields:
    df[user_column] = [
        profile.get(user_key) if type(profile) is dict else np.nan
        for profile in df['user']
    ]

###### Cleaning up retweet source and source

In [8]:
# Reduce the markup stored in `source` / `rt_source` to its visible text.
# - The parser is named explicitly so the result does not depend on which
#   optional parsers happen to be installed.
# - Non-string cells (e.g. NaN for rows without a retweet) become ''.
#   The previous `text != np.nan` guard was always True, because NaN never
#   compares equal to anything, so its else branch could never run; missing
#   `rt_source` values already ended up as '' and still do.
df['source'] = [
    BeautifulSoup(markup, 'html.parser').get_text() if isinstance(markup, str) else ''
    for markup in df['source']
]
df['rt_source'] = [
    BeautifulSoup(markup, 'html.parser').get_text() if isinstance(markup, str) else ''
    for markup in df['rt_source']
]

###### Convert date time columns 
* created_at 
* retweet created_at 
* retweet user created_at
* user created_at

In [9]:
# Parse each timestamp column with pandas' default datetime inference.
datetime_columns = (
    'created_at',
    'rt_created_at',
    'rt_user_created_at',
    'user_created_at',
)
for datetime_column in datetime_columns:
    df[datetime_column] = pd.to_datetime(df[datetime_column])

###### Regular Expression to find all the @ mentions and hashtags 
* retweet text
* extended text
* text 

In [10]:
# Each pattern captures from the marker character up to the next whitespace
# (or the end of the string). The second alternative also accepts a marker
# that is surrounded by single spaces.
hashtag_pattern = r'(#.*?(?=\s|$)| # .*?(?=\s|$))'
mention_pattern = r'(@.*?(?=\s|$)| @ .*?(?=\s|$))'

# Text columns that are scanned; results go to `<column>_hashtags` and
# `<column>_mentions`, each cell holding the list of matches.
scanned_text_columns = ('text', 'rt_text', 'ex_tw_full_text')

for text_column in scanned_text_columns:
    df[text_column + '_hashtags'] = df[text_column].str.findall(hashtag_pattern)
for text_column in scanned_text_columns:
    df[text_column + '_mentions'] = df[text_column].str.findall(mention_pattern)

###### Extracting Emojis
* Text
* Extended Text
* Retweet Text

In [11]:
import emoji
import re

# Locate the table whose keys are the emoji strings. The `emoji` package has
# changed its layout across releases: newer ones expose `EMOJI_DATA`, older
# ones expose `UNICODE_EMOJI`, which in some releases is keyed by language
# code first (then 'en' holds the emoji -> name mapping).
emoji_table = getattr(emoji, 'EMOJI_DATA', None)
if emoji_table is None:
    emoji_table = emoji.UNICODE_EMOJI
    if 'en' in emoji_table:
        emoji_table = emoji_table['en']

# Remove whitespace inside keys, drop empty/duplicate entries, and put the
# longest sequences first. Regex alternation takes the first alternative that
# matches, so without this ordering a multi-codepoint emoji could be split
# by a shorter emoji that is a prefix of it.
emojis_list = sorted(
    {''.join(key.split()) for key in emoji_table} - {''},
    key=lambda sequence: (-len(sequence), sequence),
)
r = re.compile('|'.join(re.escape(p) for p in emojis_list))

# Each new column holds the list of emoji found in the matching text column.
df['text_emoji'] = df['text'].str.findall(r)
df['ex_tw_full_text_emoji'] = df['ex_tw_full_text'].str.findall(r)
df['rt_text_emoji'] = df['rt_text'].str.findall(r)

###### Drop unused columns and remove brackets, commas, and colons

In [12]:
# Drop the nested/raw columns that have already been flattened above.
df = df.drop(columns=['rt_user', 'retweeted_status',
                      'extended_tweet', 'user', 'entities',
                      'ex_tw_entities'])


def _flatten_matches(matches, strip_chars=''):
    """Turn a list of regex matches into one space-separated string.

    Parameters
    ----------
    matches : list of str, or anything else
        A cell produced by `Series.str.findall`. Non-list cells (e.g. NaN
        where the source text was missing) are treated as "no matches".
    strip_chars : str
        Characters removed from every match before joining.

    Returns
    -------
    str
        The cleaned matches joined by single spaces; '' when there are none.
    """
    if not isinstance(matches, list):
        return ''
    cleaned = []
    for token in matches:
        token = token.strip()
        for char in strip_chars:
            token = token.replace(char, '')
        if token:
            cleaned.append(token)
    return ' '.join(cleaned)


# The findall columns hold Python lists. The earlier `df.replace([], '')`,
# `df.replace('[]', '')` and `Series.replace(',', '')` calls only compare
# whole cell values, so they never touched those lists and the brackets,
# commas and colons were still written out. Flatten the lists explicitly.
match_columns = [
    'text_hashtags', 'rt_text_hashtags', 'ex_tw_full_text_hashtags',
    'text_mentions', 'rt_text_mentions', 'ex_tw_full_text_mentions',
    'text_emoji', 'ex_tw_full_text_emoji', 'rt_text_emoji',
]
# Only these two columns were targeted for comma/colon removal originally.
punctuation_stripped_columns = {'text_hashtags', 'text_mentions'}
for match_column in match_columns:
    if match_column not in df.columns:
        continue
    unwanted = ',:' if match_column in punctuation_stripped_columns else ''
    df[match_column] = [_flatten_matches(cell, unwanted) for cell in df[match_column]]

# Cells that are exactly the string '[]' become empty, as before.
df = df.replace('[]', '')
# Remaining missing values become empty strings.
df = df.replace(np.nan, '')
###### Save the cleaned data to a CSV file

In [13]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
output_csv = 'sample_dis1.csv'
df.to_csv(output_csv, index=False)