# Data Cleaning - Twitter

In [160]:
#import libraries

import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime

import pickle

In [154]:
# Load the raw tweet export (CSV content stored with a .txt extension).
raw_tweets_path = 'data/trump_tweets.txt'
dt_tweets_raw = pd.read_csv(raw_tweets_path)

In [155]:
# Show full tweet text in rendered frames instead of truncating long cells.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

In [156]:
# Discard the metadata columns that the rest of this notebook does not use.
unused_columns = ['source', 'retweet_count', 'favorite_count', 'is_retweet', 'id_str']
dt_tweets = dt_tweets_raw.drop(columns=unused_columns)

In [157]:
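# Disabled: 'created_at' is kept as a regular column (not the index) because
# the cells below convert it to datetime, derive 'date' from it, and then drop it.
#dt_tweets = dt_tweets.set_index('created_at')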

**Cleaning tweets**

In [158]:
def clean_tweet(tweet):
    """Return `tweet` with @mentions, URLs and non-alphanumeric characters removed.

    NOTE(review): no definition of `clean_tweet` exists in this notebook, so this
    cell raised NameError on a fresh kernel. This implementation is reconstructed
    from the cleaned text shown in the outputs below (punctuation replaced by
    spaces, mentions and links removed) -- confirm it matches the original helper.

    Each match is replaced by a space and the result is re-joined on single
    spaces, so runs of whitespace collapse and the ends are trimmed.
    """
    stripped = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)
    return ' '.join(stripped.split())


# Normalise every tweet's text before it is scored.
dt_tweets['text'] = dt_tweets['text'].apply(clean_tweet)

In [164]:
# Convert the 'created_at' column to pandas datetimes so date parts can be extracted.
created_at_parsed = pd.to_datetime(dt_tweets['created_at'])
dt_tweets['created_at'] = created_at_parsed

In [169]:
# Create a 'date' column holding only the calendar date of each tweet.
# The vectorized `.dt` accessor replaces a Python-level loop over every timestamp;
# it requires 'created_at' to already be datetime-typed (converted in the cell above).
dt_tweets['date'] = dt_tweets['created_at'].dt.date

In [172]:
# Drop the 'created_at' column now that 'date' has been derived from it.
dt_tweets = dt_tweets.drop(columns='created_at')

In [173]:
# Preview the first ten cleaned tweets.
dt_tweets.head(n=10)

Unnamed: 0,text,date
0,Busy week planned with a heavy focus on jobs and national security Top executives coming in at 9 00 A M to talk manufacturing in America,2017-01-23
1,Will be meeting at 9 00 with top automobile executives concerning jobs in America I want new plants to be built here for cars sold here,2017-01-24
2,A photo delivered yesterday that will be displayed in the upper lower press hall Thank you Abbas,2017-01-24
3,Great meeting with automobile industry leaders at the this morning Together we will MAGA,2017-01-24
4,Signing orders to move forward with the construction of the Keystone XL and Dakota Access pipelines in the Oval Off,2017-01-24
5,Great meeting with Ford CEO Mark Fields and General Motors CEO Mary Barra at the today,2017-01-25
6,Congratulations to for being number one in inauguration ratings They were many times higher than FAKE NEWS public is smart,2017-01-25
7,If Chicago doesn t fix the horrible carnage going on 228 shootings in 2017 with 42 killings up 24 from 2016 I will send in the Feds,2017-01-25
8,Big day planned on NATIONAL SECURITY tomorrow Among many other things we will build the wall,2017-01-25
9,I will be asking for a major investigation into VOTER FRAUD including those registered to vote in two states those who are illegal and,2017-01-25


**Sentiment Analysis**

In [131]:
# Score every tweet with VADER; each element of `sentiment` is the dict
# returned by `polarity_scores` for that tweet's text.
analyzer = SentimentIntensityAnalyzer()
tweet_text = dt_tweets['text']
sentiment = tweet_text.apply(analyzer.polarity_scores)

In [132]:
# Expand the per-tweet score dicts into one column per key, aligned on the tweet index.
# Building the frame in one call avoids the slow row-wise `apply(pd.Series)`.
sentiment_scores = pd.DataFrame(sentiment.tolist(), index=sentiment.index)

# Remove score columns left over from a previous run of this cell so that
# re-running it does not produce duplicated columns.
dt_tweets = dt_tweets.drop(columns=sentiment_scores.columns, errors='ignore')

# `axis` must be passed by keyword: the positional form is not accepted by pandas 2.x.
dt_tweets = pd.concat([dt_tweets, sentiment_scores], axis=1)

In [133]:
# Preview the first ten tweets together with the appended score columns.
dt_tweets.head(n=10)

Unnamed: 0_level_0,text,neg,neu,pos,compound
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-23 11:38:16,Busy week planned with a heavy focus on jobs and national security Top executives coming in at 9 00 A M to talk manufacturing in America,0.0,0.826,0.174,0.4939
2017-01-24 11:11:47,Will be meeting at 9 00 with top automobile executives concerning jobs in America I want new plants to be built here for cars sold here,0.0,0.876,0.124,0.2732
2017-01-24 16:58:06,A photo delivered yesterday that will be displayed in the upper lower press hall Thank you Abbas,0.118,0.749,0.134,0.0772
2017-01-24 17:04:01,Great meeting with automobile industry leaders at the this morning Together we will MAGA,0.0,0.76,0.24,0.6249
2017-01-24 17:49:17,Signing orders to move forward with the construction of the Keystone XL and Dakota Access pipelines in the Oval Off,0.0,1.0,0.0,0.0
2017-01-25 00:46:57,Great meeting with Ford CEO Mark Fields and General Motors CEO Mary Barra at the today,0.0,0.785,0.215,0.6249
2017-01-25 02:16:19,Congratulations to for being number one in inauguration ratings They were many times higher than FAKE NEWS public is smart,0.138,0.577,0.285,0.4708
2017-01-25 02:25:40,If Chicago doesn t fix the horrible carnage going on 228 shootings in 2017 with 42 killings up 24 from 2016 I will send in the Feds,0.258,0.742,0.0,-0.8402
2017-01-25 02:37:48,Big day planned on NATIONAL SECURITY tomorrow Among many other things we will build the wall,0.0,0.827,0.173,0.4824
2017-01-25 12:10:01,I will be asking for a major investigation into VOTER FRAUD including those registered to vote in two states those who are illegal and,0.289,0.711,0.0,-0.8455


**Categorizing Tweets**

In [134]:
# Label each tweet from its compound score: strictly below zero is 'negative',
# everything else is 'positive' (so a compound score of exactly 0 counts as 'positive').
is_negative = dt_tweets['compound'].lt(0)
dt_tweets['sentiment'] = np.where(is_negative, 'negative', 'positive')
        

In [135]:
# Count how many tweets fall under each sentiment label.
tweets_by_sentiment = dt_tweets.groupby(by='sentiment')
sentiment_count = tweets_by_sentiment.size()

In [140]:
# Dummy variable: 1 when the compound score is below zero, otherwise 0.
dt_tweets['neg_dv'] = (dt_tweets['compound'] < 0).astype(int)

In [141]:
# Inspect the last five rows, including the new label and dummy columns.
dt_tweets.tail(n=5)

Unnamed: 0_level_0,text,neg,neu,pos,compound,sentiment,neg_dv
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-19 02:23:46,RT By the time their fake news campaigns are fully exposed they ve already moved on to their next fake news campaign A g,0.291,0.709,0.0,-0.7778,negative,1
2019-01-19 03:02:50,Remember it was Buzzfeed that released the totally discredited Dossier paid for by Crooked Hillary Clinton and the Democrats as opposition research on which the entire Russian probe is based A very sad day for journalism but a great day for our Country,0.09,0.793,0.118,0.5199,positive,0
2019-01-19 03:22:29,RT At what point in fairness after 2 years do Americans of good will say enough already If the SpecialCounsel had collusi,0.0,0.873,0.127,0.4404,positive,0
2019-01-19 03:22:44,RT This is just the most egregious example of the rampant unfairness that has tainted this partisan witch hunt from the beg,0.106,0.894,0.0,-0.3612,negative,1
2019-01-19 03:24:49,Fake News is truly the ENEMY OF THE PEOPLE,0.452,0.37,0.179,-0.6633,negative,1


In [142]:
# Persist the processed frame as a pickle file in the current working directory.
with open('dt_tweets.pickle', mode='wb') as pickle_file:
    pickle.dump(dt_tweets, pickle_file)