# Preprocess Twitter Data

#### First, import libraries and declare variables


The processed data is stored in an outer dictionary of the form

**{ tweets: list_of_tweets }** 

with each tweet being a list with the form: 

**[ party_label, text, [hashtags], [account_tags], boolean_retweet, account ]**

Thus creating the complete form of

**{ tweets: [ [ party_label, text, [hashtags], [account_tags], boolean_retweet, account ] ] }**


In [12]:
import pandas as pd
import pickle
import random
import string
import preprocessor as preprocess

# Accumulator shared by the preprocessing cells below: every processed tweet
# is appended to the list stored under the 'tweets' key.
final_data = {
    'tweets': [],
}

#### Import representative dataset as dataframe

In [13]:
# Load the raw representative tweets; the processing loop further down reads
# the 'Party', 'Handle' and 'Tweet' columns of this frame.
rep_tweets_path = "../data/raw/ExtractedTweets.csv"
rep_tweets_df = pd.read_csv(rep_tweets_path)
# Bare last expression so the notebook renders the (truncated) frame.
rep_tweets_df

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...
...,...,...,...
86455,Republican,RepTomPrice,Check out my op-ed on need for End Executive O...
86456,Republican,RepTomPrice,"Yesterday, Betty &amp; I had a great time lear..."
86457,Republican,RepTomPrice,We are forever grateful for the service and sa...
86458,Republican,RepTomPrice,Happy first day of school @CobbSchools! #CobbB...


#### Preprocess representative dataset and add to data dictionary

In [14]:
# Process first dataset
# Build the punctuation-stripping table once instead of on every row.
strip_punctuation = str.maketrans('', '', string.punctuation)

for _, row in rep_tweets_df.iterrows():

    tweet_content = row['Tweet']
    # pandas loads empty CSV cells as float NaN, which is truthy: a plain
    # `not tweet_content` check lets it through and .startswith() then raises.
    # Skip anything that is not a non-empty string.
    if not isinstance(tweet_content, str) or not tweet_content:
        continue
    retweet = tweet_content.startswith('RT')
    handle = row['Handle']
    party = row['Party']

    # get hashtags and mentions; `or []` covers a falsy (e.g. None) result,
    # matching the truthiness checks this replaces
    parsed_tweet = preprocess.parse(tweet_content)
    hashtags = [ht.match for ht in parsed_tweet.hashtags or []]
    mentions = [m.match for m in parsed_tweet.mentions or []]

    # clean tweet for just words
    words = preprocess.clean(tweet_content)
    # skip tweets that have no text left after cleaning
    if not words:
        continue

    # record layout:
    # [party_label, text, [hashtags], [account_tags], boolean_retweet, account]
    tweet = [party, words.translate(strip_punctuation), hashtags, mentions, retweet, handle]
    final_data['tweets'].append(tweet)

print('Done with dataset 1')

Done with dataset 1


#### Import second dataset: tweets of Trump and Clinton during the 2016 presidential campaign

In [15]:
# Load the raw candidate tweets; the processing loop further down reads the
# 'text', 'is_retweet' and 'handle' columns of this frame.
candidate_tweets_path = "../data/raw/tweets.csv"
candidate_tweets_df = pd.read_csv(candidate_tweets_path)
# Bare last expression so the notebook renders the (truncated) frame.
candidate_tweets_df

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6439,684219579548041218,realDonaldTrump,"""@lilredfrmkokomo: @realDonaldTrump My Faceboo...",False,,2016-01-05T03:47:14,,,,False,...,,,,,,,http://twitter.com/download/android,False,"{'user_mentions': [{'id_str': '26122621', 'nam...",
6440,684218836569206784,realDonaldTrump,"""@marybnall01: @realDonaldTrump watched lowell...",False,,2016-01-05T03:44:17,,,,False,...,,,,,,,http://twitter.com/download/android,False,"{'user_mentions': [{'id_str': '3477455725', 'n...",
6441,684218305100525569,realDonaldTrump,"""@ghosthunter_lol: Iowa key endorsement for @r...",False,,2016-01-05T03:42:10,,,,False,...,,,,,,,http://twitter.com/download/android,False,{'media': [{'display_url': 'pic.twitter.com/JB...,{'media': [{'display_url': 'pic.twitter.com/JB...
6442,684217554861199360,realDonaldTrump,"""@iLoveiDevices: @EdwinRo47796972 @happyjack22...",False,,2016-01-05T03:39:11,,,,False,...,,,,,,,http://twitter.com/download/android,False,"{'user_mentions': [{'id_str': '42568997', 'nam...",


#### Preprocess this dataset and add to data object

In [17]:
# Process second dataset
# Build the punctuation-stripping table once instead of on every row.
strip_punctuation = str.maketrans('', '', string.punctuation)

for _, row in candidate_tweets_df.iterrows():

    tweet_content = row['text']
    # pandas loads empty CSV cells as float NaN, which is truthy: a plain
    # `not tweet_content` check lets it through and later string calls raise.
    # Skip anything that is not a non-empty string.
    if not isinstance(tweet_content, str) or not tweet_content:
        continue
    # bool() stores a plain Python bool, the same type the first dataset's
    # startswith()-based flag has, rather than a pandas/numpy scalar.
    retweet = bool(row['is_retweet'])
    handle = row['handle']
    # this dataset only distinguishes HillaryClinton from everything else
    party = 'Democrat' if handle == 'HillaryClinton' else 'Republican'

    # get hashtags and mentions; `or []` covers a falsy (e.g. None) result,
    # matching the truthiness checks this replaces
    parsed_tweet = preprocess.parse(tweet_content)
    hashtags = [ht.match for ht in parsed_tweet.hashtags or []]
    mentions = [m.match for m in parsed_tweet.mentions or []]

    # clean tweet for just words
    words = preprocess.clean(tweet_content)
    # skip tweets that have no text left after cleaning
    if not words:
        continue

    # get retweet for some tweets that are surrounded by quotes.
    # These manual retweets look like `"@someone: ...` in the raw text. The
    # previous check tested the *cleaned* text for the literal prefix '/":"',
    # which does not correspond to that shape, so test the raw text instead.
    # NOTE(review): confirm '"@' captures every quoted-retweet variant wanted.
    if tweet_content.startswith('"@'):
        retweet = True

    # record layout:
    # [party_label, text, [hashtags], [account_tags], boolean_retweet, account]
    tweet = [party, words.translate(strip_punctuation), hashtags, mentions, retweet, handle]
    final_data['tweets'].append(tweet)

print('Done with dataset 2')

Done with dataset 2


#### Separate into training and testing data

In [18]:
# define percent to be testing data
percent_testing = 0.2
# Fixed seed so the same tweets land in the test set on every
# Restart & Run All; change it to draw a different split.
split_seed = 42

all_tweets = final_data['tweets']
testing_amount = int(percent_testing * len(all_tweets))

# Draw all test indices at once (uniform, without replacement) from a private
# generator, leaving the global `random` state untouched. This replaces
# repeated list.pop(n) from the middle of the list, which is quadratic.
test_indices = random.Random(split_seed).sample(range(len(all_tweets)), testing_amount)
test_index_set = set(test_indices)

# Test tweets are kept in draw order, as with the original pop-based loop.
testing_data = {'tweets': [all_tweets[i] for i in test_indices]}

# Slice-assign so final_data['tweets'] stays the same list object and now
# holds only the training tweets, in their original relative order.
# NOTE: this shrinks final_data in place; re-run the preprocessing cells
# before re-running this cell, otherwise another 20% is removed.
all_tweets[:] = [tw for i, tw in enumerate(all_tweets) if i not in test_index_set]
print('Done seperating')

Done seperating


#### Save data objects as pickle files

In [20]:
# Write both splits to disk as pickles. By this point the split cell has moved
# the test tweets out of final_data, so final_data is the training set.
for out_path, payload in (
    ('../data/testing_data.obj', testing_data),
    ('../data/training_data.obj', final_data),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

print('Complete')

Complete
