# Preprocess Twitter Data

#### First, import libraries and declare variables


The outer dictionary would be of form

**{ tweets: list_of_tweets }** 

with each tweet being a dictionary with the form: 

**{ tweetID: { party_label: [text, [hashtags], [account_tags], boolean_retweet, account] } }**

Thus creating the complete form of

**{ tweets: [ { tweetID: { party_label: [text, [hashtags], [account_tags], boolean_retweet, account] } } ] }**


In [1]:
import pandas as pd
import string
import preprocessor as preprocess

# Accumulator for every processed tweet; each entry appended to the 'tweets'
# list is one {tweetID: {party_label: [...]}} dictionary.
final_data = dict(tweets=[])

#### Import representative dataset as dataframe

In [2]:
# Load the representatives' tweets into a DataFrame and display it.
rep_tweets_path = "../datasets/ExtractedTweets.csv"
rep_tweets_df = pd.read_csv(rep_tweets_path)
rep_tweets_df

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...
...,...,...,...
86455,Republican,RepTomPrice,Check out my op-ed on need for End Executive O...
86456,Republican,RepTomPrice,"Yesterday, Betty &amp; I had a great time lear..."
86457,Republican,RepTomPrice,We are forever grateful for the service and sa...
86458,Republican,RepTomPrice,Happy first day of school @CobbSchools! #CobbB...


#### Preprocess representative dataset and add to data dictionary

In [3]:
# Process first dataset: turn each row of rep_tweets_df into
#   {id: {party: [clean_text, hashtags, mentions, is_retweet, handle]}}
# and append it to final_data['tweets'].
# NOTE: `id` shadows the Python builtin; the name is kept unchanged because it
# is a notebook-level counter that other cells may read (not visible here).
id = 0
# Translation table that deletes all ASCII punctuation; loop-invariant, so it is
# built once instead of once per tweet.
strip_punctuation = str.maketrans('', '', string.punctuation)
for _, row in rep_tweets_df.iterrows():

    # update variables
    id += 1  # the counter advances even for rows that are skipped below
    tweet_content = row['Tweet']
    # Missing CSV cells are loaded by pandas as float NaN, which is truthy and
    # has no .startswith(); skip anything that is not a non-empty string.
    if not isinstance(tweet_content, str) or not tweet_content:
        continue
    retweet = tweet_content.startswith('RT')
    handle = row['Handle']
    party = row['Party']

    # get hashtags and mentions (the parse result attributes may be empty/None)
    parsed_tweet = preprocess.parse(tweet_content)
    hashtags = [ht.match for ht in parsed_tweet.hashtags] if parsed_tweet.hashtags else []
    mentions = [m.match for m in parsed_tweet.mentions] if parsed_tweet.mentions else []

    # clean tweet for just words and make dictionary object
    words = preprocess.clean(tweet_content)
    tweet = {id: {party: [words.translate(strip_punctuation), hashtags, mentions, retweet, handle]}}
    final_data['tweets'].append(tweet)
    # Development limit: stop once the counter reaches 5 so the preview below
    # stays small.
    if id >= 5:
        break
final_data['tweets']

[{1: {'Democrat': ['Today Senate Dems vote to  Proud to support similar legislation here in the House',
    ['#SaveTheInternet', '#NetNeutrality'],
    [],
    False,
    'RepDarrenSoto']}},
 {2: {'Democrat': [' Winter Haven resident  Alta Vista teacher is one of several recognized by for National Teacher Apprecia',
    [],
    ['@WinterHavenSun', '@RepDarrenSoto'],
    True,
    'RepDarrenSoto']}},
 {3: {'Democrat': ['  noted that Hurricane Maria has left approximately 90 billion in damages Congress has allocated about 18',
    [],
    ['@NBCLatino', '@RepDarrenSoto'],
    True,
    'RepDarrenSoto']}},
 {4: {'Democrat': [' Meeting with  Thanks for taking the time to meet with ED Marucci Guzman ',
    ['#NALCABPolicy2018'],
    ['@NALCABPolicy', '@RepDarrenSoto', '@LatinoLeader'],
    True,
    'RepDarrenSoto']}},
 {5: {'Democrat': [' Hurricane season starts on June st Puerto Ricos readinesswell',
    [],
    ['@Vegalteno', '@Pwr4PuertoRico', '@RepDarrenSoto', '@EspaillatNY'],
    True

#### Import second dataset: tweets of Trump and Clinton during the 2016 presidential campaign

In [4]:
# Load the candidates' tweets into a DataFrame and display it.
candidate_tweets_path = "../datasets/tweets.csv"
candidate_tweets_df = pd.read_csv(candidate_tweets_path)
candidate_tweets_df

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6439,684219579548041218,realDonaldTrump,"""@lilredfrmkokomo: @realDonaldTrump My Faceboo...",False,,2016-01-05T03:47:14,,,,False,...,,,,,,,http://twitter.com/download/android,False,"{'user_mentions': [{'id_str': '26122621', 'nam...",
6440,684218836569206784,realDonaldTrump,"""@marybnall01: @realDonaldTrump watched lowell...",False,,2016-01-05T03:44:17,,,,False,...,,,,,,,http://twitter.com/download/android,False,"{'user_mentions': [{'id_str': '3477455725', 'n...",
6441,684218305100525569,realDonaldTrump,"""@ghosthunter_lol: Iowa key endorsement for @r...",False,,2016-01-05T03:42:10,,,,False,...,,,,,,,http://twitter.com/download/android,False,{'media': [{'display_url': 'pic.twitter.com/JB...,{'media': [{'display_url': 'pic.twitter.com/JB...
6442,684217554861199360,realDonaldTrump,"""@iLoveiDevices: @EdwinRo47796972 @happyjack22...",False,,2016-01-05T03:39:11,,,,False,...,,,,,,,http://twitter.com/download/android,False,"{'user_mentions': [{'id_str': '42568997', 'nam...",


#### Preprocess this dataset and add to data object

In [15]:
# Edge case: tweets whose whole text is wrapped in double quotes and starts
# with a quoted mention (text begins with '"@' and ends with '"').
# Print each such tweet for inspection.
tweet_texts = candidate_tweets_df['text']
# na=False makes rows with a missing text count as "no match" instead of
# propagating NaN into the boolean mask.
is_quoted_mention = (
    tweet_texts.str.startswith('"@', na=False)
    & tweet_texts.str.endswith('"', na=False)
)
for quoted_text in tweet_texts[is_quoted_mention]:
    print(quoted_text)

"@KellyannePolls: Trump is headed for a win, says professor who has predicted 30 years of presidential outcomes   https://t.co/68WEMcuHSO"
"@AngPiazza: @foxandfriends  @realDonaldTrump he's the ONLY candidate that will keep us safe!"
"@brimyers813: Saw ur speech on Twitter. U give me hope and optimism. I feel as though I am in the room with u. I pray 4 ur/our success."
"@Stvzbnk: Just Watched @tonyschwartz. Obviously Tony is a Total Whack Job @realDonaldTrump"
"@Ler: Message for undecided voters: Please wake up and vote DonaldTrump now! Trump/Pence very important save our America before too late!"
"@tweak626: I'm at a biker rally in Perry, Kansas...and everyone is a @realDonaldTrump fan. Love it."
"@ronnieclemmons: @ChrisCJackson @TakouiS @realDonaldTrump  Trump now leads her by 2 - get real, she will lose big"
"@lblackvelvet: @realDonaldTrump We need to show Americans that Hillary will KILL our Country !! Vote for Trump !!"
"@AnneBellar: @realDonaldTrump @CNN CNN is so biased. Never e

In [4]:
# clean the datasets
# store in data object as { tweets: [ { tweetID: { party_label: [text, [hashtags], [account_tags], boolean_retweet, account] } } ] }
# want to have randomly sorted data objects and their corresponding labels
# should we store as csv or dictionary?
# store as dict

In [None]:
# separate into training and testing data, based on random sorting