In [1]:
import pandas as pd
import re

## Read in Data

In [2]:
dem2012 = pd.read_csv('data/obama2012.csv', dtype=str, engine='python')
rep2012 = pd.read_csv('data/romney2012.csv', dtype=str, engine='python')
assert dem2012.columns.equals(rep2012.columns), 'cols different in 2012'

In [3]:
dem2016 = pd.read_csv('data/clinton2016.csv', dtype=str, engine='python')
rep2016 = pd.read_csv('data/trump2016.csv', dtype=str, engine='python')
assert dem2016.columns.equals(rep2016.columns), 'cols different in 2016'

In [4]:
dem2020 = pd.read_csv('data/hashtag_joebiden.csv', dtype=str, engine='python')
rep2020 = pd.read_csv('data/hashtag_donaldtrump.csv', dtype=str, engine='python')
assert dem2020.columns.equals(rep2020.columns), 'cols different in 2020'

In [None]:
def deal_with_data(df: pd.DataFrame, dropcols=[], droprows=[], replace={}) -> pd.DataFrame:
    """ Edits DataFrame of Tweets given parameters for a particular year """
    for col in dropcols:
        df.drop(col, axis=1, inplace=True)
    
    for col in droprows:
        df.dropna(subset=[col], inplace=True)
        
    
    for col, replacement_val in replace.items():
        print(col, replacement_val)
        
        if isinstance(replacement_val, list):
            df[col] = df[col].apply(lambda x: replacement_val if pd.isna(x) else x)
        elif replacement_val == 'average':
            df[col] = pd.to_numeric(df[col], errors='coerce')
            avg = df[col].mean()
            df[col] = df[col].fillna(avg)
        else:
            df[col] = df[col].fillna(replacement_val)
        print('done^')
    
    if len(list(df.columns)) == 5:
        df['Hashtags'] = ['#'] * len(df)
    
    df.columns = ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']
    return df

def extract_hashtags(tweet):
    return re.findall(r"#\w+", tweet)

# Clean Tweets for each year and each party

# 2012

In [6]:
dropcols2012 = ['ReplyCount', 'ViewCount', 'User', 'url', 'Place']
replace2012 = {
    'Hashtags': [],
    'LikeCount': 'average',
    'RetweetCount': 'average',
    'STATE': 'Unknown'
}

## Democrat

In [7]:
dem2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99089 entries, 0 to 99088
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          99089 non-null  object
 1   User          99066 non-null  object
 2   Tweets        99066 non-null  object
 3   LikeCount     99024 non-null  object
 4   RetweetCount  98982 non-null  object
 5   ReplyCount    99024 non-null  object
 6   ViewCount     42 non-null     object
 7   Place         96963 non-null  object
 8   STATE         99024 non-null  object
 9   Hashtags      32697 non-null  object
 10  url           98982 non-null  object
dtypes: object(11)
memory usage: 8.3+ MB


In [8]:
dem2012 = deal_with_data(dem2012, dropcols=dropcols2012, droprows=['Tweets'], replace=replace2012)

Hashtags []
done^
LikeCount average
done^
RetweetCount average
done^
STATE Unknown
done^


In [9]:
dem2012.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99066 entries, 0 to 99088
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CreatedAt     99066 non-null  object 
 1   Tweets        99066 non-null  object 
 2   LikeCount     99066 non-null  float64
 3   RetweetCount  99066 non-null  float64
 4   State         99066 non-null  object 
 5   Hashtags      99066 non-null  object 
dtypes: float64(2), object(4)
memory usage: 5.3+ MB


## Republican

In [10]:
rep2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100676 entries, 0 to 100675
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Date          100676 non-null  object
 1   User          100648 non-null  object
 2   Tweets        100648 non-null  object
 3   LikeCount     100606 non-null  object
 4   RetweetCount  100564 non-null  object
 5   ReplyCount    100606 non-null  object
 6   ViewCount     42 non-null      object
 7   Place         98079 non-null   object
 8   STATE         100606 non-null  object
 9   Hashtags      35967 non-null   object
 10  url           100564 non-null  object
dtypes: object(11)
memory usage: 8.4+ MB


In [11]:
rep2012 = deal_with_data(rep2012, dropcols=dropcols2012, droprows=['Tweets'], replace=replace2012)

Hashtags []
done^
LikeCount average
done^
RetweetCount average
done^
STATE Unknown
done^


In [12]:
rep2012.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100648 entries, 0 to 100675
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CreatedAt     100648 non-null  object 
 1   Tweets        100648 non-null  object 
 2   LikeCount     100648 non-null  float64
 3   RetweetCount  100648 non-null  float64
 4   State         100648 non-null  object 
 5   Hashtags      100648 non-null  object 
dtypes: float64(2), object(4)
memory usage: 5.4+ MB


# 2016

## Democrat

In [13]:
dropcols2016 = ['ReplyCount', 'ViewCount', 'User', 'url', 'Place']
replace2016 = {
    'Hashtags': [],
    'LikeCount': 'average',
    'RetweetCount': 'average',
    'STATE': 'Unknown'
}

In [14]:
dem2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279562 entries, 0 to 279561
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Date          279562 non-null  object
 1   User          279562 non-null  object
 2   Tweets        279562 non-null  object
 3   LikeCount     279562 non-null  object
 4   RetweetCount  279562 non-null  object
 5   ReplyCount    279562 non-null  object
 6   ViewCount     0 non-null       object
 7   Place         262226 non-null  object
 8   STATE         279562 non-null  object
 9   Hashtags      88990 non-null   object
 10  url           279562 non-null  object
dtypes: object(11)
memory usage: 23.5+ MB


In [15]:
dem2016 = deal_with_data(dem2016, dropcols=dropcols2016, droprows=['Tweets'], replace=replace2016)

Hashtags []
done^
LikeCount average
done^
RetweetCount average
done^
STATE Unknown
done^


In [16]:
dem2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279562 entries, 0 to 279561
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CreatedAt     279562 non-null  object
 1   Tweets        279562 non-null  object
 2   LikeCount     279562 non-null  int64 
 3   RetweetCount  279562 non-null  int64 
 4   State         279562 non-null  object
 5   Hashtags      279562 non-null  object
dtypes: int64(2), object(4)
memory usage: 12.8+ MB


## Republican

In [17]:
rep2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817320 entries, 0 to 817319
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Date          817320 non-null  object
 1   User          817320 non-null  object
 2   Tweets        817320 non-null  object
 3   LikeCount     817320 non-null  object
 4   RetweetCount  817320 non-null  object
 5   ReplyCount    817320 non-null  object
 6   ViewCount     0 non-null       object
 7   Place         758953 non-null  object
 8   STATE         817320 non-null  object
 9   Hashtags      236384 non-null  object
 10  url           817320 non-null  object
dtypes: object(11)
memory usage: 68.6+ MB


In [18]:
rep2016 = deal_with_data(rep2016, dropcols=dropcols2016, droprows=['Tweets'], replace=replace2016)

Hashtags []
done^
LikeCount average
done^
RetweetCount average
done^
STATE Unknown
done^


In [19]:
rep2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817320 entries, 0 to 817319
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CreatedAt     817320 non-null  object
 1   Tweets        817320 non-null  object
 2   LikeCount     817320 non-null  int64 
 3   RetweetCount  817320 non-null  int64 
 4   State         817320 non-null  object
 5   Hashtags      817320 non-null  object
dtypes: int64(2), object(4)
memory usage: 37.4+ MB


# 2020

In [20]:
dropcols2020 = ['tweet_id', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description',
               'user_followers_count', 'lat', 'long', 'city', 'continent', 'country',
               'user_join_date', 'collected_at', 'state_code', 'user_location']

replace2020 = {'state': 'Unknown', 'retweet_count': 0, 'likes': 'average'}

## Democrat

In [21]:
dem2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777073 entries, 0 to 777072
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   created_at            777073 non-null  object
 1   tweet_id              776995 non-null  object
 2   tweet                 776995 non-null  object
 3   likes                 776914 non-null  object
 4   retweet_count         776895 non-null  object
 5   source                776182 non-null  object
 6   user_id               776889 non-null  object
 7   user_name             776877 non-null  object
 8   user_screen_name      776895 non-null  object
 9   user_description      694885 non-null  object
 10  user_join_date        776784 non-null  object
 11  user_followers_count  776885 non-null  object
 12  user_location         543066 non-null  object
 13  lat                   355284 non-null  object
 14  long                  355284 non-null  object
 15  city             

In [22]:
dem2020 = deal_with_data(dem2020, dropcols=dropcols2020, droprows=['tweet'], replace=replace2020)
dem2020['Hashtags'] = dem2020['Tweets'].astype(str).apply(extract_hashtags)

state Unknown
done^
retweet_count 0
done^
likes average
done^


In [23]:
dem2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 776995 entries, 0 to 777072
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CreatedAt     776995 non-null  object 
 1   Tweets        776995 non-null  object 
 2   LikeCount     776995 non-null  float64
 3   RetweetCount  776995 non-null  object 
 4   State         776995 non-null  object 
 5   Hashtags      776995 non-null  object 
dtypes: float64(1), object(5)
memory usage: 41.5+ MB


## Republican

In [24]:
rep2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 971087 entries, 0 to 971086
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   created_at            971087 non-null  object
 1   tweet_id              971073 non-null  object
 2   tweet                 971073 non-null  object
 3   likes                 971045 non-null  object
 4   retweet_count         970933 non-null  object
 5   source                970057 non-null  object
 6   user_id               970929 non-null  object
 7   user_name             970917 non-null  object
 8   user_screen_name      970933 non-null  object
 9   user_description      869663 non-null  object
 10  user_join_date        970779 non-null  object
 11  user_followers_count  970917 non-null  object
 12  user_location         675839 non-null  object
 13  lat                   445702 non-null  object
 14  long                  445705 non-null  object
 15  city             

In [25]:
rep2020 = deal_with_data(rep2020, dropcols=dropcols2020, droprows=['tweet'], replace=replace2020)
rep2020['Hashtags'] = rep2020['Tweets'].astype(str).apply(extract_hashtags)

state Unknown
done^
retweet_count 0
done^
likes average
done^


In [26]:
rep2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 971073 entries, 0 to 971086
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CreatedAt     971073 non-null  object 
 1   Tweets        971073 non-null  object 
 2   LikeCount     971073 non-null  float64
 3   RetweetCount  971073 non-null  object 
 4   State         971073 non-null  object 
 5   Hashtags      971073 non-null  object 
dtypes: float64(1), object(5)
memory usage: 51.9+ MB


# Check columns

In [27]:
print(f'dem 2012: {list(dem2012.columns)}')
print(f'rep 2012: {list(rep2012.columns)}')
print(f'dem 2016: {list(dem2016.columns)}')
print(f'rep 2016: {list(rep2016.columns)}')
print(f'dem 2020: {list(dem2020.columns)}')
print(f'rep 2020: {list(rep2020.columns)}')

dem 2012: ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']
rep 2012: ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']
dem 2016: ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']
rep 2016: ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']
dem 2020: ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']
rep 2020: ['CreatedAt', 'Tweets', 'LikeCount', 'RetweetCount', 'State', 'Hashtags']


# Save Files

In [28]:
for year in [2012, 2016, 2020]:
    exec(f'dem{year}.to_csv("data/cleaned_dem{year}.csv", index=False)')
    exec(f'rep{year}.to_csv("data/cleaned_rep{year}.csv", index=False)')