# Data Cleaning

This notebook is used to clean the data for the State Election Officials social media posts.

In [7]:
# read csv
import pandas as pd

df1 = pd.read_csv('CEO Nov 2022 posts_1743554102.csv')
df2 = pd.read_csv('CEO Oct 2022 posts_1743554102.csv')
df3 = pd.read_csv('CEO Sept 2022 posts_1743554102.csv')

In [8]:
df1.head()

Unnamed: 0,PostId,PostUrl,PostEngagement,Platform,ChannelID,ChannelName,ChannelUid,ChannelUrl,ChannelEngagement,post_body_text,...,published_at,post_data,post_media_urls,LikesCount,SharesCount,CommentsCount,ViewsCount,post_media_file,embedded_post_text,search_data
0,80074859,https://www.instagram.com/p/ClmjqIFveun/,,Instagram,9955344,John Merrill,blank_for_now,blank_for_now,"{""follower_count"":3406,""following_count"":0,""li...",While in Camden today I had the privilege to v...,...,2022-11-30T22:49:47.000Z,post data removed,https://www.junkipedia.org/rails/active_storag...,13,0,0,0.0,,,
1,80069324,https://www.instagram.com/p/ClmdWDUPnPO/,,Instagram,9955344,John Merrill,blank_for_now,blank_for_now,"{""follower_count"":3406,""following_count"":0,""li...",I had a great visit this afternoon with the Wi...,...,2022-11-30T21:54:37.000Z,post data removed,https://www.junkipedia.org/rails/active_storag...,7,0,0,0.0,,,
2,80072838,https://www.facebook.com/100057729714234/posts...,,Facebook,8894188,Secretary of State Steve Simon,blank_for_now,blank_for_now,"{""follower_count"":3963,""following_count"":0,""li...","On November 29, the State Canvassing Board cer...",...,2022-11-30T21:45:30.000Z,post data removed,,4,2,0,0.0,,,
3,80076411,https://twitter.com/MNSecofState/status/159807...,,Twitter,8891709,Minnesota Secretary of State,blank_for_now,blank_for_now,"{""follower_count"":4380,""following_count"":330,""...",The State Canvassing Board certified the 2022 ...,...,2022-11-30T21:44:00.000Z,post data removed,,1,1,0,0.0,,,
4,80107630,https://twitter.com/md_sbe/status/159807020264...,,Twitter,8891704,Maryland Elections,blank_for_now,blank_for_now,"{""follower_count"":4867,""following_count"":231,""...",RT @BaltCoElections: To all Baltimore County E...,...,2022-11-30T21:43:21.000Z,post data removed,,0,3,0,0.0,,,


In [9]:
# number of rows in each dataframe
print(len(df1))
print(len(df2))
print(len(df3))

3983
3864
2027


In [10]:
# Keep only the columns we need: PostId, post_body_text, GoogleAudioText, VoskAudioText, EmbeddedContentText
df1 = df1[['PostId', 'post_body_text', 'GoogleAudioText', 'EmbeddedContentText', 'Platform', 'ChannelID']]
df2 = df2[['PostId', 'post_body_text', 'GoogleAudioText', 'EmbeddedContentText', 'Platform', 'ChannelID']]
df3 = df3[['PostId', 'post_body_text', 'GoogleAudioText', 'EmbeddedContentText', 'Platform', 'ChannelID']]
# Concatenate the dataframes
df = pd.concat([df1, df2, df3], ignore_index=True)
df.head()

Unnamed: 0,PostId,post_body_text,GoogleAudioText,EmbeddedContentText,Platform,ChannelID
0,80074859,While in Camden today I had the privilege to v...,,,Instagram,9955344
1,80069324,I had a great visit this afternoon with the Wi...,,,Instagram,9955344
2,80072838,"On November 29, the State Canvassing Board cer...",,,Facebook,8894188
3,80076411,The State Canvassing Board certified the 2022 ...,,,Twitter,8891709
4,80107630,RT @BaltCoElections: To all Baltimore County E...,,,Twitter,8891704


In [11]:
# unique GoogleAudioText
df['GoogleAudioText'].unique()

array([' ',
       " every single republican in congress voted against this bill but against lower prescription drug prices against lowering healthcare costs against the fair taxes every single republican every single one voted against tackling the climate crisis against lowering our energy costs against creating good paying jobs my fellow americans that's the choice we face face democrats sided with the american people and every single republican in congress sided with the special interests in this boat"],
      dtype=object)

In [12]:
# make ' ' to NaN in GoogleAudioText
df['GoogleAudioText'] = df['GoogleAudioText'].replace(' ', pd.NA)
# make ' ' to NaN in EmbeddedContentText
df['EmbeddedContentText'] = df['EmbeddedContentText'].replace(' ', pd.NA)

# replace NaN with empty string
df['GoogleAudioText'] = df['GoogleAudioText'].fillna('')
df['EmbeddedContentText'] = df['EmbeddedContentText'].fillna('')

In [13]:
# make extra column CombinedText with post_body_text and GoogleAudioText and EmbeddedContentText
df['CombinedText'] = df['post_body_text'] + ' ' + df['GoogleAudioText'] + ' ' + df['EmbeddedContentText']
# # make ' ' to NaN in CombinedText
# df['CombinedText'] = df['CombinedText'].replace(' ', pd.NA)

# # make '  ' to ' ', remove extra spaces
# df['CombinedText'] = df['CombinedText'].replace('  ', ' ')

In [14]:
df.head()

Unnamed: 0,PostId,post_body_text,GoogleAudioText,EmbeddedContentText,Platform,ChannelID,CombinedText
0,80074859,While in Camden today I had the privilege to v...,,,Instagram,9955344,While in Camden today I had the privilege to v...
1,80069324,I had a great visit this afternoon with the Wi...,,,Instagram,9955344,I had a great visit this afternoon with the Wi...
2,80072838,"On November 29, the State Canvassing Board cer...",,,Facebook,8894188,"On November 29, the State Canvassing Board cer..."
3,80076411,The State Canvassing Board certified the 2022 ...,,,Twitter,8891709,The State Canvassing Board certified the 2022 ...
4,80107630,RT @BaltCoElections: To all Baltimore County E...,,,Twitter,8891704,RT @BaltCoElections: To all Baltimore County E...


In [15]:
# write to csv
df.to_csv('text_2022.csv', index=False)

In [16]:
# read csv

df_24 = pd.read_csv('2024_No_Lables.csv')

In [17]:
df_24.head()

Unnamed: 0,PostId,PostUrl,PostEngagement,Platform,ChannelID,ChannelName,ChannelUid,ChannelUrl,ChannelEngagement,post_body_text,...,published_at,post_data,post_media_urls,LikesCount,SharesCount,CommentsCount,ViewsCount,post_media_file,embedded_post_text,search_data
0,392353290,https://twitter.com/IowaSOS/status/18677126733...,,Twitter,8403193,Iowa Secretary of State Paul Pate,blank_for_now,blank_for_now,"{""follower_count"":10134,""following_count"":2397...",Happy Birthday to the @USNationalGuard @Nation...,...,2024-12-13T23:26:16.000Z,post data removed,,0.0,0.0,2,193,,,
1,392438642,https://www.facebook.com/IASecretaryofState/po...,,Facebook,8402424,Iowa Secretary of State Paul Pate,blank_for_now,blank_for_now,"{""follower_count"":12966,""following_count"":0,""l...","Happy Birthday to the National Guard! Today, w...",...,2024-12-13T23:26:15.000Z,post data removed,,7.0,0.0,0,0,,,
2,392438644,https://www.facebook.com/reel/601785225881752/,,Facebook,8402424,Iowa Secretary of State Paul Pate,blank_for_now,blank_for_now,"{""follower_count"":12966,""following_count"":0,""l...",Moments of harmony in the Iowa State Capitol. ...,...,2024-12-13T21:48:01.000Z,post data removed,,27.0,9.0,0,205,https://www.junkipedia.org/rails/active_storag...,,
3,392552204,https://www.instagram.com/p/DDiL9SSOtDX/,,InstagramDirect,26302501,Paul Pate,blank_for_now,blank_for_now,"{""follower_count"":1517,""following_count"":697,""...",Moments of harmony in the Iowa State Capitol. ...,...,2024-12-13T21:47:53.000Z,post data removed,,13.0,0.0,0,58,,,
4,392445383,https://www.facebook.com/ARSecofState/posts/pf...,,Facebook,8894196,Arkansas Secretary of State John Thurston,blank_for_now,blank_for_now,"{""follower_count"":5970,""following_count"":0,""li...",,...,2024-12-13T21:06:10.000Z,post data removed,,0.0,0.0,0,0,,"For scheduled maintenance, the Capitol will be...",


In [18]:
len(df_24)

13645

In [19]:
# Keep only the columns we need: PostId, post_body_text, GoogleAudioText, EmbeddedContentText
df_24 = df_24[['PostId', 'post_body_text', 'GoogleAudioText', 'EmbeddedContentText', 'Platform', 'ChannelID']]

# make ' ' to NaN in GoogleAudioText
df_24['GoogleAudioText'] = df_24['GoogleAudioText'].replace(' ', pd.NA)
# make ' ' to NaN in EmbeddedContentText
df_24['EmbeddedContentText'] = df_24['EmbeddedContentText'].replace(' ', pd.NA)

# replace NaN with empty string
df_24['GoogleAudioText'] = df_24['GoogleAudioText'].fillna('')
df_24['EmbeddedContentText'] = df_24['EmbeddedContentText'].fillna('')

df_24['CombinedText'] = df_24['post_body_text'] + ' ' + df_24['GoogleAudioText'] + ' ' + df_24['EmbeddedContentText']

In [20]:
df_24.head()

Unnamed: 0,PostId,post_body_text,GoogleAudioText,EmbeddedContentText,Platform,ChannelID,CombinedText
0,392353290,Happy Birthday to the @USNationalGuard @Nation...,,,Twitter,8403193,Happy Birthday to the @USNationalGuard @Nation...
1,392438642,"Happy Birthday to the National Guard! Today, w...",,,Facebook,8402424,"Happy Birthday to the National Guard! Today, w..."
2,392438644,Moments of harmony in the Iowa State Capitol. ...,A-l-a-l-a-l-a-l-a-l-a-l-a-l-a-l-a-l-a-l-a-l-a...,,Facebook,8402424,Moments of harmony in the Iowa State Capitol. ...
3,392552204,Moments of harmony in the Iowa State Capitol. ...,"Ae, ae, ae, ae, ae, ae, ae, ae, ae, ae, ae, a...",,InstagramDirect,26302501,Moments of harmony in the Iowa State Capitol. ...
4,392445383,,,"For scheduled maintenance, the Capitol will be...",Facebook,8894196,


In [21]:
df_24.to_csv('text_2024.csv', index=False)