# Data Merging

This notebook is used to merge the data from the different years.

In [13]:
import pandas as pd
import numpy as np

In [14]:
df1 = pd.read_csv('../raw/text_2022.csv')
df2 = pd.read_csv('../raw/text_2024.csv')

In [15]:
df1.head(2)

Unnamed: 0,PostId,post_body_text,GoogleAudioText,EmbeddedContentText,Platform,ChannelID,CombinedText
0,80074859,While in Camden today I had the privilege to v...,,,Instagram,9955344,While in Camden today I had the privilege to v...
1,80069324,I had a great visit this afternoon with the Wi...,,,Instagram,9955344,I had a great visit this afternoon with the Wi...


In [16]:
# NaN values in the 'post_body_text' column
df1['post_body_text'].isna().sum()

859

In [17]:
df2.head(2)

Unnamed: 0,PostId,post_body_text,GoogleAudioText,EmbeddedContentText,Platform,ChannelID,CombinedText
0,392353290,Happy Birthday to the @USNationalGuard @Nation...,,,Twitter,8403193,Happy Birthday to the @USNationalGuard @Nation...
1,392438642,"Happy Birthday to the National Guard! Today, w...",,,Facebook,8402424,"Happy Birthday to the National Guard! Today, w..."


In [18]:
# NaN values in the 'post_body_text' column
df2['post_body_text'].isna().sum()

236

In [19]:
# keep only post_body_text and EmbeddedContentText
df1 = df1[['PostId','post_body_text', 'EmbeddedContentText', 'Platform', 'ChannelID']]
df2 = df2[['PostId','post_body_text', 'EmbeddedContentText', 'Platform', 'ChannelID']]

df1['EmbeddedContentText'] = df1['EmbeddedContentText'].replace(' ', pd.NA)
df1['EmbeddedContentText'] = df1['EmbeddedContentText'].fillna('')

df2['EmbeddedContentText'] = df2['EmbeddedContentText'].replace(' ', pd.NA)
df2['EmbeddedContentText'] = df2['EmbeddedContentText'].fillna('')

# make column named POS_tagging_text
df1['POS_tagging_text'] = df1['post_body_text'] + df1['EmbeddedContentText']
df2['POS_tagging_text'] = df2['post_body_text'] + df2['EmbeddedContentText']

In [20]:
# add one df under another
df = pd.concat([df1, df2], ignore_index=True)

In [21]:
print(len(df1))
print(len(df2))

9874
13645


In [22]:
len(df)

23519

In [23]:
df.head(2)

Unnamed: 0,PostId,post_body_text,EmbeddedContentText,Platform,ChannelID,POS_tagging_text
0,80074859,While in Camden today I had the privilege to v...,,Instagram,9955344,While in Camden today I had the privilege to v...
1,80069324,I had a great visit this afternoon with the Wi...,,Instagram,9955344,I had a great visit this afternoon with the Wi...


In [24]:
# keep only PostId and POS_tagging_text
df = df[['PostId', 'POS_tagging_text', 'Platform', 'ChannelID']]

In [25]:
# count NaN values in POS_tagging_text
df['POS_tagging_text'].isna().sum()

1095

In [26]:
len(df)

23519

In [27]:
# remove NaN values
df = df.dropna(subset=['POS_tagging_text'])

In [28]:
len(df)

22424

In [29]:
# print the first 10 rows of POS_tagging_text
df['POS_tagging_text'].head(10)

0     While in Camden today I had the privilege to v...
1     I had a great visit this afternoon with the Wi...
2     On November 29, the State Canvassing Board cer...
3     The State Canvassing Board certified the 2022 ...
4     RT @BaltCoElections: To all Baltimore County E...
5     See you bright and early, Owensboro. https://t...
6                 See you bright and early, Owensboro. 
8     Secretary Ashcroft will work alongside the @Hu...
9     Don’t be a deer in the headlights when it come...
10    ICYMI! If your business is interested in joini...
Name: POS_tagging_text, dtype: object

Clean text for POS tagging

In [30]:
import re
def clean_text(text):
    # If text is not a string (e.g., NaN), return an empty string or handle as needed.
    if not isinstance(text, str):
        return ""
    # Remove 'RT' as a standalone word.
    text = re.sub(r'\bRT\b', '', text)
    # Remove URLs.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove @mentions.
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags.
    text = re.sub(r'#\w+', '', text)
    # Remove extra whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the cleaning function to your text column.
df['POS_tagging_text'] = df['POS_tagging_text'].apply(clean_text)

# Check the cleaned output.
print(df['POS_tagging_text'].head(10))

0     While in Camden today I had the privilege to v...
1     I had a great visit this afternoon with the Wi...
2     On November 29, the State Canvassing Board cer...
3     The State Canvassing Board certified the 2022 ...
4     : To all Baltimore County Election Judges who ...
5                  See you bright and early, Owensboro.
6                  See you bright and early, Owensboro.
8     Secretary Ashcroft will work alongside the , h...
9     Don’t be a deer in the headlights when it come...
10    ICYMI! If your business is interested in joini...
Name: POS_tagging_text, dtype: object


In [31]:
# count NaN values in POS_tagging_text
df['POS_tagging_text'].isna().sum()

0

In [32]:
# count number of POS_tagging_text that are '' or ' '
print(len(df[df['POS_tagging_text'].str.strip() == '']))

# drop rows where POS_tagging_text is '' or ' '
df = df[df['POS_tagging_text'].str.strip() != '']

print(len(df[df['POS_tagging_text'].str.strip() == '']))

113
0


In [33]:
df.head()

Unnamed: 0,PostId,POS_tagging_text,Platform,ChannelID
0,80074859,While in Camden today I had the privilege to v...,Instagram,9955344
1,80069324,I had a great visit this afternoon with the Wi...,Instagram,9955344
2,80072838,"On November 29, the State Canvassing Board cer...",Facebook,8894188
3,80076411,The State Canvassing Board certified the 2022 ...,Twitter,8891709
4,80107630,: To all Baltimore County Election Judges who ...,Twitter,8891704


In [34]:
# save
df.to_csv('../clean/clean_posts_for_POS.csv', index=False)