# Embeddings

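This notebook computes text embeddings for the social media posts of state election officials, merges the posts with state-level metadata for each channel, and records the channel IDs for which no matching state was found.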

In [1]:
import ollama
import pandas as pd

In [2]:
# Load the cleaned posts that were prepared for POS tagging.
df = pd.read_csv(filepath_or_buffer='../data/clean/clean_posts_for_POS.csv')

In [3]:
# Standardise the text column name: 'POS_tagging_text' becomes 'text'.
df = df.rename(columns={'POS_tagging_text': 'text'})

# Preview the first two rows.
df.head(n=2)

Unnamed: 0,PostId,text,Platform,ChannelID
0,80074859,While in Camden today I had the privilege to v...,Instagram,9955344
1,80069324,I had a great visit this afternoon with the Wi...,Instagram,9955344


In [5]:
def get_embeddings(text, model='mxbai-embed-large'):
    """Return the embedding vector for ``text`` from an Ollama model.

    Parameters
    ----------
    text : str
        The post text to embed. Missing values (``None``, ``NaN``, ``pd.NA``)
        are not sent to Ollama.
    model : str, default 'mxbai-embed-large'
        Name of the Ollama embedding model to query.

    Returns
    -------
    list or None
        The value stored under the ``"embedding"`` key of the Ollama
        response, or ``None`` when ``text`` is missing.
    """
    # pd.read_csv turns empty cells into NaN (a float); skip those instead of
    # failing partway through a long ``apply`` over the whole column.
    if not isinstance(text, str) and pd.isna(text):
        return None
    embeddings_dict = ollama.embeddings(model=model, prompt=text)
    return embeddings_dict["embedding"]

# Embed every post: get_embeddings is called once per row of the 'text' column.
df['embeddings'] = df['text'].apply(func=get_embeddings)

In [6]:
# Preview the first five rows, now including the 'embeddings' column.
df.head(n=5)

Unnamed: 0,PostId,text,Platform,ChannelID,embeddings
0,80074859,While in Camden today I had the privilege to v...,Instagram,9955344,"[0.17838525772094727, 0.2974681854248047, 0.38..."
1,80069324,I had a great visit this afternoon with the Wi...,Instagram,9955344,"[0.03257334604859352, -0.10667754709720612, -0..."
2,80072838,"On November 29, the State Canvassing Board cer...",Facebook,8894188,"[-0.4845951199531555, 0.4088035225868225, 0.41..."
3,80076411,The State Canvassing Board certified the 2022 ...,Twitter,8891709,"[-0.5558313727378845, 0.41650187969207764, 0.4..."
4,80107630,: To all Baltimore County Election Judges who ...,Twitter,8891704,"[0.26043787598609924, -0.23166483640670776, 0...."


In [7]:
# Persist the posts together with their embeddings; the index is not written.
df.to_csv(
    path_or_buf='../data/clean/post_with_embeddings.csv',
    index=False,
)

In [8]:
# Load the state/platform metadata table and preview its first rows.
df_info = pd.read_csv(filepath_or_buffer='../data/clean/merged_stateplatform.csv')
df_info.head(n=5)

Unnamed: 0,State,tenure_2024,Platform,ChannelId,prez_winner_2024,ceo_party,battleground_2020,battleground_2024
0,Alabama,1,Facebook,8894190,R,R,0,0
1,Alabama,1,FacebookDirect,26303262,R,R,0,0
2,Alabama,1,Twitter,8889063,R,R,0,0
3,Alaska,1,Facebook,8894198,R,R,0,0
4,Alaska,1,FacebookDirect,26302357,R,R,0,0


In [11]:
# Number of posts before the state metadata is merged in.
print(df.shape[0])

22311


In [14]:
# Align the channel-id column name with the posts frame so both share the join keys.
df_info = df_info.rename(columns={'ChannelId': 'ChannelID'})

# Left join: keep every post and attach the state metadata where a match exists.
merged_df = df.merge(right=df_info, how='left', on=['Platform', 'ChannelID'])

# Row count after the merge.
print(merged_df.shape[0])

22311


In [15]:
# Preview the first five merged rows.
merged_df.head(n=5)

Unnamed: 0,PostId,text,Platform,ChannelID,embeddings,State,tenure_2024,prez_winner_2024,ceo_party,battleground_2020,battleground_2024
0,80074859,While in Camden today I had the privilege to v...,Instagram,9955344,"[0.17838525772094727, 0.2974681854248047, 0.38...",,,,,,
1,80069324,I had a great visit this afternoon with the Wi...,Instagram,9955344,"[0.03257334604859352, -0.10667754709720612, -0...",,,,,,
2,80072838,"On November 29, the State Canvassing Board cer...",Facebook,8894188,"[-0.4845951199531555, 0.4088035225868225, 0.41...",Minnesota,9.0,D,D,1.0,0.0
3,80076411,The State Canvassing Board certified the 2022 ...,Twitter,8891709,"[-0.5558313727378845, 0.41650187969207764, 0.4...",Minnesota,9.0,D,D,1.0,0.0
4,80107630,: To all Baltimore County Election Judges who ...,Twitter,8891704,"[0.26043787598609924, -0.23166483640670776, 0....",Maryland,1.0,D,,0.0,0.0


In [16]:
# How many merged rows have no State value.
state_nan_count = merged_df['State'].isna().sum()
print(f"NaNs in State column: {state_nan_count}")

# Distinct channels behind those unmatched rows.
nan_channel_ids = merged_df.loc[merged_df['State'].isna(), 'ChannelID'].unique()
print(f"Unique ChannelIDs with missing State: {len(nan_channel_ids)}")
print(nan_channel_ids)

NaNs in State column: 3355
Unique ChannelIDs with missing State: 48
[ 9955344  8891705  8891734  8897659  9331001  6859170  5250362  9955346
  9955359  8891707  8891714  9955356  9955353  8891710  3505415  9955362
  8889065  8402625  9330993  9955360  5451003  9955361  8894202  8894178
  9955350  8894182  8909405  8891736  9955348  8891735  9955354  8891723
  8894191  8891724  9955355  8894194  1364218  9703931  8894197  7397383
  9955342  6435628  9955352  9955358  6189337  8894192  8891741 26303180]


In [17]:
# Write the channel ids that lack a state to disk (single column, no index).
pd.DataFrame({'ChannelID_missing_state': nan_channel_ids}).to_csv(
    '../data/clean/nan_channel_ids.csv',
    index=False,
)