In [1]:
# import dependencies

import re
import os
import csv
import pandas as pd
from dotenv import load_dotenv
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import numpy as np

# uncomment below if running locally and wordcloud exists in your environment
# from wordcloud import WordCloud 

%matplotlib inline

### Process 2020 Social Media Post Data

In [2]:
# Define the file path
file_path = "../Source_data/posts_2020.csv"

# Initialize an empty DataFrame to hold the filtered data
posts_2020_df = pd.DataFrame()

# Define the acceptable values for the 'country' column
acceptable_countries = ["United States", "United States of America"]

# Process the file in chunks to handle large files
chunk_size = 10**6  # Adjust the chunk size based on your system's memory capacity
for chunk in pd.read_csv(
    file_path,
    engine='python',                  # Use Python engine to handle more complex cases
    on_bad_lines='skip',              # Skip problematic lines
    delimiter=',',                    # Ensure correct delimiter
    chunksize=chunk_size              # Read the file in chunks
):
    # Filter rows based on the 'country' column
    filtered_chunk = chunk[chunk['country'].isin(acceptable_countries)]
    # Append the filtered chunk to the main DataFrame
    posts_2020_df = pd.concat([posts_2020_df, filtered_chunk], ignore_index=True)

# Verify the result
posts_2020_df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1.316529221557252e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666534.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,...,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00
1,2020-10-15 00:00:02,1.316529228091847e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",...,1185.0,Portland,45.5202471,-122.6741949,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
2,2020-10-15 00:00:08,1.3165292523014513e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413798.0,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",...,5393.0,Washington DC,38.8949924,-77.0365581,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121
3,2020-10-15 00:00:17,1.316529291052675e+18,@CLady62 Her 15 minutes were over long time ag...,2.0,0.0,Twitter for Android,1138416104.0,Farris Flagg,FarrisFlagg,#BidenHarris2020 #JoeBiden2020 #KamalaHarrisFo...,...,2363.0,"Perris,California",33.7825194,-117.2286478,,United States of America,North America,California,CA,2020-10-21 00:00:01.866082651
4,2020-10-15 00:00:18,1.3165292934979625e+18,@DeeviousDenise @realDonaldTrump @nypost There...,0.0,0.0,Twitter for iPhone,9.007610716314296e+17,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,"Patriot, Wife, “Shaken not Stirred” Mom of two...",...,766.0,"Ohio, USA",40.2253569,-82.6881395,,United States of America,North America,Ohio,OH,2020-10-21 00:00:02.612515712


In [3]:
# Restrict dataframe to columns of interest

# Rename columns
posts_2020_df.rename(columns={
    'user_screen_name': 'handle',
    'tweet': 'text',
    'created_at': 'time',
    'tweet_id': 'id'
}, inplace=True)

# Select only the required columns
posts_2020_df = posts_2020_df[['id', 'handle', 'text', 'time']]

# Verify the result
posts_2020_df.head()

Unnamed: 0,id,handle,text,time
0,1.316529221557252e+18,elsollatinonews,#Elecciones2020 | En #Florida: #JoeBiden dice ...,2020-10-15 00:00:01
1,1.316529228091847e+18,snarke,"#Trump: As a student I used to hear for years,...",2020-10-15 00:00:02
2,1.3165292523014513e+18,Ranaabtar,You get a tie! And you get a tie! #Trump ‘s ra...,2020-10-15 00:00:08
3,1.316529291052675e+18,FarrisFlagg,@CLady62 Her 15 minutes were over long time ag...,2020-10-15 00:00:17
4,1.3165292934979625e+18,sm_gulledge,@DeeviousDenise @realDonaldTrump @nypost There...,2020-10-15 00:00:18


In [5]:
sentiments20 = []

for comment in posts_2020_df['text']:
    try:
        # Ensure comment is a string
        if not isinstance(comment, str):
            raise ValueError("Comment is not a string")
            
        results = analyzer.polarity_scores(comment)
               
        sentiments20.append({
            'Compound': results['compound'],
            'Positive': results['pos'],
            'Negative': results['neg'],
            'Neutral': results['neu'],
            'text': comment,
        })
        
    except AttributeError:
        pass
    
sm20 = pd.DataFrame(sentiments20)
sm20.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,..."
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...


In [6]:
#summary statistics sm20
sm20.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,213259.0,213259.0,213259.0,213259.0
mean,-0.009011,0.08017,0.080945,0.838883
std,0.476845,0.105996,0.110512,0.145215
min,-0.9995,0.0,0.0,0.012
25%,-0.3612,0.0,0.0,0.741
50%,0.0,0.024,0.0,0.848
75%,0.3612,0.138,0.141,1.0
max,0.9987,0.958,0.988,1.0


In [None]:
# Export CSV file with summary info

In [15]:
# Function to calculate sentiment score for the entire text
def get_text_sentiment(text):
    return analyzer.polarity_scores(text)['compound']

# Function to tokenize text and return only alphabetic tokens
def tokenize_text(text):
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha()]

# Process chunks of data to avoid memory issues
def process_chunk(chunk):
    chunk = chunk.copy()  # Make a copy to avoid SettingWithCopyWarning
    
    # Calculate text sentiment
    chunk['text_sentiment'] = chunk['text'].apply(get_text_sentiment)
    
    # Tokenize and explode tokens
    tokens_df = chunk['text'].apply(lambda x: tokenize_text(x))
    tokens_exploded = tokens_df.explode().reset_index(drop=True)
    tokens_exploded.name = 'token'
    tokens_exploded = tokens_exploded.to_frame().join(chunk[['text_sentiment']].reset_index(drop=True), how='left')
    
    # Calculate token counts and average sentiment
    token_counts = tokens_exploded['token'].value_counts().reset_index()
    token_counts.columns = ['token', 'token_count']
    token_sentiments = tokens_exploded.groupby('token')['text_sentiment'].mean().reset_index(name='avg_sentiment')
    
    # Merge counts and sentiments
    token_summary = pd.merge(token_counts, token_sentiments, on='token')
    
    return token_summary

# Create a generator for processing chunks
def chunk_generator(df, chunk_size):
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start:start + chunk_size].copy()

# Process data in chunks
chunk_size = 1000  # Adjust chunk size based on available memory
processed_chunks = [process_chunk(chunk) for chunk in chunk_generator(posts_2020_df, chunk_size)]
sm20_processed = pd.concat(processed_chunks, ignore_index=True)

# Clean up memory
del processed_chunks
gc.collect()

# View the processed DataFrame
sm20_processed.head()

Unnamed: 0,token,token_count,avg_sentiment
0,trump,1033,-0.040622
1,the,830,-0.056141
2,to,581,0.002248
3,https,575,-0.140759
4,a,384,-0.014446


In [None]:
# Create a DataFrame to align with the `sm20_with_candidates`
# For demonstration, assuming that sm20_with_candidates has 'text' and 'candidate' columns.
def expand_with_candidates(df):
    # Explode the token column
    token_expanded = df.explode('token').reset_index(drop=True)
    
    # Merge with the original sentiment DataFrame
    df_expanded = pd.merge(token_expanded, sm20_with_candidates[['text', 'candidate']], on='text', how='left')
    
    # Calculate unique token counts for each candidate
    unique_token_counts = df_expanded.groupby(['candidate', 'token']).size().reset_index(name='unique_token_count')
    
    # Merge with average sentiment scores
    final_df = pd.merge(df_expanded, unique_token_counts, on=['candidate', 'token'], how='left')
    
    return final_df

# Call the function to expand with candidates
token_sentiments_2020 = expand_with_candidates(sm20_processed)

# View the final DataFrame
print(token_sentiments_2020.head())


In [None]:
# Average sentiment score per token
average_sentiments_2020 = sm20_processed.sort_values(by='avg_sentiment', ascending=False).reset_index(drop=True)
print("Average sentiment scores per token for 2020:")
print(average_sentiments_2020)

In [None]:
# Filter out tokens that are not English words
english_words = set(words.words())
filtered_tokens_df = sm20_processed[sm20_processed['token'].isin(english_words)]

# Sort by average sentiment in descending order
average_sentiments_2020 = filtered_tokens_df.sort_values(by='avg_sentiment', ascending=False).reset_index(drop=True)

print("Average sentiment scores per token for 2020 (filtered for English words):")
print(average_sentiments_2020)

In [None]:
# Define a function to determine the candidate based on keywords in the text
def determine_candidate(text):
    text = text.lower()
    republican_keywords = ['trump', 'donald']
    democrat_keywords = ['joe', 'biden']
    
    if any(keyword in text for keyword in democrat_keywords):
        return 'democrat'
    elif any(keyword in text for keyword in republican_keywords):
        return 'republican'
    else:
        return 'Unknown'

In [None]:
# Apply the function to the 'text' column of posts_2020_df to create the 'candidate' column
posts_2020_df['candidate'] = posts_2020_df['text'].apply(determine_candidate)

# Verify the result
posts_2020_df.head()

In [None]:
# Remove entries where the 'candidate' column is 'Unknown'
posts_2020_df = posts_2020_df[posts_2020_df['candidate'] != 'Unknown']

# Verify the result
posts_2020_df.head()

In [None]:
# Count the number of 'Unknown' entries in the 'candidate' column
unknown_count = posts_2020_df['candidate'].value_counts().get('Unknown', 0)

print(f"Number of 'Unknown' entries: {unknown_count}")

In [None]:
# Function to tokenize text
def tokenize_text(text):
    # Your implementation for tokenizing text
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lower case
    return [token for token in tokens if token.isalpha()]  # Return only alphabetic tokens

# Function to process each chunk
def process_chunk(chunk, token_sentiments_df):
    # Explode tokens in the chunk
    chunk['tokens'] = chunk['text'].apply(tokenize_text)
    chunk_expanded = chunk.explode('tokens')
    chunk_expanded.rename(columns={'tokens': 'token'}, inplace=True)
    
    # Merge with token sentiments data
    merged_chunk = chunk_expanded.merge(token_sentiments_df, on='token', how='left')
    
    # Aggregate sentiment values and token counts by candidate and token
    token_sentiments_chunk = merged_chunk.groupby(['candidate', 'token']).agg({
        'sentiment_value': 'mean',
        'unique_token_count': 'sum'
    }).reset_index()
    
    return token_sentiments_chunk

In [None]:
# Initialize an empty DataFrame to hold the aggregated results
aggregated_results = pd.DataFrame()

# Process data in chunks
num_chunks = len(sm20_with_candidates) // chunk_size + 1
for i in range(num_chunks):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, len(sm20_with_candidates))
    chunk = sm20_with_candidates.iloc[start:end].copy()  # Make a copy to avoid SettingWithCopyWarning
    
    token_sentiments_chunk = process_chunk(chunk, sm20_processed)
    
    # Append the results for each chunk
    aggregated_results = pd.concat([aggregated_results, token_sentiments_chunk], ignore_index=True)
    
    # Clean up memory
    del chunk, token_sentiments_chunk
    gc.collect()

# Final aggregation to ensure no duplicates and accurate counts
final_results = aggregated_results.groupby(['candidate', 'token']).agg({
    'sentiment_value': 'mean',
    'unique_token_count': 'sum'
}).reset_index()

# Verify the result
final_results.head()


In [None]:
# Merge sm20 with posts_2020_df on the 'text' column to add 'candidate'
sm20_with_candidates = pd.merge(sm20, posts_2020_df[['text', 'candidate']], on='text', how='left')

# View dataframe
sm20_with_candidates.head()

In [None]:
# Identify all token columns
token_columns = [col for col in sm20_with_candidates.columns if col.startswith('token_')]

# Ensure that the value name does not match any existing column names
# Checking for potential conflicts
existing_columns = set(sm20_with_candidates.columns)
unique_value_name = 'token_count' if 'token_count' not in existing_columns else 'token_count_unique'

# Reshape the DataFrame from wide to long format for tokens
tokens_long_df = sm16_with_candidates.melt(
    id_vars=['text', 'candidate'], 
    value_vars=token_columns,
    var_name='token',
    value_name=unique_value_name
)

# Filter out rows where token_count is 0
tokens_long_df = tokens_long_df[tokens_long_df[unique_value_name] > 0]

# Melt sentiment columns to long format
sentiment_columns = ['Compound', 'Positive', 'Negative', 'Neutral']
sentiments_long_df = sm16_with_candidates.melt(
    id_vars=['text', 'candidate'],
    value_vars=sentiment_columns,
    var_name='sentiment_type',
    value_name='sentiment_value'
)

# Merge token presence with sentiment values on 'text' and 'candidate'
merged_df = tokens_long_df.merge(sentiments_long_df, on=['text', 'candidate'])

# Aggregate sentiment values by candidate and token
token_sentiments_2016 = merged_df.groupby(['candidate', 'token']).agg({
    'sentiment_value': 'mean',
    unique_value_name: 'sum'
}).reset_index()

# Verify the result
token_sentiments_2016.head()

In [None]:
# Aggregate sentiment scores by candidate

# Calculate weighted average sentiment score for each candidate
# This step uses the 'sentiment_value' and 'token_count_unique' columns
candidate_sentiments = token_sentiments_2016.groupby('candidate').apply(
    lambda df: (df['sentiment_value'] * df['token_count_unique']).sum() / df['token_count_unique'].sum()
).reset_index(name='average_sentiment')

# Display the aggregated sentiment scores
print("Average sentiment scores by candidate:")
print(candidate_sentiments)

In [None]:
# Combine sentiment scores with other features

# Merge candidate sentiments with posts_2016_df
# Make sure to use the 'candidate' column for merging
sent_2016_df = pd.merge(posts_2016_df, candidate_sentiments, on='candidate', how='left')

# Display the final DataFrame with the required columns
sent_2016_df = sent_2016_df[['id', 'text', 'time', 'candidate', 'average_sentiment']]
sent_2016_df.head()

In [None]:
# Edit the time column to display only the year

# Ensure the 'time' column is in datetime format
sent_2016_df['time'] = pd.to_datetime(sent_2016_df['time'], errors='coerce')

# Extract the year from the datetime column and update the 'time' column
sent_2016_df['time'] = sent_2016_df['time'].dt.year

# Display the modified DataFrame
sent_2016_df.head()

In [None]:
# Save the final 2016 DataFrame to a CSV file
sent_2016_df.to_csv('../CSV_Outputs/sent_2016_df.csv', index=False)

In [None]:
# Plot average sentiment scores for each candidate

# Define color palette to match the colleague's visual
color_palette = {'Democrat': 'blue', 'Republican': 'red'}

# Create a seaborn bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='candidate', y='average_sentiment', data=candidate_sentiments, palette=color_palette)

# Add labels and title
plt.xlabel('Candidate')
plt.ylabel('Average Sentiment Score')
plt.title('Average Sentiment Score by Candidate')

# Show the plot
plt.show()

In [None]:
# Display positive, negative, and neutral sentiment scores by candidate

# Aggregate sentiment scores by candidate
sentiment_summary = sm16_with_candidates.groupby('candidate')[['Positive', 'Negative', 'Neutral']].sum().reset_index()

# Define color palette for sentiments
color_palettes = {
    'Democrat': {
        'Positive': 'blue',  # Replace 'strongblue' with actual color code
        'Negative': 'lightblue',   # Replace 'palerblue' with actual color code
        'Neutral': 'lightgrey'
    },
    'Republican': {
        'Positive': 'darkred',     # Strong red
        'Negative': 'lightcoral',  # Pale red
        'Neutral': 'lightgrey'
    }
}

# Define a function to plot a pie chart for a given candidate
def plot_pie_chart(candidate, positive, negative, neutral):
    sentiments = [positive, negative, neutral]
    labels = ['Positive', 'Negative', 'Neutral']
    colors = color_palettes.get(candidate, {
        'Positive': 'grey',
        'Negative': 'grey',
        'Neutral': 'grey'
    }).values()
    
    plt.figure(figsize=(8, 8))
    plt.pie(sentiments, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.title(f'Sentiment Distribution for {candidate}')
    plt.show()

# Plot pie charts for each candidate
for index, row in sentiment_summary.iterrows():
    candidate = row['candidate']
    positive = row['Positive']
    negative = row['Negative']
    neutral = row['Neutral']
    plot_pie_chart(candidate, positive, negative, neutral)


In [None]:
# Generate a Word Cloud by candidate (uncomment below if running in CoLab)

# !pip install wordcloud
# from wordcloud import WordCloud

In [None]:
# Aggregate token counts by candidate
token_frequencies = token_sentiments_2016.groupby('candidate')['token_count'].sum().reset_index()

# Define a function to generate and display a word cloud for a given candidate
def generate_word_cloud(candidate, token_counts):
    # Create a dictionary of token frequencies
    token_freq_dict = dict(zip(token_counts['token'], token_counts['token_count']))
    
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='Blues').generate_from_frequencies(token_freq_dict)
    
    # Plot word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # No axes for word cloud
    plt.title(f'Word Cloud for {candidate}')
    plt.show()

# Generate word clouds for each candidate
candidates = token_sentiments_2016['candidate'].unique()
for candidate in candidates:
    # Filter token counts for the current candidate
    candidate_tokens = token_sentiments_2016[token_sentiments_2016['candidate'] == candidate]
    
    # Generate and display word cloud
    generate_word_cloud(candidate, candidate_tokens)