In [12]:
# import dependencies

import re
import os
import csv
import pandas as pd
from dotenv import load_dotenv
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

### Process 2016 Social Media Post Data

In [2]:
# Read in 2016 CSV file
posts_2016_df = pd.read_csv("../Source_Data/posts_2016.csv",)

# Display sample data
posts_2016_df.head(5)

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",


In [3]:
# Restrict dataframe to columns of interest

# Define the columns to keep
columns_to_keep = ['id', 'handle', 'text', 'time']

# Drop all columns except the specified ones
posts_2016_df = posts_2016_df[columns_to_keep]

# View dataframe
posts_2016_df.head()

Unnamed: 0,id,handle,text,time
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",2016-09-27T23:45:00
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,2016-09-27T23:26:40
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27


In [4]:
sentiments = []

for comment in posts_2016_df['text']:
    try:
        # text = comment['text']
        # results = analyzer.polarity_scores(text)
        results = analyzer.polarity_scores(comment)
        compound = results['compound']
        pos = results['pos']
        neu = results['neu']
        neg = results['neg']
        
        sentiments.append({
            'Compound': compound,
            'Positive': pos,
            'Negative': neg,
            'Neutral': neu,
            # 'text': text,
            'text': comment,
        })
    # except AttributeError:
    #     pass

    except Exception as e:
        print(f"Error processing comment: {e}")
        continue
    
sm16 = pd.DataFrame(sentiments)
sm16.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes..."
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't..."
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...


In [5]:
#summary statistics sm16
sm16.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,6444.0,6444.0,6444.0,6444.0
mean,0.13463,0.148067,0.075284,0.776647
std,0.501748,0.154608,0.113624,0.160151
min,-0.9652,0.0,0.0,0.204
25%,-0.18005,0.0,0.0,0.667
50%,0.07375,0.122,0.0,0.7765
75%,0.5461,0.25,0.136,0.891
max,0.9735,0.796,0.773,1.0


In [6]:
# Tokenize the text and calculate sentiment for each token
def get_token_sentiments_2016(text):
    tokens16 = word_tokenize(text.lower())  # Tokenize and convert to lower case
    token_sentiments_2016 = {}
    
    for token in tokens16:
        if token.isalpha():  # Consider only alphabetic tokens
            # Analyze sentiment for each token in the context of the whole text
            sentiment_2016 = analyzer.polarity_scores(token)
            token_sentiments_2016[token] = sentiment_2016['compound']
    
    return token_sentiments_2016

In [7]:
# Apply the function to each row and expand the results into a DataFrame
def process_text_2016(text):
    token_sentiments_2016 = get_token_sentiments_2016(text)
    return token_sentiments_2016

In [8]:
# Apply the function to the 'text' column
sm16['token_sentiments_2016'] = sm16['text'].apply(process_text_2016)

# Expand token sentiments into separate columns
tokens_2016_df = pd.json_normalize(sm16['token_sentiments_2016'])

# Rename columns in tokens_df to avoid overlap with original sm16 columns
tokens_2016_df.columns = [f'token_{col}' for col in tokens_2016_df.columns]

sm16 = sm16.drop(columns=['token_sentiments_2016']).join(tokens_2016_df)

# Analyze and visualize token sentiment
sm16.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,token_the,token_question,token_in,token_this,token_election,...,token_morocco,token_illustrates,token_duh,token_lilredfrmkokomo,token_lowell,token_iloveidevices,token_minimizing,token_dependency,token_salriccobono,token_troyconway
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",,,,,,...,,,,,,,,,,
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,,,,,,...,,,,,,,,,,
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",,,,,,...,,,,,,,,,,
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,,,,,,...,,,,,,,,,,


In [9]:
# Average sentiment score per token
average_sentiments_2016 = tokens_2016_df.mean().sort_values(ascending=False)
print("Average sentiment scores per token:")
print(average_sentiments_2016)

Average sentiment scores per token:
token_magnificently    0.6597
token_sweetheart       0.6486
token_glorious         0.6369
token_greatest         0.6369
token_best             0.6369
                        ...  
token_hell            -0.6808
token_terrorist       -0.6908
token_kill            -0.6908
token_murder          -0.6908
token_rapist          -0.7096
Length: 8496, dtype: float64


In [15]:
# Define a function to determine the candidate based on the handle
def determine_candidate_from_handle(handle):
    handle = handle.lower()  # Convert handle to lower case for case-insensitive comparison
    
    # Define patterns for combined keywords
    democrat_keywords = ['hillaryclinton']
    republican_keywords = ['realdonaldtrump']
    
    # Check if any of the Democrat patterns are present in the handle
    if any(keyword in handle for keyword in democrat_keywords):
        return 'Democrat'
    
    # Check if any of the Republican patterns are present in the handle
    elif any(keyword in handle for keyword in republican_keywords):
        return 'Republican'
    
    # Return 'Unknown' if no patterns are matched
    else:
        return 'Unknown'

In [16]:
# Apply the function to the 'handle' column to create/update the 'candidate' column
posts_2016_df['candidate'] = posts_2016_df['handle'].apply(determine_candidate_from_handle)

# Optional: Verify the result
posts_2016_df.head()

Unnamed: 0,id,handle,text,time,candidate
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,2016-09-28T00:22:34,Democrat
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",2016-09-27T23:45:00,Democrat
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,2016-09-27T23:26:40,Democrat
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",2016-09-27T23:08:41,Democrat
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,2016-09-27T22:30:27,Democrat


In [17]:
# Count the number of 'Unknown' entries in the 'candidate' column
unknown_count = posts_2016_df['candidate'].value_counts().get('Unknown', 0)

print(f"Number of 'Unknown' entries: {unknown_count}")

Number of 'Unknown' entries: 0


In [None]:
# correct code below

In [18]:
# Assign tokens to candidates

# Calculate frequency and sentiment score for each token
token_sentiments_2016 = tokens_2016_df.groupby(['candidate', 'token']).agg({
    'sentiment': ['mean', 'count']  # Mean sentiment and count of tokens
}).reset_index()

# Flatten the multi-level columns
token_sentiments_2016.columns = ['candidate', 'token', 'avg_sentiment', 'token_count']

KeyError: 'candidate'

In [None]:
# correct code below

In [None]:
# Aggregate sentiment scores by candidate
candidate_sentiment = token_sentiments.groupby('candidate').agg({
    'avg_sentiment': lambda x: (x * token_sentiments[token_sentiments['candidate'] == x.name]['token_count']).sum() / token_sentiments[token_sentiments['candidate'] == x.name]['token_count'].sum(),  # Weighted average sentiment
    'token_count': 'sum'
}).reset_index()

# Rename columns for clarity
candidate_sentiment.columns = ['candidate', 'overall_sentiment_score', 'total_token_count']


### Process 2020 Social Media Post Data

In [None]:
# Define the file path
file_path = "../Source_data/posts_2020.csv"

# Initialize an empty DataFrame to hold the filtered data
posts_2020_df = pd.DataFrame()

# Define the acceptable values for the 'country' column
acceptable_countries = ["United States", "United States of America"]

# Process the file in chunks to handle large files
chunk_size = 10**6  # Adjust the chunk size based on your system's memory capacity
for chunk in pd.read_csv(
    file_path,
    engine='python',                  # Use Python engine to handle more complex cases
    on_bad_lines='skip',              # Skip problematic lines
    delimiter=',',                    # Ensure correct delimiter
    chunksize=chunk_size              # Read the file in chunks
):
    # Filter rows based on the 'country' column
    filtered_chunk = chunk[chunk['country'].isin(acceptable_countries)]
    # Append the filtered chunk to the main DataFrame
    posts_2020_df = pd.concat([posts_2020_df, filtered_chunk], ignore_index=True)

# Verify the result
posts_2020_df.head()

In [None]:
# Restrict dataframe to columns of interest

# Rename columns
posts_2020_df.rename(columns={
    'user_screen_name': 'handle',
    'tweet': 'text',
    'created_at': 'time',
    'tweet_id': 'id'
}, inplace=True)

# Select only the required columns
posts_2020_df = posts_2020_df[['id', 'handle', 'text', 'time']]

# Verify the result
posts_2020_df.head()

In [None]:
sentiments2 = []

for comment in posts_2020_df['text']:
    try:
        # Directly use the comment since it's already a string
        text = comment
        results = analyzer.polarity_scores(text)
        compound = results['compound']
        pos = results['pos']
        neu = results['neu']
        neg = results['neg']
        
        sentiments2.append({
            'Compound': compound,
            'Positive': pos,
            'Negative': neg,
            'Neutral': neu,
            'text': text,
        })
    except AttributeError:
        pass
    
sm20 = pd.DataFrame(sentiments2)
sm20.head()

In [None]:
#summary statistics sm20
sm20.describe()

In [None]:
# Tokenize the text and calculate sentiment for each token
def get_token_sentiments_2016(text):
    tokens16 = word_tokenize(text.lower())  # Tokenize and convert to lower case
    token_sentiments_2016 = {}
    
    for token in tokens16:
        if token.isalpha():  # Consider only alphabetic tokens
            # Analyze sentiment for each token in the context of the whole text
            sentiment_2016 = analyzer.polarity_scores(token)
            token_sentiments_2016[token] = sentiment_2016['compound']
    
    return token_sentiments_2016

In [None]:
# Apply the function to each row and expand the results into a DataFrame
def process_text_2016(text):
    token_sentiments_2016 = get_token_sentiments_2016(text)
    return token_sentiments_2016

In [None]:
# Apply the function to the 'text' column
sm16['token_sentiments_2016'] = sm16['text'].apply(process_text_2016)

# Expand token sentiments into separate columns
tokens_2016_df = pd.json_normalize(sm16['token_sentiments_2016'])

# Rename columns in tokens_df to avoid overlap with original sm16 columns
tokens_2016_df.columns = [f'token_{col}' for col in tokens_2016_df.columns]

sm16 = sm16.drop(columns=['token_sentiments_2016']).join(tokens_2016_df)

# Analyze and visualize token sentiment
sm16.head()

In [None]:
# Average sentiment score per token
average_sentiments = tokens_df.mean().sort_values(ascending=False)
print("Average sentiment scores per token:")
print(average_sentiments)

In [None]:
# Define a function to determine the candidate based on the text
def determine_candidate_2016(text):
    text = text.lower()  # Convert text to lower case for case-insensitive comparison
    if any(keyword in text for keyword in ['hillary', 'clinton']):
        return 'Democrat'
    elif any(keyword in text for keyword in ['donald', 'trump']):
        return 'Republican'
    else:
        return 'Unknown'  # Or another label for texts that don't match

In [None]:
# Apply the function to the 'text' column to create the 'candidate' column
posts_2016_df['candidate'] = posts_2016_df['text'].apply(determine_candidate_2016)

# Verify the result
posts_2016_df.head()

In [None]:
# Count the number of 'Unknown' entries in the 'candidate' column
unknown_count = posts_2016_df['candidate'].value_counts().get('Unknown', 0)

print(f"Number of 'Unknown' entries: {unknown_count}")

In [None]:
# # reading datasets 
# trump = pd.read_csv("hashtag_donaldtrump.csv", lineterminator='\n') 
# print(trump.head(3)) 

In [None]:
# # Display all the columns in the DataFrame 
# print(trump.columns)

In [None]:
# biden = pd.read_csv("hashtag_joebiden.csv", lineterminator='\n') 
# print(biden.head(2)) 

In [None]:
# print(trump.shape) 
# print(biden.shape)

In [None]:
# # Getting trump dataset information 
# trump.info() 

In [None]:
# # Getting biden dataset information 
# biden.info() 

In [None]:
# # creating a new column 'candidate' todifferentiate 
# # between tweets of Trump and Biden upon concatination 
# trump['candidate'] = 'trump'

# # biden dataframe 
# biden['candidate'] = 'biden'

# # combining the dataframes 
# data = pd.concat([trump, biden]) 

# # FInal data shape 
# print('Final Data Shape :', data.shape) 

# # View the first 2 rows 
# print("\nFirst 2 rows:") 
# print(data.head(3)) 

In [None]:
# # dropping null values if they exist 
# data.dropna(inplace=True) 

In [None]:
# data['country'].value_counts()

In [None]:
# data['country'] = data['country'].replace({'United States of America': "US", 
# 										'United States': "US"}) 

In [None]:
# # Group the data by 'candidate' and count the 
# # number of tweets for each candidate 
# tweets_count = data.groupby('candidate')['tweet'].count().reset_index() 

# # Interactive bar chart 
# fig = px.bar(tweets_count, x='candidate', y='tweet', color='candidate', 
# 			color_discrete_map={'Trump': 'pink', 'Biden': 'blue'}, 
# 			labels={'candidate': 'Candidates', 'tweet': 'Number of Tweets'}, 
# 			title='Tweets for Candidates') 

# # Show the chart 
# fig.show() 

In [None]:
# # Interactive bar chart 
# likes_comparison = data.groupby('candidate')['likes'].sum().reset_index() 
# fig = px.bar(likes_comparison, x='candidate', y='likes', color='candidate', 
# 			color_discrete_map={'Trump': 'blue', 'Biden': 'green'}, 
# 			labels={'candidate': 'Candidate', 'likes': 'Total Likes'}, 
# 			title='Comparison of Likes') 

# # Update the layout with a black theme 
# fig.update_layout(plot_bgcolor='black', 
# 				paper_bgcolor='black', font_color='white') 

# # Show the chart 
# fig.show() 

In [None]:
# # Top10 Countrywise tweets Counts 
# top10countries = data.groupby('country')['tweet'].count( 
# ).sort_values(ascending=False).reset_index().head(10) 
# # top10countries 

# # Interactive bar chart 
# fig = px.bar(top10countries, x='country', y='tweet', 
# 			template='plotly_dark', 
# 			color_discrete_sequence=px.colors.qualitative.Dark24_r, 
# 			title='Top10 Countrywise tweets Counts') 

# # To view the graph 
# fig.show() 

In [None]:
# # the number of tweets done for each 
# # candidate by all the countries. 
# tweet_df = data.groupby(['country', 'candidate'])[ 
# 	'tweet'].count().reset_index() 

# # Candidate for top 10 country tweet 
# tweeters = tweet_df[tweet_df['country'].isin(top10countries.country)] 

# # Plot for tweet counts for each candidate 
# # in the top 10 countries 
# fig = px.bar(tweeters, x='country', y='tweet', color='candidate', 
# 			labels={'country': 'Country', 'tweet': 'Number of Tweets', 
# 					'candidate': 'Candidate'}, 
# 			title='Tweet Counts for Each Candidate in the Top 10 Countries', 
# 			template='plotly_dark', 
# 			barmode='group') 

# # Show the chart 
# fig.show() 

In [None]:
# def clean(text): 
# 	# Remove URLs 
# 	text = re.sub(r'https?://\S+|www\.\S+', '', str(text)) 

# 	# Convert text to lowercase 
# 	text = text.lower() 

# 	# Replace anything other than alphabets a-z with a space 
# 	text = re.sub('[^a-z]', ' ', text) 

# 	# Split the text into single words 
# 	text = text.split() 

# 	# Initialize WordNetLemmatizer 
# 	lm = WordNetLemmatizer() 

# 	# Lemmatize words and remove stopwords 
# 	text = [lm.lemmatize(word) for word in text if word not in set( 
# 		stopwords.words('english'))] 

# 	# Join the words back into a sentence 
# 	text = ' '.join(word for word in text) 

# 	return text 

In [None]:
# def getpolarity(text): 
# 	return TextBlob(text).sentiment.polarity 

# def getsubjectivity(text): 
# 	return TextBlob(text).sentiment.subjectivity 

# def getAnalysis(score): 
# 	if score < 0: 
# 		return 'negative'
# 	elif score == 0: 
# 		return 'neutral'
# 	else: 
# 		return 'positive'

<!-- ## Donald Trump Sentiment Analysis -->

In [None]:
# trump_tweets = data[data['candidate'] == 'trump'] 

# # taking only U.S. country data 
# trump_tweets = trump_tweets.loc[trump_tweets.country == 'US'] 
# trump_tweets = trump_tweets[['tweet']] 
# print(trump_tweets.head()) 

In [None]:
# trump_tweets['cleantext'] = trump_tweets['tweet'].apply(clean) 
# print(trump_tweets.head()) 

In [None]:
# trump_tweets['subjectivity'] = trump_tweets['cleantext'].apply(getsubjectivity) 

In [None]:
# trump_tweets['polarity'] = trump_tweets['cleantext'].apply(getpolarity) 

In [None]:
# trump_tweets['analysis'] = trump_tweets['polarity'].apply(getAnalysis) 
# trump_tweets.head() 

In [None]:
# # how much data is positive/negetive/neutral 
# plt.style.use('dark_background') # Adding black theme 

# # Define colors for each bar 
# colors = ['orange', 'blue', 'red'] 

# plt.figure(figsize=(7, 5)) 
# (trump_tweets.analysis.value_counts(normalize=True) * 100).plot.bar(color=colors) 
# plt.ylabel("%age of tweets") 
# plt.title("Distribution of Sentiments towards Trump") 
# plt.show() 

In [None]:
# def word_cloud(wd_list): 
# 	stopwords = set(STOPWORDS) 
# 	all_words = ' '.join() 
# 	wordcloud = WordCloud(background_color='black', 
# 						stopwords=stopwords, 
# 						width=1600, height=800, max_words=100, max_font_size=200, 
# 						colormap="viridis").generate(all_words) 
# 	plt.figure(figsize=(12, 10)) 
# 	plt.axis('off') 
# 	plt.imshow(wordcloud) 

# word_cloud(trump_tweets['cleantext'][:5000]) 

<!-- ## Joe Biden's Sentiment Analysis -->

In [None]:
# biden_tweets = data[data['candidate'] == 'biden'] 
# biden_tweets = biden_tweets.loc[biden_tweets.country == 'US'] 
# biden_tweets = biden_tweets[['tweet']] 
# biden_tweets

In [None]:
# biden_tweets['cleantext']=biden_tweets['tweet'].apply(clean) 
# biden_tweets.head()

In [None]:
# biden_tweets['subjectivity'] = biden_tweets['cleantext'].apply(getsubjectivity) 
# biden_tweets['polarity'] = biden_tweets['cleantext'].apply(getpolarity) 
# biden_tweets['analysis'] = biden_tweets['polarity'].apply(getAnalysis) 
# biden_tweets.head() 

In [None]:
# # how much data is positive/negetive/neutral 
# plt.style.use('dark_background') 

# # Define colors for each bar 
# colors = ['orange', 'green', 'red'] 

# plt.figure(figsize=(7, 5)) 
# (biden_tweets.analysis.value_counts(normalize=True) * 100).plot.bar(color=colors) 
# plt.ylabel("%age of tweets") 
# plt.title("Distribution of Sentiments towards Biden") 
# plt.show() 

In [None]:
# word_cloud(biden_tweets['cleantext'][:5000])

<!-- ## Trump Prediction -->

In [None]:
# Trump_Tweets.analysis.value_counts(normalize=True)*100

<!-- ## Biden Prediction -->

In [None]:
# Biden_Tweets.analysis.value_counts(normalize=True)*100

In [None]:
# Complete the 2016 dataframe 

# Add the 'year' column with the value 2016 for all rows
sm16['year'] = 2016

# Add the 'outcome' column with the value 'Republican' for all rows
sm16['outcome'] = 'Republican'

sm16.head()

In [None]:
# Export the dataframe as CSV file
output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'CSV_Outputs'))

# Ensure the directory exists; create it if it doesn't
os.makedirs(output_dir, exist_ok=True)

# Define the output file path
output_file_path = os.path.join(output_dir, 'sentiment_2016.csv')

# Export the DataFrame to a CSV file
posts_2016_df.to_csv(output_file_path, index=False)

# Optional: Print confirmation
print(f"File saved to {output_file_path}")

In [None]:
# Complete the 2020 dataframe 

# Add the 'year' column with the value 2020 for all rows
sm20['year'] = 2020

# Add the 'outcome' column with the value 'Democrat' for all rows
sm20['outcome'] = 'Democrat'

sm20.head()


In [None]:
# Define the directory one level higher than the current directory
output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'CSV_Outputs'))

# Ensure the directory exists; create it if it doesn't
os.makedirs(output_dir, exist_ok=True)

# Define the output file path
output_file_path = os.path.join(output_dir, 'sentiment_2020.csv')

# Export the DataFrame to a CSV file
posts_2020_df.to_csv(output_file_path, index=False)

# Optional: Print confirmation
print(f"File saved to {output_file_path}")