In [46]:
import pandas as pd 
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import re

In [47]:
# Load the raw tweet export; later cells read its 'tweet' column.
data = pd.read_csv(filepath_or_buffer='Tesla.csv')

In [48]:
# Results frame: one row per tweet. Only 'original_tweet' carries real data here;
# the remaining columns are placeholders ("" for text, 0.0 for scores) that the
# preprocessing and scoring cells overwrite.
cleaned_data = pd.DataFrame({'original_tweet': data['tweet']}).assign(
    cleaned_tweet="",
    sentiment="",
    compound_score=0.0,
    positive_score=0.0,
    negative_score=0.0,
    neutral_score=0.0,
)

In [50]:
#Preprocess our data to be a valid input for VADER
# --> VADER scores punctuation emphasis ("!!!"), emoticons (":)"), capitalization, slang, and
#     negations, so those are deliberately LEFT IN the text. Only Twitter-specific noise
#     (URLs, @mentions, the '#' marker, digits, surplus whitespace) is removed.


def clean_tweet(tweet):
    """Return `tweet` stripped of Twitter noise but with VADER's sentiment cues intact.

    Parameters
    ----------
    tweet : object
        Raw cell value from the 'tweet' column. Anything that is not already a
        string is converted with ``str``; missing values (NaN / None / pd.NA)
        are treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text. Missing input yields "" rather than the literal
        string "nan", so it is not scored as if it were a real word.
    """
    if pd.isna(tweet):
        return ""

    text = str(tweet)

    #remove urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    #remove @'s
    text = re.sub(r'@\w+', '', text)

    #remove #'s (keep the hashtag word itself, it may carry sentiment)
    text = re.sub(r'#', '', text)

    # NOTE: punctuation is intentionally NOT stripped here. A blanket
    # r'[^\w\s]' removal deletes exclamation marks and emoticons, which
    # VADER uses when computing its scores.

    #remove nums
    text = re.sub(r'\d+', '', text)

    #remove extra spaces: collapse the gaps left by the removals above, then trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text


cleaned_tweets = [clean_tweet(tweet) for tweet in data["tweet"]]

cleaned_data["cleaned_tweet"] = cleaned_tweets

In [51]:
#create a VADER sentiment analyzer:
try:
    sentiment_analyzer = SentimentIntensityAnalyzer()
except LookupError:
    # The VADER lexicon is a separate NLTK data package; on a fresh environment
    # the constructor cannot find it. Download it once and retry.
    nltk.download("vader_lexicon")
    sentiment_analyzer = SentimentIntensityAnalyzer()

# Get the sentiment/polarity score of each tweet.
# polarity_scores returns a dict with "pos", "neg", "neu" and "compound" entries for one
# tweet; collecting all of them into a single frame lets the result columns be written in
# one assignment each, instead of one slow .loc write per row per column.
polarity = pd.DataFrame(
    [sentiment_analyzer.polarity_scores(tweet) for tweet in cleaned_data["cleaned_tweet"]],
    columns=["compound", "pos", "neg", "neu"],
    dtype=float,
)

# Assign by position (.to_numpy()) so the result does not depend on cleaned_data
# having a default 0..n-1 index.
cleaned_data["compound_score"] = polarity["compound"].to_numpy()
cleaned_data["positive_score"] = polarity["pos"].to_numpy()
cleaned_data["negative_score"] = polarity["neg"].to_numpy()
cleaned_data["neutral_score"] = polarity["neu"].to_numpy()

# Determine sentiment category from the compound score:
#   >= 0.05 -> Positive, <= -0.05 -> Negative, anything in between -> Neutral
compound = polarity["compound"].to_numpy()
cleaned_data["sentiment"] = np.select(
    [compound >= 0.05, compound <= -0.05],
    ["Positive", "Negative"],
    default="Neutral",
)


In [52]:
# TESTING: ad-hoc sanity checks on cleaned_data -- uncomment a block below to run it.


# print("Total number of tweets:", len(cleaned_data))
# print("\nSentiment Distribution:")
# print(cleaned_data['sentiment'].value_counts())
# print("\nAverage Scores:")
# print("Compound:", cleaned_data['compound_score'].mean().round(3))
# print("Positive:", cleaned_data['positive_score'].mean().round(3))
# print("Negative:", cleaned_data['negative_score'].mean().round(3))
# print("Neutral:", cleaned_data['neutral_score'].mean().round(3))



# Look at first 10 rows of cleaned_data dataframe (which stores our results):
# print(cleaned_data.head(10))



# print("Number of columns:", cleaned_data.shape[1])



# print("\nColumn names:")
# for i, column in enumerate(cleaned_data.columns, 1):
#     print(f"{i}. {column}")



Number of columns: 7

Column names:
1. original_tweet
2. cleaned_tweet
3. sentiment
4. compound_score
5. positive_score
6. negative_score
7. neutral_score
