<center>
    
# Twitter Data Sentiment Analysis

**Student Name: _____________________________**

</center>

In [1]:
# Cell 1: Import necessary libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
import dash
from dash import dcc, html, Input, Output
import plotly.express as px
from pymongo import MongoClient

### Reading the ProjectTweets.csv file

In [2]:
data = pd.read_csv('ProjectTweets.csv');

### Data Cleaning

In [3]:
def data_cleaning():
    # Loading the dataset
    data = pd.read_csv("ProjectTweets.csv")
    print(data.head())

    # Handle Missing Values
    missing_values = data.isnull().sum()
    print("Missing Values:\n", missing_values)

    # Handling Duplicates
    data.drop_duplicates(inplace=True)

    # Save the Cleaned Dataset
    data.to_csv("CleanedProjectTweets.csv", index=False)

    # display summary statistics or other exploratory data analysis
    print(data.describe())

data_cleaning()

   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  1  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   scotthamilton   
1  2  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY        mattycus   
2  3  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         ElleCTF   
3  4  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          Karoli   
4  5  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY        joy_wolf   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times for the ball. Man...                                                                   
2    my whole body feels itchy and like its on fire                                                                    
3  @nationwideclass no, it's not behaving at all....           

### Performing Sentiment Analysis on the cleaned data 

In [4]:
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()
df = pd.read_csv("CleanedProjectTweets.csv", header=None)

def calculate_sentiment(text):
    sentiment_scores = sia.polarity_scores(str(text))
    sentiment = ""
    if sentiment_scores["compound"] >= 0.05:
        sentiment = "positive"
    elif sentiment_scores["compound"] <= -0.05:
        sentiment = "negative"
    else:
        sentiment = "neutral"
    return sentiment

tweet_column_index = 5
df["sentiment"] = df.iloc[:, tweet_column_index].apply(calculate_sentiment)
df.to_csv("sentiment_analysis_results.csv", index=False)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Forecasting

In [5]:
df = pd.read_csv('CleanedProjectTweets.csv', header=None)
df.columns = ['index', 'timestamp', 'date', 'query', 'username', 'tweet']
df['y'] = df['tweet'].str.len()
print(f"The shape of the DataFrame is: {df.shape}")

if 0 < len(df) < 1e6:
    df_sample = df.sample(frac=0.1)
    model_arima = ARIMA(df_sample['y'], order=(5, 1, 0))
    model_fit_arima = model_arima.fit()
    forecast_arima_week = model_fit_arima.forecast(steps=7)
    forecast_arima_month = model_fit_arima.forecast(steps=30)
    forecast_arima_3months = model_fit_arima.forecast(steps=90)

if len(df) > 0:
    X = np.array(range(len(df))).reshape(-1, 1)
    y = df['y']
    model_rf = RandomForestRegressor(n_estimators=100)
    model_rf.fit(X, y)
    future_week = np.array(range(len(df), len(df) + 7)).reshape(-1, 1)
    future_month = np.array(range(len(df), len(df) + 30)).reshape(-1, 1)
    future_3months = np.array(range(len(df), len(df) + 90)).reshape(-1, 1)
    forecast_rf_week = model_rf.predict(future_week)
    forecast_rf_month = model_rf.predict(future_month)
    forecast_rf_3months = model_rf.predict(future_3months)
else:
    print("The DataFrame is empty.")

plt.figure(figsize=(12, 6))

if 'forecast_arima_week' in locals():
    plt.subplot(1, 2, 1)
    plt.title('ARIMA Forecast')
    plt.plot(forecast_arima_3months, label='3 Months')
    plt.plot(forecast_arima_month, label='1 Month')
    plt.plot(forecast_arima_week, label='1 Week')
    plt.legend()

if 'forecast_rf_week' in locals():
    plt.subplot(1, 2, 2)
    plt.title('Random Forest Forecast')
    plt.plot(forecast_rf_3months, label='3 Months')
    plt.plot(forecast_rf_month, label='1 Month')
    plt.plot(forecast_rf_week, label='1 Week')
    plt.legend()

plt.show()

The shape of the DataFrame is: (1600000, 7)


MemoryError: could not allocate 134217728 bytes


### Create the dynamic dashboard

In [None]:
app = dash.Dash(__name__)

df = pd.read_csv('sentiment_analysis_results.csv')
df.columns = ['row_id', 'id', 'timestamp', 'query', 'user', 'text', 'sentiment']
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df_count = df.groupby([pd.Grouper(key='timestamp', freq='H'), 'sentiment']).size().reset_index(name='count')

app.layout = html.Div([
    html.H1("Tweet Sentiments Over Time"),

    dcc.Dropdown(
        id='sentiment-dropdown',
        options=[
            {'label': 'Positive', 'value': 'positive'},
            {'label': 'Negative', 'value': 'negative'},
            {'label': 'Neutral', 'value': 'neutral'},
            {'label': 'All', 'value': 'all'}
        ],
        value='all',  # Default value
        multi=False
    ),

    dcc.Graph(id='line-plot'),
])

@app.callback(
    Output('line-plot', 'figure'),
    [Input('sentiment-dropdown', 'value')]
)
def update_graph(selected_sentiment):
    if selected_sentiment == 'all':
        fig = px.line(df_count, x='timestamp', y='count', color='sentiment', title='All Sentiments Over Time')
    else:
        filtered_df = df_count[df_count['sentiment'] == selected_sentiment]
        fig = px.line(filtered_df, x='timestamp', y='count', title=f'{selected_sentiment.capitalize()} Sentiments Over Time')

    return fig

if __name__ == "__main__":
    app.run_server(debug=True)

### Saving the sentiment analysis results to MongoDB

In [None]:
# Define the MongoDB connection
client = MongoClient('localhost', 27017)
db = client['sentiment_analysis_db']
collection = db['sentiment_results']

# Read the sentiment analysis results from the CSV file
sentiment_results = pd.read_csv('sentiment_analysis_results.csv')

# Convert the dataframe to a dictionary
data_dict = sentiment_results.to_dict("records")

# Insert the data into the MongoDB collection
collection.insert_many(data_dict)

# Display success message
print("Sentiment analysis results successfully saved to MongoDB.")
