In [None]:
import pandas as pd
import numpy as np

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
data = pd.read_csv("/content/tweets.csv", encoding = "latin1")
df = data.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,Likes,Retweets
0,0,2023-03-29 15:42:36+00:00,AnandPatni8,@vinodkapri @RahulGandhi Respected Indian Citi...,0.0,0.0
1,1,2023-03-29 15:42:05+00:00,dhinamum,"*Respected Indian Citizens,* Namaskaar I Am Th...",0.0,0.0
2,2,2023-03-29 15:34:29+00:00,PrincetonCGI,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,0.0,0.0
3,3,2023-03-29 15:31:43+00:00,RishiJoeSanu,@MrinalWahal Why would politicians stop using ...,0.0,0.0
4,4,2023-03-29 15:26:48+00:00,itweetsensee,@annamalai_k @narendramodi A state level presi...,0.0,0.0


In [None]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)

print()

#Checking Duplicates
df.duplicated().sum()

print()
#Checking Data Types
df.info()

Unnamed: 0    0
Date          0
User          0
Tweet         1
Likes         2
Retweets      2
dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50001 entries, 0 to 50000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  50001 non-null  object 
 1   Date        50001 non-null  object 
 2   User        50001 non-null  object 
 3   Tweet       50000 non-null  object 
 4   Likes       49999 non-null  float64
 5   Retweets    49999 non-null  float64
dtypes: float64(2), object(4)
memory usage: 2.3+ MB


In [None]:
#Removing missing values
df = df.dropna()
df.isnull().sum()

Unnamed: 0    0
Date          0
User          0
Tweet         0
Likes         0
Retweets      0
dtype: int64

In [None]:
# Function to clean and preprocess text
def preprocess_text(text):
    if isinstance(text, str):
        # Removing special characters, URLs, and mentions
        text = re.sub(r"http\S+|@\S+|[^A-Za-z0-9]+", " ", text)

        # Converting to lowercase
        text = text.lower()

        # Removing stopwords
        stop_words = set(stopwords.words("english"))
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in stop_words]

        return " ".join(filtered_text)
    else:
        return ""

# Applying the preprocessing function to the 'Tweet' column
df['Cleaned_Tweet'] = df['Tweet'].apply(preprocess_text)

In [None]:
# Initializing the VADER sentiment analyzer (Valence Aware Dictionary and Sentiment Reasoner)
sia = SentimentIntensityAnalyzer()

# Function to get sentiment scores
def get_sentiment_score(text_to_score):
    return sia.polarity_scores(text_to_score)['compound']

# Applying the sentiment analysis to the 'Cleaned_Tweet' column
df['Sentiment_Score'] = df['Cleaned_Tweet'].apply(get_sentiment_score)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,Likes,Retweets,Cleaned_Tweet,Sentiment_Score
0,0,2023-03-29 15:42:36+00:00,AnandPatni8,@vinodkapri @RahulGandhi Respected Indian Citi...,0.0,0.0,rahulgandhi respected indian citizens namaskaa...,0.6705
1,1,2023-03-29 15:42:05+00:00,dhinamum,"*Respected Indian Citizens,* Namaskaar I Am Th...",0.0,0.0,respected indian citizens namaskaar original g...,0.6705
2,2,2023-03-29 15:34:29+00:00,PrincetonCGI,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,0.0,0.0,1 n meet filmmaker prakash jha new jersey talk...,0.5267
3,3,2023-03-29 15:31:43+00:00,RishiJoeSanu,@MrinalWahal Why would politicians stop using ...,0.0,0.0,would politicians stop using religion politics...,-0.296
4,4,2023-03-29 15:26:48+00:00,itweetsensee,@annamalai_k @narendramodi A state level presi...,0.0,0.0,narendramodi state level president knows polic...,-0.6369


In [None]:
# Categorizing sentiments based on the compound score
df['Sentiment'] = df['Sentiment_Score'].apply(lambda score: 'Positive' if score > 0 else ('Negative' if score < 0 else 'Neutral'))
df.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,Likes,Retweets,Cleaned_Tweet,Sentiment_Score,Sentiment
0,0,2023-03-29 15:42:36+00:00,AnandPatni8,@vinodkapri @RahulGandhi Respected Indian Citi...,0.0,0.0,rahulgandhi respected indian citizens namaskaa...,0.6705,Positive
1,1,2023-03-29 15:42:05+00:00,dhinamum,"*Respected Indian Citizens,* Namaskaar I Am Th...",0.0,0.0,respected indian citizens namaskaar original g...,0.6705,Positive
2,2,2023-03-29 15:34:29+00:00,PrincetonCGI,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,0.0,0.0,1 n meet filmmaker prakash jha new jersey talk...,0.5267,Positive
3,3,2023-03-29 15:31:43+00:00,RishiJoeSanu,@MrinalWahal Why would politicians stop using ...,0.0,0.0,would politicians stop using religion politics...,-0.296,Negative
4,4,2023-03-29 15:26:48+00:00,itweetsensee,@annamalai_k @narendramodi A state level presi...,0.0,0.0,narendramodi state level president knows polic...,-0.6369,Negative


In [None]:
# df.to_csv('tweets_cleaned.csv', index=False)

In [None]:
# Displaying positive and negative tweets
positive_tweets = df[df['Sentiment'] == 'Positive']['Tweet']
negative_tweets = df[df['Sentiment'] == 'Negative']['Tweet']
neutral_tweets = df[df['Sentiment'] == 'Neutral']['Tweet']

print("Positive Tweets:")
print(positive_tweets)

print("\nNegative Tweets:")
print(negative_tweets)

print("\nNeutral Tweets:")
print(neutral_tweets)


Positive Tweets:
0        @vinodkapri @RahulGandhi Respected Indian Citi...
1        *Respected Indian Citizens,* Namaskaar I Am Th...
2        1/n-Meet Filmmaker Prakash Jha in New Jersey t...
11       As someone who is passionate about Indian poli...
13       AmitShah News18 Rising India never fails to pr...
                               ...                        
49987    Rajneethi celebrates 10 years of Excellence in...
49993    @ashajadeja325 Intellectuals can get printed a...
49996    @kaushikcbasu Cong ruled for 60 +yrs at center...
49998    @Schandillia A true leader puts the team in fr...
49999    Now' Rahul Gandhi is Mahanayak of\nINDIA. Rega...
Name: Tweet, Length: 24832, dtype: object

Negative Tweets:
3        @MrinalWahal Why would politicians stop using ...
4        @annamalai_k @narendramodi A state level presi...
5        @IAMCouncil @POTUS @Ilhan @bridgeinit OK this ...
6        @darrengrimes_ Why are pakistani /indian dual ...
7        @PeterStefanovi2 Why are paki

In [None]:
np.random.seed(0)

train_data, test_data, train_labels, test_labels = train_test_split(
    df['Cleaned_Tweet'], df['Sentiment'], test_size=0.2, random_state=42
)

# Tokenizing the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data)

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# Padding sequences
max_len = 50
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')



In [None]:
np.random.seed(0)

# OHL Encoding
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

train_labels_onehot = to_categorical(train_labels_encoded, num_classes=3)
test_labels_onehot = to_categorical(test_labels_encoded, num_classes=3)


In [None]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))  # 3 classes: Positive, Negative, Neutral

optimizer = Adam(learning_rate=1e-4)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Training the model
batch_size = 32
epochs = 20
model.fit(train_padded, train_labels_onehot, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 33ms/step - accuracy: 0.4999 - loss: 1.0076 - val_accuracy: 0.5830 - val_loss: 0.8257
Epoch 2/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 36ms/step - accuracy: 0.6143 - loss: 0.7762 - val_accuracy: 0.6579 - val_loss: 0.7254
Epoch 3/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 35ms/step - accuracy: 0.7010 - loss: 0.6574 - val_accuracy: 0.6948 - val_loss: 0.6791
Epoch 4/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 36ms/step - accuracy: 0.7635 - loss: 0.5674 - val_accuracy: 0.7220 - val_loss: 0.6465
Epoch 5/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 37ms/step - accuracy: 0.8029 - loss: 0.4843 - val_accuracy: 0.7574 - val_loss: 0.6068
Epoch 6/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.8433 - loss: 0.4131 - val_accuracy: 0.7680 - val_loss: 0.5972
Epoc

<keras.src.callbacks.history.History at 0x146b59c9cd0>

The performance of machine learning models can be influenced by various factors, and improvements in performance might be attributed to changes in model architecture, data preprocessing, or hyperparameter tuning. The reasons to why the modified model might be performing better:

1. **Model Architecture:**
   - The modified model uses a different architecture with an additional dense layer with 3 units (one for each sentiment class) and a softmax activation function. This architecture is more suitable for multi-class classification compared to a binary classification problem.

2. **Loss Function and Activation Function:**
   - The use of `categorical_crossentropy` loss and `softmax` activation in the output layer is appropriate for multi-class classification problems. This might contribute to better training for sentiment classes.

3. **Label Encoding:**
   - In the modified model, labels are encoded using one-hot encoding (`to_categorical`). This ensures that the model understands the categorical nature of the target variable, which can be important for improving performance in multi-class scenarios.

4. **Data Preprocessing:**
   - Tokenization and padding might be better configured in the modified model, leading to improved representation of the input text data.

5. **Hyperparameter Tuning:**
   - The modified model may have more suitable hyperparameters for the specific sentiment classification task. The learning rate, dropout rates, and other hyperparameters can significantly impact model performance.

6. **Randomness in Initialization:**
   - Neural networks are sensitive to weight initialization. It's possible that the random initialization of weights in the modified model led to better convergence during training.

7. **Training Duration:**
   - The number of epochs and batch size can influence model training. The modified model may have been trained for an optimal number of epochs and batch size.

8. **Data Splitting:**
   - The way the data is split into training and testing sets can also impact model performance. The randomness in the splitting might lead to different training and testing datasets.

It's important to note that the effectiveness of different models can vary based on the characteristics of the data and the task at hand. Experimenting with different architectures, hyperparameters, and preprocessing techniques is a common approach in machine learning to find the best-performing model for a specific task.

In [None]:
# Evaluating the model on the test set
predictions_onehot = model.predict(test_padded)
predictions_labels = label_encoder.inverse_transform(predictions_onehot.argmax(axis=1))
predictions_labels

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step


array(['Negative', 'Positive', 'Neutral', ..., 'Positive', 'Negative',
       'Positive'], dtype=object)

In [None]:
# Evaluating the model
accuracy = accuracy_score(test_labels, predictions_labels)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(test_labels, predictions_labels))

Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

    Negative       0.78      0.82      0.80      3287
     Neutral       0.81      0.80      0.80      1695
    Positive       0.90      0.87      0.88      5018

    accuracy                           0.84     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.84      0.84      0.84     10000



In [None]:
# Example of making predictions on a new tweet
new_tweet = "Our government should have done a thorough study before implementing demonitisation. It was a useless plan."
new_tweet_sequence = tokenizer.texts_to_sequences([new_tweet])
new_tweet_padded = pad_sequences(new_tweet_sequence, maxlen=max_len, padding='post')

# Making predictions on the new tweet
new_tweet_prediction_onehot = model.predict(new_tweet_padded)
new_tweet_prediction_label = label_encoder.inverse_transform(new_tweet_prediction_onehot.argmax(axis=1))

print("Prediction for the New Tweet:")
print(f"Predicted Label: {new_tweet_prediction_label[0]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Prediction for the New Tweet:
Predicted Label: Negative
