# Public Policy Sentiment Analysis — CoWIN Twitter Dataset

In [3]:
df = pd.read_csv('tweets_with_sentiment.csv')
df = df.rename(columns={'created_at': 'timestamp', 'predicted_sentiment_roberta': 'sentiment'})

# Filter first while 'lang' still exists
df = df[df['lang'] == 'en']

# Then keep relevant columns
df = df[['timestamp', 'text', 'sentiment', 'like_count', 'retweet_count']]
df['sentiment'] = df['sentiment'].str.lower()

In [4]:
import pandas as pd
import numpy as np
import re
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder

In [5]:
print("Dataset loaded successfully!")
print(df.shape)
print(df.head())

Dataset loaded successfully!
(470854, 5)
                  timestamp  \
0  2021-01-02T13:49:32.000Z   
1  2021-01-02T13:34:38.000Z   
2  2021-01-02T13:03:35.000Z   
3  2021-01-02T12:28:17.000Z   
4  2021-01-02T12:18:09.000Z   

                                                text sentiment  like_count  \
0  @user @user @user @user Hi, can you please sha...  positive           0   
1  @user Could not find #CoWIN #CowinApp on play ...  negative           0   
2                            @user @user Cowin lush!  positive           2   
3  <U+25B6><U+FE0F> #COVID19 Vaccine Dry Run Held...  positive           3   
4  Covid vaccination India dry run how to registe...  positive           0   

   retweet_count  
0              0  
1              0  
2              0  
3              0  
4              0  


In [6]:
def clean_text(s):
    s = str(s)
    s = re.sub(r'http\\S+', '', s)
    s = re.sub(r'@\\w+', '', s)
    s = re.sub(r'[^A-Za-z\\s]', '', s)  # remove emojis/punctuations
    s = re.sub(r'\\s+', ' ', s).strip()
    return s.lower()

df['clean_text'] = df['text'].astype(str).apply(clean_text)

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['sentiment'], random_state=42)

X_train, y_train = train_df['clean_text'], train_df['sentiment']
X_test, y_test = test_df['clean_text'], test_df['sentiment']

print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")

Training samples: 376683 | Testing samples: 94171


In [8]:
pipe_lr = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=50000, ngram_range=(1,2), min_df=5)),
    ('clf', LogisticRegression(max_iter=1000, solver='saga', C=1.0, class_weight='balanced', random_state=42))
])

pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)

print("\n=== Logistic Regression Report ===")
print(classification_report(y_test, y_pred_lr))

# Save model
os.makedirs('models', exist_ok=True)
with open('models/tfidf_lr.pkl', 'wb') as f:
    pickle.dump(pipe_lr, f)


=== Logistic Regression Report ===
              precision    recall  f1-score   support

    negative       0.30      1.00      0.47     10982
    positive       1.00      0.70      0.82     83189

    accuracy                           0.73     94171
   macro avg       0.65      0.85      0.64     94171
weighted avg       0.92      0.73      0.78     94171



In [None]:
MAX_WORDS = 30000
MAX_LEN = 120

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# Encode labels (convert positive/negative to 1/0)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
mc = ModelCheckpoint('models/lstm_model.h5', monitor='val_loss', save_best_only=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_split=0.1,
    epochs=5,
    batch_size=256,
    callbacks=[es, mc],
    verbose=1
)

y_pred_lstm = (model.predict(X_test_seq) > 0.5).astype(int).flatten()

print("\n=== LSTM Report ===")
print(classification_report(y_test_enc, y_pred_lstm))

# Save tokenizer & label encoder
with open('models/tokenizer_lstm.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)



Epoch 1/5
[1m 939/1325[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m4:47[0m 744ms/step - accuracy: 0.8831 - loss: 0.2769

In [None]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'BiLSTM'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_lr),
        accuracy_score(y_test_enc, y_pred_lstm)
    ],
    'F1': [
        f1_score(y_test, y_pred_lr, pos_label='positive', average='binary'),
        f1_score(y_test_enc, y_pred_lstm)
    ]
})

print("\n=== Model Comparison ===")
print(results)

results.to_csv('models/model_results.csv', index=False)

In [None]:
plt.figure(figsize=(6,4))
sns.barplot(x='Model', y='Accuracy', data=results)
plt.title('Model Accuracy Comparison')
plt.show()