# Public Policy Sentiment Analysis — CoWIN Twitter Dataset

In [None]:
import pandas as pd
import numpy as np
import re
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Load dataset
# Read the pre-processed tweet table; `timestamp` is parsed as a date column.
df = pd.read_csv(
    'data/cowin_processed.csv',
    parse_dates=['timestamp'],
)
# Quick sanity check: (rows, columns).
print(df.shape)

# Clean text
def clean_text(s):
    """Normalize a raw tweet for the downstream vectorizer/tokenizer.

    Steps, in order: lowercase, remove URLs, remove @mentions, collapse
    every whitespace run to a single space, and trim both ends.

    Args:
        s: The raw text. Non-string values are coerced with ``str``.
            ``None`` and float NaN are treated as empty text instead of
            being turned into the literal tokens ``"none"`` / ``"nan"``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # `s != s` is true only for NaN, so no extra import is needed here.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    # Lowercase *before* matching so "HTTP://..." / "Https://..." links are
    # stripped too; the URL pattern itself is case-sensitive.
    s = str(s).lower()
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'@\w+', '', s)
    return re.sub(r'\s+', ' ', s).strip()

# Replace missing tweets with '' *before* the string cast: a bare
# `astype(str)` turns NaN into the literal text "nan", which would then be
# vectorized as if it were a real word.
df['clean_text'] = df['text'].fillna('').astype(str).apply(clean_text)

# Split
# 80/20 hold-out, stratified on the label column; the fixed seed keeps the
# partition identical across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['sentiment_label'],
    random_state=42,
)
X_train, y_train = train_df['clean_text'], train_df['sentiment_label']
X_test, y_test = test_df['clean_text'], test_df['sentiment_label']

# Logistic Regression
# Baseline: uni-/bi-gram TF-IDF features fed to a class-weighted logistic
# regression, wrapped in one Pipeline so it is fitted and saved as a unit.
tfidf_step = TfidfVectorizer(max_features=50000, ngram_range=(1, 2), min_df=5)
lr_step = LogisticRegression(
    max_iter=1000,
    solver='saga',
    C=1.0,
    class_weight='balanced',
    # The saga solver shuffles the data stochastically; pin the seed so the
    # fitted coefficients (and reported metrics) are reproducible.
    random_state=42,
)
pipe_lr = Pipeline([('tfidf', tfidf_step), ('clf', lr_step)])
pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
os.makedirs('models', exist_ok=True)
with open('models/tfidf_lr.pkl', 'wb') as f:
    pickle.dump(pipe_lr, f)

# LSTM
from sklearn.preprocessing import LabelEncoder

MAX_WORDS = 30000  # vocabulary cap shared by the tokenizer and the embedding table
MAX_LEN = 120      # every sequence is padded/truncated to this many tokens

# Fit the vocabulary on the training texts only; words outside it are mapped
# to the <OOV> token at transform time.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# Encode labels as integer ids 0..n_classes-1.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
n_classes = len(le.classes_)

# A single sigmoid unit with binary cross-entropy is only valid for two
# classes. With three or more labels (e.g. negative/neutral/positive) switch
# to a softmax head and a sparse categorical loss instead of silently
# training and thresholding a binary model on multi-class ids.
is_binary = n_classes <= 2
if is_binary:
    output_layer = Dense(1, activation='sigmoid')
    loss_name = 'binary_crossentropy'
else:
    output_layer = Dense(n_classes, activation='softmax')
    loss_name = 'sparse_categorical_crossentropy'

model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),
    Bidirectional(LSTM(128)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    output_layer,
])
model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])

# The checkpoint callback writes into models/, so make sure it exists even
# when this section is run on its own.
os.makedirs('models', exist_ok=True)
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
mc = ModelCheckpoint('models/lstm_model.h5', monitor='val_loss', save_best_only=True)
model.fit(
    X_train_seq,
    y_train_enc,
    validation_split=0.1,
    epochs=5,
    batch_size=256,
    callbacks=[es, mc],
)

# Turn model outputs into class ids: threshold the sigmoid output in the
# binary case, take the arg-max over the softmax output otherwise.
probs = model.predict(X_test_seq)
if is_binary:
    y_pred_lstm = (probs > 0.5).astype(int).flatten()
else:
    y_pred_lstm = probs.argmax(axis=1)
print(classification_report(y_test_enc, y_pred_lstm))

# Save the preprocessing objects needed to reuse the model at inference time.
with open('models/tokenizer_lstm.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
# Side-by-side test-set accuracy of both models, printed and written to CSV.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lstm_accuracy = accuracy_score(y_test_enc, y_pred_lstm)
results = pd.DataFrame(
    {
        'Model': ['Logistic Regression', 'LSTM'],
        'Accuracy': [lr_accuracy, lstm_accuracy],
    }
)
print(results)
results.to_csv('models/model_results.csv', index=False)