In [1]:
from google.colab import files
uploaded = files.upload()


Saving train.csv to train.csv


In [13]:
# 📌 Step 1: Install dependencies (if not already installed)
!pip install nltk scikit-learn

# 📌 Step 2: Import libraries
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pickle




In [28]:

# 📌 Step 3: Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# 📌 Step 4: Load your dataset
df = pd.read_csv("train.csv")  # uploaded via Colab
df = df[['tweet', 'label']]  # If column names differ, adjust here
df.columns = ['text', 'sentiment']
df.dropna(inplace=True)

# 📌 Step 5: Clean and preprocess tweets
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = re.sub(r"http\S+|@\w+|#\w+", "", text)  # remove URLs, mentions, hashtags
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(preprocess)

# 📌 Step 6: Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_text']).toarray()
y = df['sentiment']

# 📌 Step 7: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 📌 Step 8: Train Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 📌 Step 9: Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 📌 Step 10: Save model and vectorizer
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("✅ Model and vectorizer saved!")

# 📌 Predict sentiment on test set and save to CSV
df_test = df.sample(100)  # Or however many you'd like
df_test['clean_text'] = df_test['text'].apply(preprocess)
X_test = tfidf.transform(df_test['clean_text'])
df_test['predicted_sentiment'] = model.predict(X_test)

# Save as CSV
df_test[['text', 'predicted_sentiment']].to_csv("test_data.csv", index=False)
print("✅ test_data.csv saved!")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Accuracy: 0.9418113561708118
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      5937
           1       0.86      0.22      0.35       456

    accuracy                           0.94      6393
   macro avg       0.90      0.61      0.66      6393
weighted avg       0.94      0.94      0.93      6393

✅ Model and vectorizer saved!
✅ test_data.csv saved!


In [30]:
with open("requirements.txt", "w") as f:
    f.write("nltk\nscikit-learn\npandas\n")
