In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample
import pickle

In [None]:
#  Load your dataset
df = pd.read_csv("Tweets123456.csv")
print("Columns:", df.columns.tolist())

Columns: ['textID', 'text', 'selected_text', 'sentiment', 'usernames', 'userID', 'date']


In [5]:
dataset = pd.read_csv("Tweets123456.csv" , encoding= 'ISO-8859-1')

In [6]:
dataset.head()

Unnamed: 0,textID,text,selected_text,sentiment,usernames,userID,date
0,cb774db0d1,"I`d have responded, if I were going . I'd no...","I`d have responded, if I were going. I'd no ...",neutral,John,1.0,Mon Apr 06 22:19:45 PDT 2009
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,jack,2.0,Mon Apr 06 22:19:53 PDT 2009
2,088c60f138,my boss is bullying me...,bullying me,negative,user123,3.0,Mon Apr 06 22:19:53 PDT 2009
3,9642c003ef,what interview! leave me alone,leave me alone,negative,chrisdoe,4.0,Mon Apr 06 22:19:57 PDT 2009
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,mike89,5.0,Mon Apr 06 22:19:57 PDT 2009


In [None]:
#  Clean the text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove links
    text = re.sub(r'[^a-z\s]', '', text)  # keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)

In [8]:
col_names = ['textId', 'text', 'selected_text', 'sentiment', 'usernames'] 

In [9]:
dataset.shape

(27481, 7)

In [10]:
#checking for missing values
dataset.isnull().sum()

textID               0
text                 1
selected_text        1
sentiment            0
usernames        27432
userID           27430
date             27472
dtype: int64

In [11]:
# Distribution of tweets
dataset['sentiment'].value_counts()

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

In [None]:
#  Encode sentiments (3 labels)
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['sentiment'].str.lower().str.strip()
df = df[df['sentiment'].isin(sentiment_map.keys())]
df['label'] = df['sentiment'].map(sentiment_map)

print("\nLabel distribution before balancing:")
print(df['label'].value_counts())


Label distribution before balancing:
label
1    11118
2     8582
0     7781
Name: count, dtype: int64


In [None]:
#  Balance classes (optional but helps detect neutral better)
max_count = df['label'].value_counts().max()
balanced_df = pd.concat([
    resample(group, replace=True, n_samples=max_count, random_state=42)
    for _, group in df.groupby('label')
])

print("\nLabel distribution after balancing:")
print(balanced_df['label'].value_counts())


Label distribution after balancing:
label
0    11118
1    11118
2    11118
Name: count, dtype: int64


In [None]:
#  Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['clean_text'], balanced_df['label'], test_size=0.2, random_state=42, stratify=balanced_df['label']
)


In [None]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:

#  Train the Model
model = LogisticRegression(max_iter=400)
model.fit(X_train_vec, y_train)

In [None]:
#  Evaluate
y_pred = model.predict(X_test_vec)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


Accuracy: 0.7588067755958627

Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.77      0.78      2223
     neutral       0.67      0.71      0.69      2224
    positive       0.83      0.79      0.81      2224

    accuracy                           0.76      6671
   macro avg       0.76      0.76      0.76      6671
weighted avg       0.76      0.76      0.76      6671



In [None]:
# Save model and vectorizer
pickle.dump(model, open("model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))

print("\n✅ Model and vectorizer saved successfully!")
print("Model classes_:", model.classes_)


✅ Model and vectorizer saved successfully!
Model classes_: [0 1 2]
