In [2]:
# from textblob import TextBlob
from io import StringIO
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KUNAL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KUNAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
with open('train.txt/train.txt', 'r') as file:
        data = file.read()

In [16]:
data_io = StringIO(data)

In [17]:
texts = []
emotions = []

for line in data_io:
    line = line.strip()  
    if line:  
        parts = line.split(';')
        if len(parts) == 2:
            texts.append(parts[0])
            emotions.append(parts[1])
        else:
            print(f"Warning: Skipping malformed line - '{line}'")

In [18]:
df = pd.DataFrame({'text': texts, 'emotion': emotions})

In [19]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [20]:
emotion_counts = df['emotion'].value_counts()
print(emotion_counts)

emotion
joy           5362
sadness       4666
anger         2159
fear          1937
love          1304
suicidal      1025
depression    1000
surprise       967
Name: count, dtype: int64


In [21]:
grouped = df.groupby('emotion')

limited_df = pd.DataFrame()

for emotion, group in grouped:
    limited_group = group.head(950)
    limited_df = pd.concat([limited_df, limited_group])

limited_df = limited_df.reset_index(drop=True)

limited_df.head()

Unnamed: 0,text,emotion
0,im grabbing a minute to post i feel greedy wrong,anger
1,i am feeling grouchy,anger
2,i think it s the easiest time of year to feel ...,anger
3,i feel irritated and rejected without anyone d...,anger
4,i already feel like i fucked up though because...,anger


In [22]:
emotion_counts = limited_df['emotion'].value_counts()
print(emotion_counts)

emotion
anger         950
depression    950
fear          950
joy           950
love          950
sadness       950
suicidal      950
surprise      950
Name: count, dtype: int64


In [23]:
print(df)
print(df['emotion'].value_counts())

                                                    text   emotion
0                                i didnt feel humiliated   sadness
1      i can go from feeling so hopeless to so damned...   sadness
2       im grabbing a minute to post i feel greedy wrong     anger
3      i am ever feeling nostalgic about the fireplac...      love
4                                   i am feeling grouchy     anger
...                                                  ...       ...
18415  I thought it would be harder, but it was much ...  surprise
18416     I didnâ€™t think Iâ€™d be able to achieve this  surprise
18417  I didnâ€™t think Iâ€™d be able to finish this ...  surprise
18418  I thought it was going to be a nightmare, but ...  surprise
18419  I was so surprised by how easily everything ca...  surprise

[18420 rows x 2 columns]
emotion
joy           5362
sadness       4666
anger         2159
fear          1937
love          1304
suicidal      1025
depression    1000
surprise       967
Name: coun

In [24]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # lowercase the text and tokenize it
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    
    # Rejoin tokens back to string
    return ' '.join(tokens)

df['preprocessed_text'] = df['text'].apply(preprocess_text)

In [25]:
df.head()

Unnamed: 0,text,emotion,preprocessed_text
0,i didnt feel humiliated,sadness,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,feeling grouchy


In [26]:
X = df['preprocessed_text']
y = df['emotion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [27]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(14736,) (14736,) (3684,) (3684,)


In [28]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [31]:
param_grids = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200]
    },
    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    "Multinomial Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    }
}

In [30]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Multinomial Naive Bayes": MultinomialNB()
}
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring='accuracy', n_jobs=-1)
    
    grid_search.fit(X_train_tfidf, y_train)  # Train with grid search

    # Get the best model and its parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Predict on training and test sets
    y_train_pred = best_model.predict(X_train_tfidf)
    y_test_pred = best_model.predict(X_test_tfidf)

    # Print the best parameters and accuracies
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    print(f"Best Parameters: {best_params}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")
    print("="*50)

Evaluating Logistic Regression...


NameError: name 'param_grids' is not defined

In [31]:
final_model = SVC(C=1, gamma='scale', kernel='linear')
final_model.fit(X_train_tfidf, y_train)

y_train_pred = final_model.predict(X_train_tfidf)
y_test_pred = final_model.predict(X_test_tfidf)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report

print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 0.9681
Test Accuracy: 0.8931
Classification Report:
              precision    recall  f1-score   support

       anger       0.89      0.85      0.87       432
  depression       0.96      1.00      0.98       200
        fear       0.84      0.86      0.85       388
         joy       0.87      0.93      0.90      1072
        love       0.80      0.76      0.78       261
     sadness       0.93      0.90      0.92       933
    suicidal       1.00      0.99      0.99       205
    surprise       0.90      0.77      0.83       193

    accuracy                           0.89      3684
   macro avg       0.90      0.88      0.89      3684
weighted avg       0.89      0.89      0.89      3684



In [40]:
def predict_emotion(text):
    processed_text = preprocess_text(text)
    text_tfidf = vectorizer.transform([processed_text])
    return final_model.predict(text_tfidf)

# Test with a new entry
new_entry = ["I want to kill myself", "I am so happy", "I dont know what to do with my life anymore", "I did not expect you here", "I feel hopeless", "I wanna kill myself"]
for i in new_entry:
    print(predict_emotion(i))

['joy']
['joy']
['sadness']
['surprise']
['depression']
['joy']


In [100]:
import pickle
with open('emotion_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model saved.")

Model saved.
