#Emotion Classification from Tweets using SVM

In [33]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download("punkt_tab")
nltk.download("punkt")
nltk.download("wordnet")
import re
from nltk.tokenize import word_tokenize
from sklearn.svm import LinearSVC
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline

import joblib
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


dataset from kaggle

In [34]:

df=pd.read_csv("/content/emotions.csv")
df

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
416804,i feel like telling these horny devils to find...,2
416805,i began to realize that when i was feeling agi...,3
416806,i feel very curious be why previous early dawn...,5
416807,i feel that becuase of the tyranical nature of...,3


In [35]:

df.isnull().sum()
df.duplicated().sum()

np.int64(686)

#preprocessing

In [36]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()
df

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
416804,i feel like telling these horny devils to find...,2
416805,i began to realize that when i was feeling agi...,3
416806,i feel very curious be why previous early dawn...,5
416807,i feel that becuase of the tyranical nature of...,3


In [17]:
df.dropna(inplace=True)
df

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4.0
1,ive enjoyed being able to slouch about relax a...,0.0
2,i gave up my internship with the dmrg and am f...,4.0
3,i dont know i feel so lost,0.0
4,i am a kindergarten teacher and i am thoroughl...,4.0
...,...,...
31037,im sure im not the only one who feels this way...,4.0
31038,i feel kind of spacey and rude,3.0
31039,i feel it worthwhile sharing this as there has...,1.0
31040,i feel like the bulletin boards in school heap...,0.0


In [37]:
df=df.drop(columns=["Unnamed: 0"],errors="ignore")
df

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
416804,i feel like telling these horny devils to find...,2
416805,i began to realize that when i was feeling agi...,3
416806,i feel very curious be why previous early dawn...,5
416807,i feel that becuase of the tyranical nature of...,3


In [38]:
def clean_text(text):
    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove special characters, digits, and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

In [40]:
# Function for tokenization and lemmatization
def preprocess_text(text):
    # Clean the text
    text = clean_text(text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

In [41]:
df.dropna(inplace=True)
df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [42]:

df['cleaned_text'] = df['text'].apply(preprocess_text)

In [43]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:

# Train LinearSVC model
model = LinearSVC()
model.fit(X_train, y_train)

# Evaluate model

In [46]:


# Make predictions
y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# pd.DataFrame(classification_report)

Accuracy: 0.8883027936317213

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94     24121
           1       0.91      0.92      0.91     28220
           2       0.76      0.75      0.76      6824
           3       0.90      0.89      0.89     11448
           4       0.83      0.84      0.83      9574
           5       0.70      0.71      0.71      3038

    accuracy                           0.89     83225
   macro avg       0.84      0.84      0.84     83225
weighted avg       0.89      0.89      0.89     83225



In [47]:

joblib.dump(model, 'svc_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [48]:


# Load the saved model and vectorizer
model = joblib.load('svc_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Create a pipeline
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model)
])
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']

In [49]:

# Label to emotion mapping
label_to_emotion = {
    0: "sad",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"}

# prompt: code to evaluate ths with an  unseen data

In [50]:


# Evaluate with unseen data
unseen_data = ["This is a new happy sentence.", "I am feeling very sad today.", "That movie was surprisingly good!"]

# Preprocess unseen data
preprocessed_unseen_data = [preprocess_text(text) for text in unseen_data]

# Use the loaded pipeline to predict on preprocessed unseen data
unseen_predictions = pipeline.predict(preprocessed_unseen_data)

# Print predictions with corresponding emotions
print("\nPredictions on unseen data:")
for text, prediction in zip(unseen_data, unseen_predictions):
    predicted_emotion = label_to_emotion[prediction]
    print(f"Text: '{text}' -> Predicted Emotion: {predicted_emotion}")


Predictions on unseen data:
Text: 'This is a new happy sentence.' -> Predicted Emotion: joy
Text: 'I am feeling very sad today.' -> Predicted Emotion: sad
Text: 'That movie was surprisingly good!' -> Predicted Emotion: joy
