In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Training data: one tab-separated file per emotion, all read with the same
# explicitly supplied column names.
train_columns = ['index', 'tweet', 'Emotion', 'score']
anger_train = pd.read_csv('./datasets/anger_train.txt', sep='\t', names=train_columns)
fear_train = pd.read_csv('./datasets/fear_train.txt', sep='\t', names=train_columns)
joy_train = pd.read_csv('./datasets/joy_train.txt', sep='\t', names=train_columns)
sadness_train = pd.read_csv('./datasets/sadness_train.txt', sep='\t', names=train_columns)

In [3]:
# Peek at the first three anger training rows to check how the columns parsed.
anger_train.head(n=3)

Unnamed: 0,index,tweet,Emotion,score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896


In [7]:
# Validation data
# Dev files that carry intensity scores: all four columns are named and kept.
dev_scored_columns = ['index', 'tweet', 'Emotion', 'score']
anger_dev_with_intensity = pd.read_csv('./datasets/anger_dev_w.txt', sep='\t', names=dev_scored_columns)
fear_dev_with_intensity = pd.read_csv('./datasets/fear_dev_w.txt', sep='\t', names=dev_scored_columns)
joy_dev_with_intensity = pd.read_csv('./datasets/joy_dev_w.txt', sep='\t', names=dev_scored_columns)
sadness_dev_with_intensity = pd.read_csv('./datasets/sadness_dev_w.txt', sep='\t', names=dev_scored_columns)


# Dev files without scores. Only three names are supplied here; reset_index then
# moves whatever pandas used as the row index into a regular 'index' column, and
# the 'score' column is dropped.
# NOTE(review): presumably the files have four fields, so the id field lands in
# the index -- the anger_dev preview shows tweet ids in 'index'; confirm.
dev_unscored_columns = ['tweet', 'Emotion', 'score']

anger_dev = (
    pd.read_csv('./datasets/anger_dev.txt', sep='\t', names=dev_unscored_columns)
    .reset_index(drop=False)
    .drop(columns=['score'])
)

fear_dev = (
    pd.read_csv('./datasets/fear_dev.txt', sep='\t', names=dev_unscored_columns)
    .reset_index(drop=False)
    .drop(columns=['score'])
)

joy_dev = (
    pd.read_csv('./datasets/joy_dev.txt', sep='\t', names=dev_unscored_columns)
    .reset_index(drop=False)
    .drop(columns=['score'])
)

sadness_dev = (
    pd.read_csv('./datasets/sadness_dev.txt', sep='\t', names=dev_unscored_columns)
    .reset_index(drop=False)
    .drop(columns=['score'])
)

In [8]:
# First three scored anger dev rows.
anger_dev_with_intensity.head(n=3)

Unnamed: 0,index,tweet,Emotion,score
0,10941,At the point today where if someone says somet...,anger,0.319
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,0.144
2,10943,This game has pissed me off more than any othe...,anger,0.898


In [9]:
# First three unscored anger dev rows (the 'score' column has been dropped).
anger_dev.head(n=3)

Unnamed: 0,index,tweet,Emotion
0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger
1,10858,@ArcticFantasy I would have almost took offens...,anger
2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger


In [11]:
# Test data
# Both variants are read with the same four column names; the unscored frames
# then drop 'score'.
test_columns = ['index', 'tweet', 'Emotion', 'score']
anger_test_with_intensity = pd.read_csv('./datasets/anger_test_w.txt', sep='\t', names=test_columns)
fear_test_with_intensity = pd.read_csv('./datasets/fear_test_w.txt', sep='\t', names=test_columns)
joy_test_with_intensity = pd.read_csv('./datasets/joy_test_w.txt', sep='\t', names=test_columns)
sadness_test_with_intensity = pd.read_csv('./datasets/sadness_test_w.txt', sep='\t', names=test_columns)


anger_test = (
    pd.read_csv('./datasets/anger_test.txt', sep='\t', names=test_columns)
    .drop(columns=['score'])
)

fear_test = (
    pd.read_csv('./datasets/fear_test.txt', sep='\t', names=test_columns)
    .drop(columns=['score'])
)

joy_test = (
    pd.read_csv('./datasets/joy_test.txt', sep='\t', names=test_columns)
    .drop(columns=['score'])
)

sadness_test = (
    pd.read_csv('./datasets/sadness_test.txt', sep='\t', names=test_columns)
    .drop(columns=['score'])
)

In [12]:
# First three scored anger test rows.
# NOTE(review): the output recorded for this cell shows the same ids, tweets and
# scores as the anger_dev_with_intensity preview -- confirm that
# './datasets/anger_test_w.txt' and './datasets/anger_dev_w.txt' are really
# different splits.
anger_test_with_intensity.head(n=3)

Unnamed: 0,index,tweet,Emotion,score
0,10941,At the point today where if someone says somet...,anger,0.319
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,0.144
2,10943,This game has pissed me off more than any othe...,anger,0.898


In [13]:
# First three unscored anger test rows (the 'score' column has been dropped).
anger_test.head(n=3)

Unnamed: 0,index,tweet,Emotion
0,10941,At the point today where if someone says somet...,anger
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger
2,10943,This game has pissed me off more than any othe...,anger


**Linear Regression**

In [14]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: 'punkt' for word_tokenize and
# 'stopwords' for the English stopword list.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Built once: the original re-evaluated stopwords.words('english') for every
# token of every tweet. A frozenset also makes each membership test a hash lookup
# instead of a list scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a tweet into a space-joined string of lowercase, non-stopword tokens.

    Steps, in order: remove links, remove punctuation, lowercase, tokenize with
    NLTK's ``word_tokenize``, drop English stopwords.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when none survive.
    """
    # Drop links: 'http' and everything after it up to the next whitespace.
    text = re.sub(r'http\S+', '', text)
    # Drop every character that is neither a word character nor whitespace
    # (so '@', '#', apostrophes, etc. vanish; underscores stay because \w matches them).
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    # NOTE(review): apostrophes are already gone at this point, so contraction
    # forms survive filtering (the sample output keeps 'wasnt') -- confirm this
    # is intended.
    filtered_tokens = [token for token in tokens if token not in _ENGLISH_STOPWORDS]
    return ' '.join(filtered_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Clean the 'tweet' column of every loaded frame in place, in the same order
# as the splits were loaded: train, dev (scored), dev, test (scored), test.
frames_to_clean = (
    # training
    anger_train, fear_train, joy_train, sadness_train,
    # validation, with intensity scores
    anger_dev_with_intensity, fear_dev_with_intensity,
    joy_dev_with_intensity, sadness_dev_with_intensity,
    # validation, without scores
    anger_dev, fear_dev, joy_dev, sadness_dev,
    # test, with intensity scores
    anger_test_with_intensity, fear_test_with_intensity,
    joy_test_with_intensity, sadness_test_with_intensity,
    # test, without scores
    anger_test, fear_test, joy_test, sadness_test,
)

for frame in frames_to_clean:
    # Overwrites the raw text; the original tweets are only recoverable by
    # re-running the loading cells.
    frame['tweet'] = frame['tweet'].apply(preprocess_text)

In [16]:
# Spot-check one cleaned tweet (row label 1 of the anger training frame).
anger_train.loc[1, 'tweet']

'indian uber driver called someone n word wasnt moving vehicle id jumped disgusted'

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import math

emotions = ['anger', 'fear', 'joy', 'sadness']
models = {}
# Fitted TF-IDF vectorizer per emotion. A regressor in `models` only accepts
# features produced by the vectorizer it was trained with, so both are kept;
# previously the vectorizer was lost at the end of each iteration.
vectorizers = {}

# Explicit emotion -> frame lookups instead of resolving names through globals().
train_frames = {
    'anger': anger_train,
    'fear': fear_train,
    'joy': joy_train,
    'sadness': sadness_train,
}
val_frames = {
    'anger': anger_dev_with_intensity,
    'fear': fear_dev_with_intensity,
    'joy': joy_dev_with_intensity,
    'sadness': sadness_dev_with_intensity,
}

for emotion in emotions:
    # NOTE(review): 'tweet' was already passed through preprocess_text in an
    # earlier cell, so this is a second pass over cleaned text. It is kept so
    # the 'clean_tweet' column is produced exactly as before.
    train_data = train_frames[emotion]
    train_data['clean_tweet'] = train_data['tweet'].apply(preprocess_text)

    val_data = val_frames[emotion]
    val_data['clean_tweet'] = val_data['tweet'].apply(preprocess_text)

    # The vectorizer is fitted on the training tweets only; validation tweets
    # are just transformed with it.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train = tfidf_vectorizer.fit_transform(train_data['clean_tweet'])
    y_train = train_data['score']
    X_val = tfidf_vectorizer.transform(val_data['clean_tweet'])
    y_val = val_data['score']

    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    models[emotion] = lr_model
    vectorizers[emotion] = tfidf_vectorizer

    y_pred_val = lr_model.predict(X_val)
    rmse_val = math.sqrt(mean_squared_error(y_val, y_pred_val))
    print(f"Emotion: {emotion}, Root Mean Squared Error on Validation Set: {rmse_val}")

Emotion: anger, Root Mean Squared Error on Validation Set: 0.23503300363514398
Emotion: fear, Root Mean Squared Error on Validation Set: 0.20989997331548327
Emotion: joy, Root Mean Squared Error on Validation Set: 0.20467512407838678
Emotion: sadness, Root Mean Squared Error on Validation Set: 0.25977873608509977


In [18]:
emotions = ['anger', 'fear', 'joy', 'sadness']
# This cell retrains from scratch, so any models stored earlier are replaced.
models = {}
# Fitted TF-IDF vectorizer per emotion. A regressor in `models` only accepts
# features produced by the vectorizer it was trained with, so both are kept;
# previously the vectorizer was lost at the end of each iteration.
vectorizers = {}

# Explicit emotion -> frame lookups instead of resolving names through globals().
train_frames = {
    'anger': anger_train,
    'fear': fear_train,
    'joy': joy_train,
    'sadness': sadness_train,
}
test_frames = {
    'anger': anger_test_with_intensity,
    'fear': fear_test_with_intensity,
    'joy': joy_test_with_intensity,
    'sadness': sadness_test_with_intensity,
}

# NOTE(review): the recorded anger and fear test RMSEs equal the validation
# RMSEs to every digit -- confirm the *_test_w.txt and *_dev_w.txt files for
# those two emotions are not the same data.
for emotion in emotions:
    # NOTE(review): 'tweet' was already passed through preprocess_text in an
    # earlier cell, so this is a second pass over cleaned text. It is kept so
    # the 'clean_tweet' column is produced exactly as before.
    train_data = train_frames[emotion]
    train_data['clean_tweet'] = train_data['tweet'].apply(preprocess_text)

    test_data = test_frames[emotion]
    test_data['clean_tweet'] = test_data['tweet'].apply(preprocess_text)

    # The vectorizer is fitted on the training tweets only; test tweets are
    # just transformed with it.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train = tfidf_vectorizer.fit_transform(train_data['clean_tweet'])
    y_train = train_data['score']
    X_test = tfidf_vectorizer.transform(test_data['clean_tweet'])
    y_test = test_data['score']

    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    models[emotion] = lr_model
    vectorizers[emotion] = tfidf_vectorizer

    y_pred_test = lr_model.predict(X_test)
    rmse_test = math.sqrt(mean_squared_error(y_test, y_pred_test))
    print(f"Emotion: {emotion}, Root Mean Squared Error on Test Set: {rmse_test}")

Emotion: anger, Root Mean Squared Error on Test Set: 0.23503300363514398
Emotion: fear, Root Mean Squared Error on Test Set: 0.20989997331548327
Emotion: joy, Root Mean Squared Error on Test Set: 0.2265382193594594
Emotion: sadness, Root Mean Squared Error on Test Set: 0.2700066327077875
