## Emotion Classification Project COMP544DL

### Loucas Sialis, Neophytos Petrou, Konstantinos Telioglanidis

### Instructions: Place this .ipynb file in the same folder as the train.csv and test.csv files, then run all the cells in the notebook. The result will be a predictions.txt file in the same folder.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Install a blanket "ignore" filter so no warning of any category is shown
# by the cells that follow.
import warnings

warnings.simplefilter('ignore')

In [3]:
# Negative-contraction stems and their expansions, applied in this order.
# Both the apostrophe and the trailing "t" are optional, so "aren't",
# "arent" and the bare stem "aren" are all rewritten to "are not".
# NOTE(review): because the "t" is optional, ordinary words that equal a
# stem are rewritten too ("won" -> "will not", "don" -> "do not") --
# confirm this is intended.
_CONTRACTIONS = tuple(
    (re.compile(pattern), expansion)
    for pattern, expansion in (
        (r"\baren'?t?\b", "are not"),
        (r"\bcouldn'?t?\b", "could not"),
        (r"\bdon'?t?\b", "do not"),
        (r"\bdoesn'?t?\b", "does not"),
        (r"\bdidn'?t?\b", "did not"),
        (r"\bhadn'?t?\b", "had not"),
        (r"\bhasn'?t?\b", "has not"),
        (r"\bhaven'?t?\b", "have not"),
        (r"\bisn'?t?\b", "is not"),
        (r"\bshouldn'?t?\b", "should not"),
        (r"\bwasn'?t?\b", "was not"),
        (r"\bweren'?t?\b", "were not"),
        (r"\bwon'?t?\b", "will not"),
        (r"\bwouldn'?t?\b", "would not"),
    )
)

# Extra tokens dropped in addition to NLTK's English stop-word list.
_EXTRA_STOP_WORDS = (
    'www', 'http', 'gonna', 'theyre', 'etc',
    'theres', 'hes', 'thats', 'youre', 'gotta',
    'wanna', 'whats', 'alot', 'img', 'cuz', 'shes',
    'bbq', 'bc', 'weve', 'ive', 'href', 'kinda',
)

# Words that are deliberately kept even though NLTK lists them as stop words.
_KEPT_WORDS = frozenset({'more', 'but', 'very', 'no', 'not', 'too'})

_NON_WORD_OR_DIGIT = re.compile(r'[\W\d]')   # punctuation and numbers
_SINGLE_LETTER = re.compile(r'\b[a-zA-Z]\b')  # isolated letters
_TWO_CHAR_WORD = re.compile(r'\b\w{2}\b')     # 2-character words

_stop_word_cache = None


def _stop_words():
    """Return the stop-word set, building it on first use only."""
    global _stop_word_cache
    if _stop_word_cache is None:
        words = set(stopwords.words('english'))
        words.update(_EXTRA_STOP_WORDS)
        _stop_word_cache = frozenset(words - _KEPT_WORDS)
    return _stop_word_cache


def preprocess(text):
    """Normalise one raw text sample into a cleaned, space-separated string.

    Steps, in order: lowercase and collapse whitespace; expand negative
    contractions; drop stop words; replace punctuation and digits with
    spaces; remove isolated letters and 2-character words; lemmatize every
    remaining token as a noun.

    Stop words are removed *before* punctuation is stripped, so a token that
    still carries punctuation (e.g. "the,") is not matched against the
    stop-word set.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str``.

    Returns:
        The cleaned text, possibly the empty string.
    """
    # Guard against missing values instead of failing on ``.split()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    cleaned = ' '.join(token.lower() for token in str(text).split())

    for pattern, expansion in _CONTRACTIONS:
        cleaned = pattern.sub(expansion, cleaned)

    stop_words = _stop_words()
    cleaned = ' '.join(
        token for token in cleaned.split() if token not in stop_words
    )

    cleaned = _NON_WORD_OR_DIGIT.sub(' ', cleaned)
    cleaned = _SINGLE_LETTER.sub('', cleaned)
    cleaned = _TWO_CHAR_WORD.sub('', cleaned)

    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token, pos='n') for token in cleaned.split()
    )

In [4]:
def train_test(train_dataset, test_dataset, output_path='predictions.txt'):
    """Train the voting ensemble on one CSV and write predictions for another.

    The training text is cleaned with ``preprocess``, turned into unigram and
    bigram counts, and fed to a hard-voting ensemble of a random forest, a
    linear SVM, a logistic regression and a Complement Naive Bayes model.
    The fitted pipeline then labels the test text.

    Args:
        train_dataset: Training CSV (anything ``pd.read_csv`` accepts) with
            ``text`` and ``emotion`` columns.
        test_dataset: Test CSV with a ``text`` column.
        output_path: Where the predictions are written, one label per line
            with no header or index. Defaults to ``'predictions.txt'``.

    Returns:
        The return value of ``DataFrame.to_csv`` for ``output_path``.
    """
    # Training dataset: clean the text; missing cells become empty documents
    # so a single blank row cannot abort the run.
    train_data = pd.read_csv(train_dataset)
    X = train_data["text"].fillna('').apply(preprocess)
    y = train_data["emotion"]

    # Bag-of-words counts over unigrams and bigrams seen in >= 2 documents.
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 2))

    rf_clf = RandomForestClassifier(
        n_estimators=300,
        class_weight='balanced',
        max_depth=None,
        max_features='sqrt',
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=42,
    )

    svm_clf = LinearSVC(
        C=0.2,
        class_weight='balanced',
        penalty='l1',
        dual=False,  # the l1 penalty is paired with the primal formulation
        max_iter=10000,
    )

    # NOTE(review): `multi_class` is reported as deprecated by recent
    # scikit-learn releases -- confirm against the installed version.
    log_clf = LogisticRegression(
        C=0.9,
        penalty='l1',
        solver='liblinear',
        multi_class='ovr',
        max_iter=1000,
        random_state=42,  # same fixed seed as the forest, for repeatable runs
    )

    nb_clf = ComplementNB(
        fit_prior=False,
        norm=True,
    )

    # Hard voting: each model casts one vote per sample (LinearSVC exposes
    # no predict_proba, so soft voting is not an option here).
    voting_clf = VotingClassifier(
        estimators=[
            ('rf', rf_clf),
            ('svm', svm_clf),
            ('log', log_clf),
            ('complement_bayes', nb_clf),
        ],
        voting='hard',
    )

    pipeline = Pipeline([
        ('bow', vectorizer),
        ('voting', voting_clf),
    ])

    pipeline.fit(X, y)

    # Testing dataset: same cleaning as the training text.
    test_data = pd.read_csv(test_dataset)
    test_text = test_data['text'].fillna('').apply(preprocess)

    predictions = pipeline.predict(test_text)
    return pd.DataFrame(predictions).to_csv(
        output_path, index=False, header=False
    )

In [None]:
# Predict new data: fit on train.csv, then label the rows of test.csv.
train_test(train_dataset='train.csv', test_dataset='test.csv')