In [110]:
# importing the necessary libraries
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download("punkt_tab")
nltk.download("wordnet")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,roc_auc_score
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HomePC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HomePC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [111]:
# loading the train data
df=pd.read_csv("twitter_training.csv")
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [112]:
# summarizing the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [113]:
# droping unnecessary columns
df.drop("2401",axis=1,inplace=True)

In [114]:
# renaming columns
df.rename(columns={"Borderlands":"entity","Positive":"Sentiment","im getting on borderlands and i will murder you all ,":"tweet"},inplace=True)

In [115]:
# checking for missing values in the train data 
df.isna().sum()

entity         0
Sentiment      0
tweet        686
dtype: int64

In [116]:
# dropping missing values
df.dropna(inplace=True)

In [117]:
df.isna().sum()

entity       0
Sentiment    0
tweet        0
dtype: int64

In [118]:
# checking for duplicates
df.duplicated().sum()

3038

In [119]:
# dropping duplicates
df.drop_duplicates(inplace=True)

In [120]:
# df description
df.describe()

Unnamed: 0,entity,Sentiment,tweet
count,70957,70957,70957
unique,32,4,69490
top,CallOfDuty,Negative,It is not the first time that the EU Commissio...
freq,2304,21565,77


In [121]:
# loading the validation dataset
df_test=pd.read_csv("twitter_validation.csv")
df_test.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [122]:
# summary of the dataset
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                                                                                                                                                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                                              --------------  ----- 
 0   3364                                                                                                                                                                                                                                                999 non-null    int64 
 1   Facebook                                                                   

In [123]:
# dropping unnecessary columns
df_test.drop("3364",axis=1,inplace=True)

In [124]:
# renaming columns 
df_test.rename(columns={"Facebook":"entity","Irrelevant":"Sentiment","I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣":"tweet"},inplace=True)
df_test.head()

Unnamed: 0,entity,Sentiment,tweet
0,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Google,Neutral,Now the President is slapping Americans in the...
4,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


## *Data preprocessing*

In [125]:
# creating columns for number of characters, number of sentences and number of words
df["char"]=df["tweet"].apply(lambda x:len(x))
df_test["char"]=df_test["tweet"].apply(lambda x:len(x))
df["words"]=df["tweet"].apply(lambda x:nltk.word_tokenize(x)).apply(lambda x:len(x))
df_test["words"]=df_test["tweet"].apply(lambda x:nltk.word_tokenize(x)).apply(lambda x:len(x))
df["sentences"]=df["tweet"].apply(lambda x:nltk.sent_tokenize(x)).apply(lambda x:len(x))
df_test["sentences"]=df_test["tweet"].apply(lambda x:nltk.sent_tokenize(x)).apply(lambda x:len(x))

In [126]:
# new train df decsription
df.describe()

Unnamed: 0,char,words,sentences
count,70957.0,70957.0,70957.0
mean,111.084431,23.08143,1.965063
std,79.159036,17.126667,1.642409
min,1.0,0.0,0.0
25%,49.0,10.0,1.0
50%,93.0,19.0,1.0
75%,155.0,32.0,2.0
max,957.0,198.0,33.0


In [127]:
# cleaning the tweets using regular expression
def clean_data(text):
    text=re.sub("[^a-zA-Z]"," ",text) # retain only letters
    text=str.lower(text)# turn all the text to lower case 
    text=text.split(" ") # splitting string into list
    text=" ".join(text) # joining all the text in the list 
    return text

In [128]:
# applying the cleaning function to our dataframe
df["clean_text"]=df["tweet"].apply(lambda x:clean_data(x))
df_test["clean_text"]=df_test["tweet"].apply(lambda x:clean_data(x))

In [129]:
df.head()

Unnamed: 0,entity,Sentiment,tweet,char,words,sentences,clean_text
0,Borderlands,Positive,I am coming to the borders and I will kill you...,51,13,1,i am coming to the borders and i will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...,50,11,1,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...,51,11,1,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,57,13,1,im getting on borderlands and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...,53,11,1,im getting into borderlands and i can murder y...


In [130]:
# tokenizing the tweets
df["tokens"]=df["clean_text"].apply(lambda x:nltk.word_tokenize(x))
df_test["tokens"]=df_test["clean_text"].apply(lambda x:nltk.word_tokenize(x))
df_test.head()


Unnamed: 0,entity,Sentiment,tweet,char,words,sentences,clean_text,tokens
0,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,109,17,1,bbc news amazon boss jeff bezos rejects clai...,"[bbc, news, amazon, boss, jeff, bezos, rejects..."
1,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,91,20,2,microsoft why do i pay for word when it funct...,"[microsoft, why, do, i, pay, for, word, when, ..."
2,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",71,16,1,csgo matchmaking is so full of closet hacking ...,"[csgo, matchmaking, is, so, full, of, closet, ..."
3,Google,Neutral,Now the President is slapping Americans in the...,170,26,2,now the president is slapping americans in the...,"[now, the, president, is, slapping, americans,..."
4,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,268,62,1,hi eahelp i ve had madeleine mccann in my cel...,"[hi, eahelp, i, ve, had, madeleine, mccann, in..."


In [131]:
# lemmatizing the tokens 
df["lemmas"]=df["tokens"].apply(lambda x:[nltk.WordNetLemmatizer().lemmatize(token) for token in x]).apply(lambda x: " ".join(x))
df_test["lemmas"]=df_test["tokens"].apply(lambda x:[nltk.WordNetLemmatizer().lemmatize(token) for token in x]).apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,entity,Sentiment,tweet,char,words,sentences,clean_text,tokens,lemmas
0,Borderlands,Positive,I am coming to the borders and I will kill you...,51,13,1,i am coming to the borders and i will kill you...,"[i, am, coming, to, the, borders, and, i, will...",i am coming to the border and i will kill you all
1,Borderlands,Positive,im getting on borderlands and i will kill you ...,50,11,1,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k...",im getting on borderland and i will kill you all
2,Borderlands,Positive,im coming on borderlands and i will murder you...,51,11,1,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu...",im coming on borderland and i will murder you all
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,57,13,1,im getting on borderlands and i will murder ...,"[im, getting, on, borderlands, and, i, will, m...",im getting on borderland and i will murder you...
4,Borderlands,Positive,im getting into borderlands and i can murder y...,53,11,1,im getting into borderlands and i can murder y...,"[im, getting, into, borderlands, and, i, can, ...",im getting into borderland and i can murder yo...


In [133]:
# # changing the sparse matrix into a dataframe
# tf_df=pd.DataFrame(X.toarray(),columns=tfidf.get_feature_names_out(),index=df.index)
# tf_df_val=pd.DataFrame(X_val.toarray(),columns=tfidf.get_feature_names_out(),index=df_test.index)

In [138]:
#defining X
X=df["lemmas"]

In [139]:
# defining target
y=df["Sentiment"]
y_val=df_test["Sentiment"]

In [140]:
# modelling pipeline
pipe=Pipeline([("tfidf",TfidfVectorizer(stop_words="english")),
               ("model",RandomForestClassifier())])

In [141]:
# fitting the model
pipe.fit(X,y)

In [142]:
# defining X_val 
X_val=df_test["lemmas"]

In [143]:
# predicting X_val
val_pred=pipe.predict(X_val)
train_pred=pipe.predict(X)

In [144]:
# classification report
print(f"train {classification_report(y,train_pred)}")
print(f"validation {classification_report(y_val,val_pred)}")

train               precision    recall  f1-score   support

  Irrelevant       1.00      0.97      0.98     12446
    Negative       0.99      0.98      0.98     21565
     Neutral       1.00      0.97      0.98     17398
    Positive       0.94      0.99      0.96     19548

    accuracy                           0.98     70957
   macro avg       0.98      0.98      0.98     70957
weighted avg       0.98      0.98      0.98     70957

validation               precision    recall  f1-score   support

  Irrelevant       0.99      0.93      0.96       171
    Negative       0.94      0.97      0.95       266
     Neutral       0.95      0.95      0.95       285
    Positive       0.96      0.97      0.96       277

    accuracy                           0.96       999
   macro avg       0.96      0.95      0.96       999
weighted avg       0.96      0.96      0.96       999



In [148]:
# roc_auc scores 
y_train_score=pipe.predict_proba(X)
y_val_score=pipe.predict_proba(X_val)
print(f"train auc {roc_auc_score(y,y_train_score,multi_class='ovo')}, val score {roc_auc_score(y_val,y_val_score,multi_class='ovo')}")

train auc 0.99927070936292, val score 0.9968105520237917
