In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)
import numpy as np
import re
import spacy
# Load the small English pipeline. The tagger must stay enabled: spaCy v3's
# rule-based lemmatizer needs the POS tags it produces (via attribute_ruler),
# otherwise token.lemma_ is just the unchanged token text. Only the parser
# and NER are unused by this notebook, so only those are switched off.
nlp=spacy.load('en_core_web_sm',disable=["parser","ner"])

**Loading and Exploring Data**

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset
# stored there can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the tweets CSV from the mounted Google Drive into a DataFrame.
# NOTE(review): this is an absolute, user-specific Drive path - adjust it
# when running the notebook from a different account or environment.
df = pd.read_csv('/content/drive/MyDrive/NLP/Twitter sentiment analysis/tweets_hate.csv')
# Report (rows, columns), then preview the first five rows.
print('Shape=>',df.shape)
df.head()

Shape=> (31962, 3)


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [None]:
# Show ten randomly chosen raw tweets. The seed is fixed so that
# Restart & Run All displays the same rows on every run.
df['tweet'].sample(10, random_state=1)

17348                                                                                      think about it. #motivation #sunday   
26808          thank you  ð santa ð
 , for the best christmas present ever #cdnpoli #abpoli #yyc #yeg #canada #ottawa @user 
8682                                                                                                  both   &amp; #dumb #pelosi 
2430                casually dancing like a dick to sorry by justin bieber ð is it sad i know the moves of by hea ð   #mood
19846               love #lunch at @user ðð #marbs #marbella #spain #oceanclub #daypay #summer #lifestyle #burger   #smile 
23713                                                         can't wait for the russians to give me a good kicking on thursday  
9465      @user still can't believe these babies are 6 months old this week! ð»   #furriday #cute #cats #kitten #catsoftwitter 
18864                                                            let's follow this very ni

In [None]:
# Relative frequency of each label, scaled from a fraction to a percentage
df['label'].value_counts(normalize=True).mul(100)

0    92.98542
1     7.01458
Name: label, dtype: float64

**Text Cleaning**

In [None]:
#define a function for text cleaning
def text_cleaner(text):
  """Normalize a raw tweet into a space-separated string of lemmas.

  The tweet is stripped of URLs, HTML entities, @mentions and #hashtags,
  lowercased, and reduced to the letters a-z. The module-level spaCy
  pipeline ``nlp`` then tokenizes it; stop words and whitespace tokens are
  dropped and the remaining tokens are replaced by their lemmas.

  NOTE(review): ``token.lemma_`` is only a real lemma if ``nlp`` runs the
  components the lemmatizer depends on (POS tagging in spaCy v3) - confirm
  against how ``nlp`` is loaded.

  Args:
    text: Raw tweet text. Any non-string value (e.g. NaN from an empty
      CSV cell) is treated as an empty tweet.

  Returns:
    The cleaned tokens joined by single spaces, without leading or
    trailing whitespace. May be an empty string.
  """
  if not isinstance(text, str):
    return ""

  # URLs are removed first so a '#fragment' or '@' inside a link cannot
  # cut the link in half before it is matched.
  text = re.sub(r'http\S+', ' ', text)
  # HTML entities such as '&amp;' would otherwise survive as the word 'amp'.
  text = re.sub(r'&#?\w+;', ' ', text)
  # Handles and hashtags may contain underscores; \w covers them so no
  # fragment of the handle/tag is left behind.
  text = re.sub(r'@\w+', ' ', text)
  text = re.sub(r'#\w+', ' ', text)
  text = text.lower()
  # Every run of non-letters (digits, punctuation, whitespace) collapses to
  # one space, so no separate whitespace pass is needed. strip() prevents a
  # leading space from reaching spaCy as a stand-alone whitespace token.
  text = re.sub(r"[^a-z]+", " ", text).strip()

  # creating doc object
  doc = nlp(text)

  # remove stopwords and whitespace tokens, and lemmatize the rest
  tokens = [token.lemma_ for token in doc
            if not (token.is_stop or token.is_space)]

  # join tokens by space
  return " ".join(tokens)

In [None]:
# Run every raw tweet through text_cleaner and keep the result in a new column
df['cleaned_text'] = df['tweet'].map(text_cleaner)



In [None]:
# x - input feature (the cleaned tweet text), y - target variable (the label)
x = df['cleaned_text'].to_numpy()
y = df['label'].to_numpy()

# Peek at the first ten cleaned tweets
x[:10]

array(['  father dysfunctional selfish drags kids dysfunction',
       '  thanks credit t use cause don t offer wheelchair vans pdx',
       '  bihday majesty', '  love u u time ur', '  factsguide society',
       '  huge fan fare big talking leave chaos pay disputes',
       '  camping tomorrow danny', 'school year year exams t think',
       'won love land', '  welcome m s'], dtype=object)

In [None]:
# First twenty labels; label 1 is the minority class (about 7% of rows
# according to the class distribution computed earlier).
y[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0])

**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the tweets for testing. Stratifying on y keeps the label
# ratio the same in both parts; random_state pins the shuffle so the split
# is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=1,
)

In [None]:
# Sanity check: within each split, features and labels must have the same length.
print('x_train:',x_train.shape,'y_train:',y_train.shape)
print('x_test:',x_test.shape,'y_test:',y_test.shape)

x_train: (23971,) y_train: (23971,)
x_test: (7991,) y_test: (7991,)


**Feature Engineering (TF-IDF Vectors are used)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training tweets only,
# with the vocabulary capped at 1000 terms. fit() returns the vectorizer
# itself, so construction and fitting are chained.
word_vectorizer = TfidfVectorizer(max_features=1000).fit(x_train)
word_vectorizer

In [None]:
# Create TF-IDF vectors for the train set using the already-fitted vectorizer
# (transform only, no refit). The trailing expression displays the matrix summary.
train_word_features = word_vectorizer.transform(x_train)
train_word_features

<23971x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 68786 stored elements in Compressed Sparse Row format>

In [None]:
# Create TF-IDF vectors for the test set with the vectorizer fitted on the
# training tweets (transform only, so the test data does not influence the
# vocabulary or IDF weights).
test_word_features = word_vectorizer.transform(x_test)
test_word_features

<7991x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 22320 stored elements in Compressed Sparse Row format>

**Machine Learning Algorithms Implementation (Naive Bayes and Logistic Regression)**

**Naive Bayes**

In [None]:
#Importing models
from sklearn.naive_bayes import MultinomialNB

# Create the classifier with default settings, then train it on the
# TF-IDF training matrix; the last line displays the fitted estimator.
nb_model = MultinomialNB()
nb_model.fit(train_word_features, y_train)
nb_model

In [None]:
# Naive Bayes predictions for the train set (the same rows the model was fitted on)
train_pred_nb=nb_model.predict(train_word_features)
train_pred_nb

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Evaluating the Naive Bayes model on the training set
from sklearn.metrics import f1_score , accuracy_score

# NOTE(review): labels are roughly 93% / 7%, so both weighted F1 and accuracy
# are dominated by label 0 - consider also reporting F1 for label 1.
print(
    "F1-score on Train Set:",
    f1_score(y_true=y_train, y_pred=train_pred_nb, average="weighted"),
    "Accuracy on Train set:",
    accuracy_score(y_true=y_train, y_pred=train_pred_nb),
)

F1-score on Train Set: 0.9223115066159836 Accuracy on Train set: 0.9408034708606232


In [None]:
# Naive Bayes predictions for the held-out test set
test_pred_nb=nb_model.predict(test_word_features)
test_pred_nb

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Evaluating the Naive Bayes model on the held-out test set
print(
    "F1-score on Testing set:",
    f1_score(y_true=y_test, y_pred=test_pred_nb, average="weighted"),
    "Accuracy on Testing set:",
    accuracy_score(y_true=y_test, y_pred=test_pred_nb),
)

F1-score on Testing set: 0.9198036871226718 Accuracy on Testing set: 0.9393067200600675


**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Create a logistic-regression classifier with default settings, train it
# on the TF-IDF training matrix, then display the fitted estimator.
lr_model = LogisticRegression()
lr_model.fit(train_word_features, y_train)
lr_model

In [None]:
# Logistic-regression predictions for the train set
train_pred_lr=lr_model.predict(train_word_features)
# Display this model's predictions (not train_pred_nb from the Naive Bayes section)
train_pred_lr

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Evaluating the logistic-regression model on the training set
print(
    "F1-score on Train Set:",
    f1_score(y_true=y_train, y_pred=train_pred_lr, average="weighted"),
    "Accuracy on Train set:",
    accuracy_score(y_true=y_train, y_pred=train_pred_lr),
)

F1-score on Train Set: 0.928217855062932 Accuracy on Train set: 0.9437653831713321


In [None]:
# Logistic-regression predictions for the held-out test set
# (the 25% split created by train_test_split; there is no separate validation set)
test_pred_lr=lr_model.predict(test_word_features)
test_pred_lr

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Evaluating the logistic-regression model on the held-out test set
print(
    "F1-score on Testing Set:",
    f1_score(y_true=y_test, y_pred=test_pred_lr, average="weighted"),
    "Accuracy on Testing set:",
    accuracy_score(y_true=y_test, y_pred=test_pred_lr),
)

F1-score on Testing Set: 0.9269500210527966 Accuracy on Testing set: 0.9420598172944563
