# Building a Basic ML model for Text Classification

We'd be using ML model to classify whether a particular tweet is hate speech or not

In [31]:
# Import Libraries
import pandas as pd 

import re

from wordcloud import STOPWORDS

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [12]:
# import our data
tweets = pd.read_csv(r'C:\Users\hp\Documents\PYTHON\Tensorflow NLP\Tweets for NLP-ML.csv')
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


The label column is our target and the tweet is our feature column

We are going to be trying the Logistic regression and GaussianNB algorithm to train our model 

First, let's do some data cleaning and feature engineering

In [14]:
for index, tweet in enumerate(tweets['tweet'].head(10)):
    print(str(index+1) + ". " + tweet)

1.  @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
2. @user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked
3.   bihday your majesty
4. #model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  
5.  factsguide: society now    #motivation
6. [2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  
7.  @user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
8. the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams   #hate #imagine #actorslife #revolutionschool #girl
9. we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â¦ 
10.  @user @user welcome here !  i'm   it's so #gr8 ! 


In [16]:
# Remove noise like symbols, non alphabets

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = text.lower()

    return text

In [17]:
tweets['cleaned_text'] = tweets.tweet.apply(lambda x: clean_text(x))

In [18]:
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_text
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [27]:
# Feature engineering function

# Word Freq
def freq(text):
    word_list = []

    for tw_word in text.split():
        word_list.extend(tw_word)

    word_freq = pd.Series(word_list).value_counts()

    word_freq = word_freq.drop(STOPWORDS, errors='ignore')
    return word_freq



# Negation Term in text
def neg(words):
    pattern = r"\wn't"
    for word in words:
        if word in ['n', 'no', 'non', 'not'] or re.search(pattern, word):
            return 1
    else:
        return 0



# Rare word in text
def rare(words, rare_50):
    for word in words:
        if word in rare_50:
            return 1
    else:
        return 0



#Questions in text
def quest(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0


In [29]:
word_freq = freq(tweets.cleaned_text.str)
#50 most rare words
rare_50 = tweets[-50:]
#Number of word in tweet
tweets['word_count'] = tweets.cleaned_text.str.split().apply(lambda x: len(x))
#Negation present or not
tweets['neg'] = tweets.cleaned_text.str.split().apply(lambda x: neg(x))
#Rare word present or not
tweets['rare_'] = tweets.cleaned_text.str.split().apply(lambda x: rare(x, rare_50))
#Questions present or not
tweets['quest'] = tweets.cleaned_text.str.split().apply(lambda x: quest(x))


In [30]:
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_text,word_count,neg,rare_,quest
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,0,1
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...,19,1,0,0
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0


In [33]:
tweets.shape

(5242, 8)

The ML

Split data into train and test


In [35]:
X = tweets[['word_count', 'neg', 'rare_', 'quest']]
y = tweets.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

In [36]:
model = LogisticRegression()
model = model.fit(X_train,y_train)
pred = model.predict(X_test)

In [37]:
print("Accuracy:", accuracy_score(y_test,pred)*100, "%")

Accuracy: 59.37698664971393 %


In [38]:
model2 = GaussianNB()
model2 = model2.fit(X_train,y_train)
pred2 = model2.predict(X_test)

In [40]:
print("Accuracy:", accuracy_score(y_test,pred2)*100, "%")

Accuracy: 59.05912269548633 %
