<h1 align='center'> Twitter Tweets Hate Speech Clasification

---

## Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

---

#### Data Loading

In [2]:
data = pd.read_csv('dataset/twitter_sentiments.csv')
data.head()

data.drop(['id'], inplace=True, axis=1)

In [3]:
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


#### Check for Null Values

In [4]:
data.isnull().sum()

label    0
tweet    0
dtype: int64

In [5]:
data.isnull().any()

label    False
tweet    False
dtype: bool

#### Describe

In [6]:
data.describe()

Unnamed: 0,label
count,31962.0
mean,0.070146
std,0.255397
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


#### Assigning X and Y to independent and the dependent Attribute

In [7]:
# Dependent Attribute // Target Attribute
Y = data['label']

# Independent Attribute
X = data.drop(['label'], axis=1)

#### Train Test Split

<pre>
We will stratify the data on the label column so that the distribution of the target label will be the same in both 
train and test data

In [8]:
train,test = train_test_split(data,test_size=0.2, stratify=Y, random_state=20)

In [9]:
train['label'].value_counts()

0    23775
1     1794
Name: label, dtype: int64

In [10]:
test['label'].value_counts()

0    5945
1     448
Name: label, dtype: int64

In [11]:
train.tweet.shape, train['label'].shape

((25569,), (25569,))

#### Applying TF-IDF 

In [12]:
tf_idf = TfidfVectorizer(lowercase=True,\
                       stop_words=ENGLISH_STOP_WORDS,\
                       max_features=1000)

tf_idf.fit(train.tweet)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [13]:
train_tf_idf = tf_idf.transform(train.tweet)
test_tf_idf = tf_idf.transform(test.tweet)

In [14]:
train_tf_idf

<25569x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 105576 stored elements in Compressed Sparse Row format>

In [15]:
train_tf_idf.shape

(25569, 1000)

### Logistic Regression

In [16]:
lr = LogisticRegression()

# Fitting the Model with Train Dataset
lr.fit(train_tf_idf, train.label)

LogisticRegression()

In [17]:
train_prediction = lr.predict(train_tf_idf)

In [18]:
test_prediction = lr.predict(test_tf_idf)

### Evaluation

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve,classification_report

In [20]:
print("Train dataset accuracy score: ", accuracy_score(train.label , train_prediction))
print("Test dataset accuracy score :",  accuracy_score(test.label , test_prediction))

Train dataset accuracy score:  0.9499784895772224
Test dataset accuracy score : 0.9474425152510558


In [21]:
from sklearn.metrics import classification_report

# Classification Report on Testing Dataset
print(f'Classification Report \n{classification_report(test.label ,test_prediction)}')

Classification Report 
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5945
           1       0.82      0.32      0.46       448

    accuracy                           0.95      6393
   macro avg       0.89      0.66      0.72      6393
weighted avg       0.94      0.95      0.94      6393



In [22]:
# Classification Report on Testing Dataset
print(f'Classification Report \n{classification_report(train.label ,train_prediction)}')

Classification Report 
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     23775
           1       0.87      0.34      0.49      1794

    accuracy                           0.95     25569
   macro avg       0.91      0.67      0.73     25569
weighted avg       0.95      0.95      0.94     25569



In [23]:
from sklearn.metrics import f1_score

# f1 score on train data
print(f'F1 Score for Training Dataset: {f1_score(y_true= train.label, y_pred= train_prediction)}')

# f1 score on test data
print(f'F1 Score for Testing Dataset:  {f1_score(y_true= test.label, y_pred= test_prediction)}')

F1 Score for Training Dataset: 0.4881952781112444
F1 Score for Testing Dataset:  0.45980707395498394


---

---

## Pipeline

<pre><b>
As we know for the implementing the Logistic Regression Model ; We need the data in proper format i.e. the unnecessary 
words need to be remove before predicting the result

In order to implement this we need to create a pipeline and add both the models i.e. Tfidf and the Logistic Regression 
with the required parameters

In [24]:
pipeline = Pipeline(steps=[
    
    ('tfidf', TfidfVectorizer(lowercase= True,\
                             stop_words=ENGLISH_STOP_WORDS,\
                             max_features=1000)),
    
    ('lr_model', LogisticRegression())
    
])

In [25]:
# Fitting the Training Dataset
pipeline.fit(train.tweet, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=100,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
            

### Testing the Pipeline

In [26]:
text = ['This isnt great news for the Muslim Community!!']

pipeline.predict(text)

array([0], dtype=int64)

In [27]:
# text = ['You Mother Fucker!! Dont you have sense in communicating with the BOSS']
# text = ["@user #cnn calls #michigan middle school 'build the wall' chant '' #tcot "]

# text = ["I’m in the mood to kill a [racial slur], who’s with me?"]
# text = ["retweet if you agree! "]

text = ["I’m sick of these [religious group] thinking they are better than us, if any of you see someone wearing a [religious symbol of the religious group], grab it off them and post pics!"]

pipeline.predict(text)

array([0], dtype=int64)

In [28]:
label_1 = data[data['label']==1]
label_1['tweet']

13       @user #cnn calls #michigan middle school 'buil...
14       no comment!  in #australia   #opkillingbay #se...
17                                  retweet if you agree! 
23         @user @user lumpy says i am a . prove it lumpy.
34       it's unbelievable that in the 21st century we'...
                               ...                        
31934    lady banned from kentucky mall. @user  #jcpenn...
31946    @user omfg i'm offended! i'm a  mailbox and i'...
31947    @user @user you don't have the balls to hashta...
31948     makes you ask yourself, who am i? then am i a...
31960    @user #sikh #temple vandalised in in #calgary,...
Name: tweet, Length: 2242, dtype: object

<pre> <b>
x-train = input for training (80%) -->  y-train = output of training(80%)
x-test  = input for testing (20%)  -->  y-test = output of testng(20%)

#### ===================================================================================================

### Balance the Imbalance dataset

In [29]:
data.shape

(31962, 2)

In [30]:
data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [31]:
print(f'Class 0 :: percentage of data : {round(data.label.value_counts()[0] / data.shape[0] * 100,2)}%')
print(f'Class 1 :: percentage of data : {round(data.label.value_counts()[1] / data.shape[0] * 100,2)}%')

more = round(data.label.value_counts()[0] / data.shape[0] * 100,2) / round(data.label.value_counts()[1] / data.shape[0] * 100,2)

print(f'\nClass 0 is {round(more,2)}x times more than Class 1')

Class 0 :: percentage of data : 92.99%
Class 1 :: percentage of data : 7.01%

Class 0 is 13.27x times more than Class 1
