# Applying various ML models on twitter_tweets dataset to mark the inappropriate tweets

In [1]:
# Importing basic libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing libraries for NLP

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer() 

In [9]:
# Reading the dataset
tweets=pd.read_csv(r'C:/Users/Hemant Raturi/Downloads/DA & ML/ML twitter_tweets.csv')
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [10]:
# Checking for any null values in dataset
tweets.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [11]:
tweets.shape

(31962, 3)

Dataset contains 31962 tweets and tweets will be out feature matrix(X) and label will be our prediction vector(y).
We will need to clean our tweets and will follow the following steps for each of the tweet in dataset.
1. Remove the '@user' from tweet as it is of no use.
2. Remove all the puncutations, symbols, numbers, emojis etc.
3. unifrom casing, Lowercase is preferred.
4. Splitting the tweet to create list of all words in a tweet.
5. Remove all the unwanted words like prepositions,conjunctions, pronouns, determiners etc and applying Stemming ya        Lemmatization.
6. Joining the tweet back to make a string.

In [12]:
clean_tweet=[]

for i in range(31962):
    temp=tweets['tweet'][i].replace('@user', '')
    temp=re.sub('[^a-zA-Z]', ' ', temp)
    temp=temp.lower()
    temp=temp.split()
    temp = [ps.stem(word) for word in temp 
     if not word in set(stopwords.words('english'))]
    temp = ' '.join(temp)
    clean_tweet.append(temp)

In [13]:
clean_tweet[:20]

['father dysfunct selfish drag kid dysfunct run',
 'thank lyft credit use caus offer wheelchair van pdx disapoint getthank',
 'bihday majesti',
 'model love u take u time ur',
 'factsguid societi motiv',
 'huge fan fare big talk leav chao pay disput get allshowandnogo',
 'camp tomorrow danni',
 'next school year year exam think school exam hate imagin actorslif revolutionschool girl',
 'love land allin cav champion cleveland clevelandcavali',
 'welcom gr',
 'ireland consum price index mom climb previou may blog silver gold forex',
 'selfish orlando standwithorlando pulseshoot orlandoshoot biggerproblem selfish heabreak valu love',
 'get see daddi today day gettingf',
 'cnn call michigan middl school build wall chant tcot',
 'comment australia opkillingbay seashepherd helpcovedolphin thecov helpcovedolphin',
 'ouch junior angri got junior yugyoem omg',
 'thank paner thank posit',
 'retweet agre',
 'friday smile around via ig user cooki make peopl',
 'know essenti oil made chemic']

Our tweets are now cleaned and now we will apply our 'Bag of words' model on it. For that we will import countVectorizer from sklearn.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Keeping the max_features 500(top words)
cv = CountVectorizer(max_features = 500)

In [20]:
X = cv.fit_transform(clean_tweet)
X

<31962x500 sparse matrix of type '<class 'numpy.int64'>'
	with 119240 stored elements in Compressed Sparse Row format>

To see the sparse matrix created we will convert X to array

In [21]:
# X -> Feature matrix
X=X.toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
# y -> Prediction vector
y=tweets['label'].values

In [26]:
y

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

Now we will apply various ML algorithms and check which one performes the best.

    Applying the Logistic Regression Model

In [27]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [28]:
log_reg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
log_reg.score(X, y)

0.946999561979851

    Applying the KNN Model

In [30]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [31]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [32]:
knn.score(X, y)

0.9525061009949315

    Applying the Decision tree Model

In [34]:
from sklearn.tree import DecisionTreeClassifier
dtf = DecisionTreeClassifier()

In [35]:
dtf.fit(X, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [36]:
dtf.score(X, y)

0.9865465239972467

    Applying the Naive bayes Model

In [37]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

In [38]:
nb.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [39]:
nb.score(X, y)

0.5110443651836556

Amongst all the Models we applied Decisoion tree perfomed best with output of 98%. It might be due to overfitting of the model.