In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [2]:
# Load the labelled tweets; the relative path is resolved against the
# kernel's working directory.
train_tweet = pd.read_csv("train_tweets.csv")
train_tweet.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# Check column dtypes and non-null counts before any preprocessing.
train_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [4]:
# Class balance of the target column; the recorded counts show label 0
# far outnumbering label 1, which matters when reading accuracy later on.
train_tweet['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [5]:
def drop_features(features, data):
    """Remove the given columns from ``data`` in place.

    Parameters
    ----------
    features : label or list of labels
        Column name(s) to remove; handed straight to ``DataFrame.drop``.
    data : pandas.DataFrame
        Frame to modify. The caller's object is mutated directly.

    Returns
    -------
    None
        Nothing is returned because the drop happens in place.

    Raises
    ------
    KeyError
        Raised by pandas when a requested column is not present, which is
        also what happens if the same drop is executed a second time.
    """
    data.drop(columns=features, inplace=True)

In [6]:
def process_tweet(tweet):
    """Normalise a raw tweet into space-separated lowercase ASCII tokens.

    The text is lowercased, ``@mentions`` are removed, every remaining
    character that is not an ASCII letter, digit, space or tab is deleted
    (not replaced by a space, so ``can't`` becomes ``cant``), and runs of
    whitespace are collapsed to single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives cleaning.
    """
    lowered = tweet.lower()
    # Handles may contain underscores. Without "_" in the handle class,
    # "@john_doe" would be cut at the underscore and leave "doe" behind as
    # a spurious token. Upper-case ranges are unnecessary after lower().
    cleaned = re.sub(r"@[a-z0-9_]+|[^0-9a-z \t]", "", lowered)
    # split()/join collapses repeated whitespace and trims both ends.
    return " ".join(cleaned.split())

In [7]:
# Add a cleaned-text column alongside the raw tweet; process_tweet is
# called once per row.
train_tweet['processed_tweets'] = train_tweet['tweet'].apply(process_tweet)

In [8]:
# Compare raw and cleaned text side by side for the first ten rows.
train_tweet.head(10)

Unnamed: 0,id,label,tweet,processed_tweets
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,22 huge fan fare and big talking before they l...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so gr8


In [9]:
# Keep only the label and the cleaned text. drop_features mutates
# train_tweet in place, so re-running this cell without reloading the
# data raises KeyError because the columns are already gone.
drop_features(['id','tweet'],train_tweet)

In [10]:
# Confirm which columns remain after the drop.
train_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   label             31962 non-null  int64 
 1   processed_tweets  31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 499.5+ KB


In [11]:
# Preview the frame that feeds the train/test split.
train_tweet.head()

Unnamed: 0,label,processed_tweets
0,0,when a father is dysfunctional and is so selfi...
1,0,thanks for lyft credit i cant use cause they d...
2,0,bihday your majesty
3,0,model i love u take with u all the time in ur
4,0,factsguide society now motivation


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is repeatable.
# NOTE(review): the recorded label counts are heavily skewed (29720 vs 2242)
# and this split is not stratified -- consider stratify=train_tweet["label"]
# so both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(train_tweet["processed_tweets"],train_tweet["label"], test_size = 0.2, random_state = 3)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [15]:
# Bag-of-words counts with scikit-learn's built-in English stop-word list
# filtered out.
count_vect = CountVectorizer(stop_words='english')
# Re-weight the counts to TF-IDF: sublinear_tf=True replaces tf with
# 1 + log(tf), and norm='l2' scales each document row to unit length.
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [16]:
# Learn the vocabulary and the IDF weights from the training split only.
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [17]:
# transform (not fit_transform): reuse the vocabulary and IDF weights fitted
# on the training split so nothing is learned from the held-out tweets.
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [18]:
# Shape check for the training matrices: raw counts first, TF-IDF second.
print(x_train_counts.shape, x_train_tfidf.shape, sep="\n")

(25569, 35561)
(25569, 35561)


In [19]:
# Shape check for the held-out matrices: raw counts first, TF-IDF second.
print(x_test_counts.shape, x_test_tfidf.shape, sep="\n")

(6393, 35561)
(6393, 35561)


# Decision Tree Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier  # model 1

# Seed the estimator: scikit-learn permutes candidate features at every
# split, so an unseeded tree can differ between runs and the scores below
# would not be reproducible. The seed matches the train/test split.
m1 = DecisionTreeClassifier(random_state=3)
m1.fit(x_train_tfidf, y_train)

DecisionTreeClassifier()

In [21]:
# Predict labels for the held-out tweets with the decision tree.
prediction=m1.predict(x_test_tfidf)
print(prediction)

[0 0 0 ... 0 0 0]


In [22]:
# confusion_matrix and accuracy_score imported here are reused by the later
# evaluation cells, so this cell must run before them.
# NOTE(review): f1_score is imported but not used in the visible cells.
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score
# scikit-learn layout: rows are true labels, columns are predicted labels,
# i.e. [[TN, FP], [FN, TP]] for the 0/1 labels used here.
c_m = confusion_matrix(y_test,prediction)
print(c_m)

[[5800  150]
 [ 176  267]]


In [23]:
# Accuracy of the decision tree on the held-out split. Read it next to the
# confusion matrix: the recorded matrix has 5950 negatives and 443
# positives, so predicting 0 for every tweet would already score about 0.93.
accuracy_score(y_test,prediction)

0.9490067261066791

# Logistic Regression Classifier

In [24]:
from sklearn.linear_model import LogisticRegression  # model 2
# Logistic regression with default hyper-parameters, fitted on the same
# TF-IDF training matrix as the other models.
m2=LogisticRegression()
m2.fit(x_train_tfidf,y_train)

LogisticRegression()

In [25]:
# Predict labels for the held-out tweets with logistic regression.
prediction2=m2.predict(x_test_tfidf)
print(prediction2)

[0 0 0 ... 0 0 0]


In [26]:
# Rows are true labels, columns are predicted labels. c_m is rebound here,
# so the previous model's matrix is no longer available afterwards.
c_m=confusion_matrix(y_test,prediction2)
print(c_m)

[[5936   14]
 [ 312  131]]


In [27]:
# Accuracy of logistic regression on the held-out split. The recorded value
# equals the decision tree's because both recorded matrices contain 6067
# correct predictions, even though the error types differ.
accuracy_score(y_test,prediction2)

0.9490067261066791

# Support Vector Machine Classifier

In [28]:
from sklearn.svm import SVC  # model 3
# Support vector classifier with default hyper-parameters, fitted on the
# TF-IDF training matrix.
m3=SVC()
m3.fit(x_train_tfidf,y_train)

SVC()

In [29]:
# Predict labels for the held-out tweets with the SVM.
prediction3=m3.predict(x_test_tfidf)
print(prediction3)

[0 0 0 ... 0 0 0]


In [30]:
# Confusion matrix for the SVM (rows: true labels, columns: predictions);
# c_m is rebound again for this model.
c_m=confusion_matrix(y_test,prediction3)
print(c_m)

[[5935   15]
 [ 253  190]]


In [31]:
# Accuracy of the SVM on the held-out split; interpret it together with the
# confusion matrix because the labels are imbalanced.
accuracy_score(y_test,prediction3)

0.9580791490692946

# Random Forest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier  # model 4

# 200 trees. Bootstrap sampling and per-split feature sampling are random,
# so the forest is seeded to make the reported scores reproducible; the
# seed matches the train/test split.
m4 = RandomForestClassifier(n_estimators=200, random_state=3)
m4.fit(x_train_tfidf, y_train)

RandomForestClassifier(n_estimators=200)

In [33]:
# Predict labels for the held-out tweets with the random forest.
prediction4=m4.predict(x_test_tfidf)
print(prediction4)

[0 0 0 ... 0 0 0]


In [34]:
# Confusion matrix for the random forest (rows: true labels, columns:
# predictions); c_m is rebound again for this model.
c_m=confusion_matrix(y_test,prediction4)
print(c_m)

[[5920   30]
 [ 196  247]]


In [35]:
# Accuracy of the random forest on the held-out split; interpret it together
# with the confusion matrix because the labels are imbalanced.
accuracy_score(y_test,prediction4)

0.9646488346629125