In [22]:
#import necessary libraries
import pandas as pd
import numpy as np
import re
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
df=pd.read_csv("/home/kanisk/Downloads/naive_bayes.txt",sep='\t',names=['exist','tweet','confidence'])

In [31]:
#check missing values
print("Checking missing values\n\n")
print(df.isnull().sum())

Checking missing values


exist         0
tweet         0
confidence    4
dtype: int64


In [34]:
#Making data-frame
dfz = pd.DataFrame(df)

In [39]:
#counts in each class
print("\n")
print("Counts in each class\n")
print(dfz['exist'].value_counts())



Counts in each class

1        3114
0        2976
Exist       1
Name: exist, dtype: int64


In [48]:
X = dfz['tweet']
y = dfz['exist']

In [49]:
#change text lower cases and removal of white spaces
lower_text = []
for i in range(0,len(X)):
    s = str(X[i])
    s1 = s.strip()
    lower_text.append(s1.lower())
print("After converting text to lower case\n\n",lower_text)

After converting text to lower case



In [57]:
#Remove punctuation
punc_text = []
for i in range(0,len(lower_text)):
    s2 = (lower_text[i])
    s3=s2
    punc_text.append(s3)
print("After removed unncecessary punctuation\n\n",punc_text)

After removed unncecessary punctuation



In [59]:
#Word vectorization
#Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,max_df = 0.7,norm='l2', encoding='latin-1', ngram_range=(1, 2),
stop_words='english')

In [65]:
#transform independent variable using TF-IDF vectorizer
X_tfidf = tfidf.fit_transform(punc_text)
print("After vectorized text data\n\n",X_tfidf)

After vectorized text data

   (0, 3519)	1.0
  (1, 1548)	0.08418708942245767
  (1, 3647)	0.0852629026086378
  (1, 2850)	0.2255528376710184
  (1, 3563)	0.37802539250085315
  (1, 1591)	0.3669774916790578
  (1, 88)	0.3216116763998092
  (1, 223)	0.2971191244773725
  (1, 3870)	0.22387436218556508
  (1, 1354)	0.34690279861849127
  (1, 1887)	0.3378760063797201
  (1, 1824)	0.3922686138977398
  (1, 2136)	0.14007558972975975
  (1, 1556)	0.08580124604378157
  (2, 1548)	0.09498423510756134
  (2, 3647)	0.09619802327043715
  (2, 2136)	0.15804053613204713
  (2, 1556)	0.09680541021970812
  (2, 1413)	0.34248996336314463
  (2, 2677)	0.3651392719264638
  (2, 132)	0.3508428162103816
  (2, 1415)	0.44257776938705823
  (2, 2679)	0.4265078292200801
  (2, 3650)	0.44257776938705823
  (3, 1548)	0.06975312045743971
  :	:
  (6088, 1772)	0.12272568300475559
  (6089, 1548)	0.1280888056062667
  (6089, 3647)	0.12972563171604934
  (6089, 1556)	0.1305447094165209
  (6089, 367)	0.15294819199412527
  (6089, 2190)	0.146413

In [66]:
#Split the data into train and testing
X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf, y, test_size=0.1, random_state=0)

In [67]:
#Print training data
print("\n")
print("Training data\n\n",X_train,"\n",Y_train)
print("\n\n")



Training data

   (0, 1772)	0.09825452986984609
  (0, 1771)	0.06981756537544734
  (0, 1420)	0.37483790135077766
  (0, 846)	0.29321621905272544
  (0, 3474)	0.38227551809034355
  (0, 1303)	0.5780306955746416
  (0, 164)	0.2644347066554074
  (0, 368)	0.09800955401839445
  (0, 2190)	0.093779882055273
  (0, 367)	0.09796514628049825
  (0, 378)	0.3488318571289214
  (0, 1556)	0.14157325958695074
  (0, 3647)	0.1406849853671142
  (0, 1548)	0.13890987851847444
  (1, 448)	0.48204443491999577
  (1, 915)	0.47425317497664016
  (1, 1772)	0.1426484413837215
  (1, 1771)	0.10136292845944533
  (1, 290)	0.463983727875745
  (1, 1246)	0.4470905335031292
  (1, 368)	0.14229277917219277
  (1, 2190)	0.1361520331536332
  (1, 367)	0.14222830688165655
  (1, 761)	0.11447323412450665
  (1, 566)	0.11301427129938067
  :	:
  (5478, 3647)	0.10801310343500804
  (5478, 1548)	0.10665023731855651
  (5479, 3254)	0.325955246316307
  (5479, 2246)	0.320352690303153
  (5479, 1779)	0.20638325475242433
  (5479, 1514)	0.20402361357

In [68]:
#Print testing data
print("Testing data\n\n",X_test)
print("\n\n")

Testing data

   (0, 1166)	0.3782305693402206
  (0, 1165)	0.3377818662230461
  (0, 3425)	0.23642752879213905
  (0, 1658)	0.33448825212764194
  (0, 611)	0.3588742856187612
  (0, 1277)	0.2644875646029164
  (0, 1959)	0.287664964688227
  (0, 761)	0.07801347885850576
  (0, 566)	0.07701919607804796
  (0, 760)	0.07475829558564294
  (0, 132)	0.5076616196943124
  (0, 2136)	0.13506273042779235
  (1, 255)	0.3146643703249196
  (1, 1858)	0.32835963925502193
  (1, 1857)	0.32835963925502193
  (1, 941)	0.2811132443380015
  (1, 3016)	0.3146643703249196
  (1, 254)	0.2701066959936432
  (1, 1305)	0.28685583157722344
  (1, 3812)	0.27297479336486696
  (1, 1199)	0.23569714785391077
  (1, 1196)	0.19717313627203256
  (1, 1026)	0.20588799155177534
  (1, 2473)	0.27297479336486696
  (1, 3015)	0.19036242418874338
  :	:
  (608, 2605)	0.5210873330485095
  (608, 1807)	0.3701915066225424
  (608, 3694)	0.2809267000768328
  (608, 1771)	0.09875550605644591
  (608, 3909)	0.35980483822116593
  (608, 854)	0.2256391603664723

In [69]:
#Build the Decision tree model
clf = DecisionTreeClassifier()

#Fit train and test into the model
clf.fit(X_train, Y_train)

#Predict the result
y_pred = clf.predict(X_test)

In [70]:
#classification report & confusion matrix
print("Confusion Matrix\n",confusion_matrix(Y_test,y_pred))
print("\n")
print("Classification Report\n",classification_report(Y_test,y_pred))
print("\n")
print("Accuracy : ",accuracy_score(Y_test,y_pred)*100)

Confusion Matrix
 [[199  99]
 [ 94 218]]


Classification Report
               precision    recall  f1-score   support

           0       0.68      0.67      0.67       298
           1       0.69      0.70      0.69       312

   micro avg       0.68      0.68      0.68       610
   macro avg       0.68      0.68      0.68       610
weighted avg       0.68      0.68      0.68       610



Accuracy :  68.36065573770492
