Sklearn provides facilities to extract numerical features from a text document by tokenizing, counting and normalising. CountVectorizer performs the task of tokenizing and counting, while TfidfTransformer normalizes the data. TfidfVectorizer, on the other hand, performs all three operations, thereby streamlining the process of natural language processing.

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection  import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [6]:
disaster_tweet = pd.read_csv("disastor_tweets.csv", sep=",")
disaster_tweet

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...,...,...,...
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0


In [7]:
vectorizer = CountVectorizer(binary=True, stop_words="english")
X = vectorizer.fit_transform(disaster_tweet["text"])
disaster_tweet_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
disaster_tweet_df

Unnamed: 0,00,000,00009,000ft,000kg,007,0089,00am,00pm,00u3qm1ucs,...,𝐲𝐨𝐮,𝒋𝒊𝒍𝒍,𝒗𝒂𝒍𝒆𝒏𝒕𝒊𝒏𝒆,𝗖𝗢𝗥𝗧,𝗘𝗻𝗲𝗿𝗴𝘆,𝗚𝗶𝘃𝗲𝗮𝘄𝗮𝘆,𝗜𝗻𝗰,𝗠𝗔𝗬,𝗣𝗲𝘁𝗿𝗼𝘁𝗲𝗾,𝗳𝗼𝗿
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
train_x, test_x, train_y, test_y = train_test_split(disaster_tweet_df, disaster_tweet["target"], test_size=0.3, random_state=123)

df_train_x = pd.DataFrame(train_x, columns=disaster_tweet_df.columns)
df_test_x = pd.DataFrame(test_x, columns=disaster_tweet_df.columns)
df_train_y = pd.DataFrame(train_y, columns=["target"])
df_test_y = pd.DataFrame(test_y, columns=["target"])


In [9]:
print ("shapes")
print (df_train_x.shape)
print (df_test_x.shape)
print (df_train_y.shape)
print (df_test_y.shape)

shapes
(7959, 26821)
(3411, 26821)
(7959, 1)
(3411, 1)


In [10]:
print ("class counts")
print (disaster_tweet["target"].value_counts())
print (df_train_y["target"].value_counts())
print (df_test_y["target"].value_counts())

class counts
0    9256
1    2114
Name: target, dtype: int64
0    6478
1    1481
Name: target, dtype: int64
0    2778
1     633
Name: target, dtype: int64


Different scores ran by different modules. 

In [11]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

f1:0.5719207579672696
accuracy:0.8542949281735561
precision:0.5244865718799369
recall:0.6287878787878788


In [13]:
clf = LinearSVC()
clf = clf.fit(df_train_x, train_y)
pred_y = clf.predict(df_test_x)

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

f1:0.6572934973637962
accuracy:0.8856640281442393
precision:0.5908372827804107
recall:0.7405940594059406


In [12]:
clf = MultinomialNB()
clf = clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

f1:0.6145069274653627
accuracy:0.8613309879800645
precision:0.5955766192733017
recall:0.6346801346801347


In [11]:
clf = LogisticRegression()
clf = clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

f1:0.6556420233463035
accuracy:0.8962181178540017
precision:0.5323854660347551
recall:0.8531645569620253
