In [0]:
from google.colab import files
files.upload()

In [0]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
!kaggle competitions download -c quora-insincere-questions-classification -f train.csv

Downloading train.csv.zip to /content
 86% 47.0M/54.4M [00:00<00:00, 65.4MB/s]
100% 54.4M/54.4M [00:00<00:00, 74.5MB/s]


In [0]:
from zipfile import ZipFile
file_name="train.csv.zip"
with ZipFile(file_name,'r') as zip:
    zip.extractall()
    print('Done')

Done


In [0]:
import pandas as pd
df=pd.read_csv('train.csv')
del df["qid"]
df.head()

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


In [0]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
question_text = df.question_text.str.cat(sep=' ')#function to split text into word
tokens = word_tokenize(question_text)
vocabulary = set(tokens)
print(len(vocabulary))
frequency_dist = nltk.FreqDist(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
308880


In [0]:
from wordcloud import STOPWORDS
stop_words = set(STOPWORDS)
tokens = [w for w in tokens if not w in stop_words]
frequency_dist = nltk.FreqDist(tokens)

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df["question_text"], df["target"], test_size=0.10)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
print(train_vectors.shape, test_vectors.shape)

(1175509, 184347) (130613, 184347)


In [0]:
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,confusion_matrix

## Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_vectors,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
prediction = lr.predict(test_vectors)
print("f1 score {}".format(f1_score(y_test,prediction)))
print("accuracy {}".format(accuracy_score(y_test,prediction)))
print("precision score {}".format(precision_score(y_test,prediction)))
print("recall_score {}".format(recall_score(y_test,prediction)))
cm_custom = confusion_matrix(y_test,prediction)
print(cm_custom)

f1 score 0.5406233249100237
accuracy 0.954070421780374
precision score 0.7104045079492856
recall_score 0.4363411619283066
[[121084   1439]
 [  4560   3530]]




## MultiNomial Naive Bayes Classification

In [0]:
from sklearn.naive_bayes import MultinomialNB
mn = MultinomialNB()
mn.fit(train_vectors,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
prediction = mn.predict(test_vectors)
print("f1 score {}".format(f1_score(y_test,prediction)))
print("accuracy {}".format(accuracy_score(y_test,prediction)))
print("precision score {}".format(precision_score(y_test,prediction)))
print("recall_score {}".format(recall_score(y_test,prediction)))
cm_custom = confusion_matrix(y_test,prediction)
print(cm_custom)

f1 score 0.13401245048104132
accuracy 0.941422369901924
precision score 0.7946308724832215
recall_score 0.07317676143386898
[[122370    153]
 [  7498    592]]


## K nearest neighbours

In [0]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(train_vectors,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [0]:
prediction = neigh.predict(test_vectors)
print("f1 score {}".format(f1_score(y_test,prediction)))
print("accuracy {}".format(accuracy_score(y_test,prediction)))
print("precision score {}".format(precision_score(y_test,prediction)))
print("recall_score {}".format(recall_score(y_test,prediction)))
cm_custom = confusion_matrix(y_test,prediction)
print(cm_custom)

f1 score 0.2114395238575608
accuracy 0.9401514397494889
precision score 0.5822222222222222
recall_score 0.12917539751016888
[[121748    752]
 [  7065   1048]]
