In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import os
import feather
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [3]:
data = pd.read_csv('train.csv/train.csv')

In [35]:
data.shape

(1306122, 3)

In [8]:
train_raw = feather.read_dataframe('tmp/train-raw')

In [34]:
train_raw.shape

(1306122, 3)

In [4]:
train_raw.head().T

Unnamed: 0,0,1,2,3,4
qid,00002165364db923c7e6,000032939017120e6e44,0000412ca6e4628ce2cf,000042bf85aa498cd78e,0000455dfa3e01eae3af
question_text,How did Quebec nationalists see their province...,"Do you have an adopted dog, how would you enco...",Why does velocity affect time? Does velocity a...,How did Otto von Guericke used the Magdeburg h...,Can I convert montra helicon D to a mountain b...
target,0,0,0,0,0


In [5]:
train_raw['length'] = train_raw['question_text'].apply(len)

In [6]:
train_raw.head()

Unnamed: 0,qid,question_text,target,length
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,72
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,81
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,67
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,57
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,77


### tokenizer

In [10]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

### split and fit transform

In [46]:
def split_vals(a,n): return a[:n], a[n:]
n_valid = 74574
n_trn = len(question_text)-n_valid
X_train, X_valid = split_vals(question_text, n_trn)
y_train, y_valid = split_vals(target, n_trn)


In [47]:
veczr = CountVectorizer(tokenizer=tokenize)

In [48]:
X_text = veczr.fit_transform(X_train)
val_text = veczr.transform(X_valid)

In [15]:
X_text

<1246122x193682 sparse matrix of type '<class 'numpy.int64'>'
	with 17095656 stored elements in Compressed Sparse Row format>

In [16]:
val_text

<60000x193682 sparse matrix of type '<class 'numpy.int64'>'
	with 817854 stored elements in Compressed Sparse Row format>

# Naive Bayes

In [39]:
def prior(y_i):
    p=X[y==y_i].sum(0)
    return (p+1)/((y==y_i).sum()+1)

In [40]:
def f1_score_all(y_true, y_pred):
    recall =(y_true[y_pred==1]==1).sum()/(y_true==1).sum()   # true positive / ground truth positive
    precision = (y_true[y_pred==1]==1).sum() / (y_pred==1).sum() #true positive / predtictive positive
    f1 = 2/(1/recall + 1/precision)
    return f1,recall,precision

In [49]:
X = X_text
y = y_train

r = np.log(prior(1)/prior(0))
b = np.log((y==1).mean()/(y==0).mean())

In [128]:
X[y_train==1]

<77101x193682 sparse matrix of type '<class 'numpy.int64'>'
	with 1389113 stored elements in Compressed Sparse Row format>

In [50]:
pre_preds = val_text @ r.T + b
preds = [1 if i >10 else 0 for i in pre_preds]
(preds == y_valid).mean()

0.9351650709362512

In [51]:
preds_array = np.array(preds)

In [52]:
f1,re,pre = f1_score_all(preds_array,y_valid)

In [53]:
f1,re,pre

(0.49121330106282224, 0.47837671654027464, 0.504757785467128)

In [54]:
confusion_matrix(y_valid,preds_array)

array([[67405,  2545],
       [ 2290,  2334]])

# Logistic Regression

In [55]:
m = LogisticRegression(C=1e8, dual=True)
m.fit(X, y)
preds = m.predict(val_text)
(preds==y_valid).mean()

0.9135489580819052

In [56]:
f1,re,pre = f1_score_all(preds,y_valid)

In [57]:
f1,re,pre

(0.5175484546883184, 0.3956974482206202, 0.7478373702422145)

In [58]:
confusion_matrix(y_valid,preds)

array([[64669,  5281],
       [ 1166,  3458]])

In [62]:
m = LogisticRegression(C=1e8, dual=True)
m.fit(X.sign(), y)
preds = m.predict(val_text.sign())
(preds==y_valid).mean()

0.8756666666666667

In [147]:
f1,re,pre = f1_score(preds_array,y_valid)

In [63]:
f1,re,pre

(0.5372477064220184, 0.4917151813703538, 0.592073335130763)

In [54]:
confusion_matrix(y_valid,preds)

array([[54700,  1591],
       [ 1210,  2499]])

# train with NB-naive

In [59]:
veczr =  CountVectorizer(tokenizer = tokenize  ,ngram_range=(1,3), max_features=700000)
trn_term_doc = veczr.fit_transform(X_train)
val_term_doc = veczr.transform(X_valid)

In [65]:
val_term_doc

<60000x700000 sparse matrix of type '<class 'numpy.int64'>'
	with 1760899 stored elements in Compressed Sparse Row format>

In [66]:
trn_term_doc

<1246122x700000 sparse matrix of type '<class 'numpy.int64'>'
	with 37244638 stored elements in Compressed Sparse Row format>

In [60]:
y=y_train
X=trn_term_doc.sign()
X_val = val_term_doc.sign()

In [61]:
r = np.log(prior(1) / prior(0))
b = np.log((y==1).mean() / (y==0).mean())

In [68]:
x_nb = X.multiply(r)
m = LogisticRegression(dual=True, C=0.1)
m.fit(x_nb, y);

val_x_nb = X_val.multiply(r)
preds = m.predict(val_x_nb)
(preds.T==y_valid).mean()

0.9536165419583232

In [63]:
f1,re,pre = f1_score_all(preds,y_valid)

In [64]:
f1,re,pre

(0.6360906926182834, 0.61012909632572, 0.6643598615916955)

In [65]:
confusion_matrix(y_valid,preds)

array([[67987,  1963],
       [ 1552,  3072]])