# HW 6, Kartozia

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn import model_selection

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

### Home task: Spam detection

Для заданной тестовой выборки построить модель для предсказания является ли sms сообщение спамом.  
На заданном разбиении (df_train, df_test) ваша модель должна превзойти baseline'ы, приведенные ниже.  

Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества F1


baseline 1: 0.9444      bag of words + Multinomial Naive Bayes  
baseline 2: 0.9490      symbol 3-grams with IDF and l2-norm + Logistic Regression  
baseline 3: 0.9636      text stemming + baseline 2

**baseline 4: 0.9658      text stemming + Logistic Regression (word 3-grams)  + Multinomial Naive Bayes (symbol 3-grams) **

! Your results must be reproducible. Если ваша модель - стохастическая (как например LogisticRegression), то вы явно должны задавать все seed и random_state в параметрах моделей  
! Вы должны использовать df_test только для измерения качества конечной обученной модели. 

In [2]:
#load dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# dataset size
df.shape

(5572, 2)

In [4]:
# class proportions
df.target.value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: target, dtype: float64

In [5]:
# Use  df_train for model training
# Use df_test as  hold-out dataset for your final model perfomance estimation.
# You cannot change  this splitting
# All results must be reproducible
SEED = 1337
df_train, df_test = model_selection.train_test_split(df, test_size=0.4, random_state=SEED, shuffle=True, stratify=df.target)
print('train size %d, test size %d' % (df_train.shape[0], df_test.shape[0]))

train size 3343, test size 2229


In [6]:
#baseline4 aka hw 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import string
import re
import warnings
warnings.filterwarnings('ignore')

stemmer = SnowballStemmer("english")
regex = re.compile('[%s]' % re.escape(string.punctuation))

# basic preprocessing
# make lowercase, remove punctuation and make stemming
def text_process(text):
    text = text.lower()
    text = regex.sub(' ', text)
    text = [stemmer.stem(word) for word in text.split() if word not in stopwords.words('english')]
    return " ".join(text)

df_train['text2'] = df_train['text'].apply(text_process)
df_test['text2'] = df_test['text'].apply(text_process)
df_train.head()

Unnamed: 0,target,text,text2
2175,ham,I'm at work. Please call,work pleas call
4798,ham,I dont know why she.s not getting your messages,dont know get messag
3139,spam,sexy sexy cum and text me im wet and warm and ...,sexi sexi cum text im wet warm readi porn u fu...
683,spam,Hi I'm sue. I am 20 years old and work as a la...,hi sue 20 year old work lapdanc love sex text ...
3468,ham,All day working day:)except saturday and sunday..,day work day except saturday sunday


In [7]:
def find_num(x):
    return len(re.findall('[0-9]{4,1500}',x))

In [9]:
from tqdm import tqdm
from scipy import sparse
df_train['count_num'] = df_train['text2'].apply(find_num)
df_test['count_num'] = df_test['text2'].apply(find_num)


vec_word = TfidfVectorizer(lowercase=True, analyzer='word', ngram_range=(1,3), 
                      min_df=10, # exclude 3-grams appeared in less then 10 messages. 
                      use_idf=True)
X_train_word = vec_word.fit_transform(df_train['text2'])
X_train_word = sparse.csr_matrix(pd.concat([pd.DataFrame(X_train_word.toarray(),columns=vec_word.get_feature_names(),index=df_train.index),df_train['count_num']],axis=1))   # Here's the initialization of the sparse matrix.
X_test_word = vec_word.transform(df_test['text2'])
X_test_word = sparse.csr_matrix(pd.concat([pd.DataFrame(X_test_word.toarray(),columns=vec_word.get_feature_names(),index=df_test.index),df_test['count_num']],axis=1))


vec_char = TfidfVectorizer(lowercase=True, analyzer='char', ngram_range=(3,3), min_df=5, norm='l2',
# exclude 3-grams appeared in less then 10 messages. 
                      use_idf=True)
X_train_char = vec_char.fit_transform(df_train['text2'])
X_train_char = sparse.csr_matrix(pd.concat([pd.DataFrame(X_train_char.toarray(),columns=vec_char.get_feature_names(),index=df_train.index),df_train['count_num']],axis=1))   # Here's the initialization of the sparse matrix.
X_test_char = vec_char.transform(df_test['text2'])
X_test_char = sparse.csr_matrix(pd.concat([pd.DataFrame(X_test_char.toarray(),columns=vec_char.get_feature_names(),index=df_test.index),df_test['count_num']],axis=1))

y_train = (df_train['target'] == 'spam').astype(int)
y_test = (df_test['target'] == 'spam').astype(int)

grid_word = GridSearchCV(LogisticRegression(random_state=SEED), 
                   param_grid={'C': np.logspace(-3,5,20), 
                               'penalty': ['l1','l2'],
                               'tol':np.logspace(-6,-4,3)}, 
                    scoring='f1', n_jobs=-1, cv=5, verbose=True, return_train_score=True)
grid_word.fit(X_train_word, y_train)
print(grid_word.best_params_)
print(grid_word.best_score_)
model_word = grid_word.best_estimator_
y_pred_word = model_word.predict_proba(X_train_word)[:,1]


grid_char = GridSearchCV(LogisticRegression(random_state=SEED), 
                   param_grid={'C': np.logspace(-3,5,20), 
                               'penalty': ['l1','l2'],
                               'tol':np.logspace(-6,-4,3)}, 
                    scoring='f1', n_jobs=-1, cv=5, verbose=True, return_train_score=True)
grid_char.fit(X_train_char, y_train)
print(grid_char.best_params_)
print(grid_char.best_score_)
model_char = grid_char.best_estimator_
y_pred_char = model_char.predict_proba(X_train_char)[:,1]

score_overall = []
for i in tqdm(range(100)):
    y_result = y_pred_word * i/100 + y_pred_char * (1 - i/100)
    score = []
    for threshold in range(1000):
        y_pred = (y_result > (threshold/1000)).astype(int)
        score.append(metrics.f1_score(y_train, y_pred))
    score_overall.append([max(score),np.argmax(score) / 1000])

Fitting 5 folds for each of 120 candidates, totalling 600 fits
{'C': 16.237767391887211, 'penalty': 'l2', 'tol': 9.9999999999999995e-07}
0.945584092368
Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 593 out of 600 | elapsed:    9.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    9.2s finished
  0%|          | 0/100 [00:00<?, ?it/s]

{'C': 100000.0, 'penalty': 'l2', 'tol': 9.9999999999999995e-07}
0.944358465369


100%|██████████| 100/100 [01:33<00:00,  1.07it/s]


In [13]:
score_overall[40]

[1.0, 0.22500000000000001]

In [58]:
y_pred_prob = model_word.predict_proba(X_test_word)[:,1] * 0.50 + model_char.predict_proba(X_test_char)[:,1] * (1 - 0.50)
y_pred = (y_pred_prob > 0.28).astype(int)
metrics.f1_score(y_test, y_pred)

0.96587030716723543

In [11]:
# baseline 1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# build binary feature matrix from BoW model
vec = TfidfVectorizer(lowercase=True, analyzer='word', ngram_range=(1,1), norm=None, use_idf=False, binary=True)
X = vec.fit_transform(df_train.text)
print('feature matrix shape', X.shape)

# encode class labels
label_enc = LabelEncoder().fit(df_train.target)
y_train = label_enc.transform(df_train.target)

# fit our prediction model
model = MultinomialNB(alpha=1.0)
model.fit(X, y_train)
print('train', metrics.f1_score(y_train, model.predict(X)))

# perfomance on test dataset
X_test = vec.transform(df_test.text)
y_pred = model.predict(X_test)
y_test = label_enc.transform(df_test.target)
print('test', metrics.f1_score(y_test, model.predict(X_test)))

feature matrix shape (3343, 6657)
train 0.979775280899
test 0.948096885813


In [12]:
# baseline 2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


# build feature matrix from 3-grams
# with l2-normalization and smoothed idf - look in docs for more details
vec = TfidfVectorizer(lowercase=True, analyzer='char', ngram_range=(3,3), norm='l2', use_idf=True, smooth_idf=True)
X = vec.fit_transform(df_train.text)
print('feature matrix shape', X.shape)

#encode labels
label_enc = LabelEncoder().fit(df_train.target)
y_train = label_enc.transform(df_train.target)

# Logistic Regression classifier has several hyperparams
# Optimize C (coeff before regularizer) and penalty (type of regularizer) using crossvalidation with grid search
# Basically it means it will look over every combination of hyperparams in the specified region (or lattice)
# and return the best one. 
# Look in docs  for more details
grid = GridSearchCV(LogisticRegression(random_state=SEED), # our model 
                   param_grid={'C': np.logspace(0,5,20), # C in lattice [10^0...10^5]
                               'penalty': ['l1', 'l2']}, 
                    scoring='f1', # our perfomance measure 
                    n_jobs=-1, # multithread 
                    cv=5, # 5-fold stratified cross-validation 
                    verbose=True, return_train_score=True)

grid.fit(X, y_train)
print('best params', grid.best_params_)
print('best estimator', grid.best_score_)
model = grid.best_estimator_

# grid.best_estimator_ is already fitted on whole train dataset
print('train', metrics.f1_score(y_train, model.predict(X)))

# perfomance on test dataset
X_test = vec.transform(df_test.text)
y_pred = model.predict(X_test)
y_test = label_enc.transform(df_test.target)
print('test', metrics.f1_score(y_test, model.predict(X_test)))

feature matrix shape (3343, 11749)
Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    8.2s finished


best params {'C': 29763.514416313192, 'penalty': 'l2'}
best estimator 0.944847228112
train 1.0
test 0.949211908932


In [None]:
# baseline 3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import string
import re


stemmer = SnowballStemmer("english")
regex = re.compile('[%s]' % re.escape(string.punctuation))

# basic preprocessing
# make lowercase, remove punctuation and make stemming
def text_process(text):
    text = text.lower()
    text = regex.sub(' ', text)
    text = [stemmer.stem(word) for word in text.split() if word not in stopwords.words('english')]
    return " ".join(text)


df_train['text2'] = df_train['text'].apply(text_process)
df_test['text2'] = df_test['text'].apply(text_process)
df_train.head()

# build feature matrix from 3-grams
vec = TfidfVectorizer(lowercase=True, analyzer='char', ngram_range=(3,3), 
                      min_df=10, # exclude 3-grams appeared in less then 10 messages. 
                      use_idf=True)
X = vec.fit_transform(df_train.text2)
print('feature matrix shape', X.shape)

#encode labels
label_enc = LabelEncoder().fit(df_train.target)
y_train = label_enc.transform(df_train.target)

# optimize  hyperparams
grid = GridSearchCV(LogisticRegression(random_state=SEED), 
                   param_grid={'C': np.logspace(0,5,10), 
                               'penalty': ['l1', 'l2']}, 
                    scoring='f1', n_jobs=-1, cv=5, verbose=True, return_train_score=True)

grid.fit(X, y_train)
print(grid.best_params_)
print(grid.best_score_)
model = grid.best_estimator_
print('train', metrics.f1_score(y_train, model.predict(X)))

X_test = vec.transform(df_test.text2)
y_pred = model.predict(X_test)
y_test = label_enc.transform(df_test.target)
print('test', metrics.f1_score(y_test, model.predict(X_test)))