In [1]:
import numpy as np
import pandas as pd

### 2

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly.
# NOTE(review): default CSV quoting is active here; messages containing stray
# double quotes could be merged across lines -- verify the row count.
data = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=['label', 'msg'],
)
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 3

In [3]:
# Raw message texts are the model inputs.
X = data['msg'].values
# Binary target: 1 where the label is exactly 'spam', 0 for anything else.
y = (data['label'] == 'spam').astype('int64').values

### 4

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from all messages and build the
# document-term count matrix in a single call.
vect = CountVectorizer()
X_transformed = vect.fit_transform(X)

### 5

In [10]:
from sklearn.model_selection import cross_val_score as cv_score
from sklearn.linear_model import LogisticRegression as LR

# F1 of a default logistic regression on the count features, evaluated with
# 10-fold cross-validation; the cell displays the mean over folds.
fold_scores = cv_score(LR(), X_transformed, y, scoring='f1', cv=10)
fold_scores.mean()

0.9326402983610631

### 6

In [11]:
# Hand-written messages used as a sanity check of the fitted classifier.
X_test = [
    "FreeMsg:	Txt:	CALL	to	No:	86888	&	claim	your	reward	of	3	hours	talk	time	to	use	from	your	phone	now!	Subscribe6GB",
    "FreeMsg:	Txt:	claim	your	reward	of	3	hours	talk	time",
    "Have	you	visited	the	last	lecture	on	physics?",
    "Have	you	visited	the	last	lecture	on	physics?	Just	buy	this	book	and	you	will	have	all	materials!	Only	99$",
    "Only	99$",
]

# Train on the full corpus, then vectorize the new messages with the already
# fitted vectorizer (transform only, so the vocabulary is not re-learned).
clf = LR().fit(X_transformed, y)
X_test_transformed = vect.transform(X_test)
# Single-argument print(...) behaves identically on Python 2 and Python 3,
# unlike the Python-2-only print statement.
print(' '.join(map(str, clf.predict(X_test_transformed))))

1 1 0 0 0


### 7

In [13]:
# Mean 10-fold F1 of logistic regression on count features built from
# bigrams only, trigrams only, and 1- to 3-grams (in that order).
results = []
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    ngram_features = CountVectorizer(ngram_range=ngram_range).fit_transform(X)
    results.append(cv_score(LR(), ngram_features, y, scoring='f1', cv=10).mean())
# Single-argument print(...) works on both Python 2 and Python 3.
print(' '.join('{0:.2f}'.format(score) for score in results))

0.82 0.73 0.93


### 8

In [14]:
from sklearn.naive_bayes import MultinomialNB as NB

# Mean 10-fold F1 of multinomial naive Bayes on count features built from
# bigrams only, trigrams only, and 1- to 3-grams (in that order).
results = []
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    ngram_features = CountVectorizer(ngram_range=ngram_range).fit_transform(X)
    results.append(cv_score(NB(), ngram_features, y, scoring='f1', cv=10).mean())
# Single-argument print(...) works on both Python 2 and Python 3.
print(' '.join('{0:.2f}'.format(score) for score in results))

0.65 0.38 0.89


### 9

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same 10-fold F1 evaluation, but with TF-IDF weighted features instead of
# raw counts; the cell displays the mean over folds.
tfidf_features = TfidfVectorizer().fit_transform(X)
cv_score(LR(), tfidf_features, y, scoring='f1', cv=10).mean()

0.85285995541724557

### 10

In [20]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# TF-IDF features feeding a logistic regression. The solver is pinned to
# liblinear because the grid below searches both 'l1' and 'l2' penalties:
# liblinear supports both, whereas the lbfgs default of newer scikit-learn
# releases rejects 'l1'.
pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LR(solver='liblinear'))
    ])

# Hyper-parameters to search: n-gram span, binary vs. count term frequency,
# document-frequency cut-offs, and the regularization penalty.
params_grid = [{
        'vect__ngram_range': [(1,1), (1,2), (1,3)],
        'vect__binary': [True, False],
        'vect__min_df': [1, 0.05, 0.1],
        'vect__max_df': [0.3, 0.4, 0.5],
        'clf__penalty': ['l1', 'l2']
    }]

# Exhaustive search scored by F1 with 10-fold cross-validation; the vectorizer
# is refit inside every fold because it is part of the pipeline.
grid_search = GridSearchCV(pipeline, params_grid, scoring='f1', cv=10)
grid_search.fit(X, y)

CPU times: user 8min 12s, sys: 2.33 s, total: 8min 14s
Wall time: 8min 14s




In [25]:
# Report the best mean cross-validated F1 and the parameters that achieved it.
# Single-argument print(...) works on both Python 2 and Python 3.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.87975762722
{'vect__ngram_range': (1, 1), 'clf__penalty': 'l2', 'vect__binary': False, 'vect__min_df': 1, 'vect__max_df': 0.3}


In [33]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Raw count features feeding a logistic regression. The solver is pinned to
# liblinear because the grid below searches both 'l1' and 'l2' penalties:
# liblinear supports both, whereas the lbfgs default of newer scikit-learn
# releases rejects 'l1'.
pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', LR(solver='liblinear'))
    ])

# Hyper-parameters to search: n-gram span, binary vs. count features, the
# upper document-frequency cut-off, and the regularization penalty.
params_grid = [{
        'vect__ngram_range': [(1,1), (1,2), (1,3)],
        'vect__binary': [True, False],
        'vect__max_df': [0.1, 0.2],
        'clf__penalty': ['l1', 'l2']
    }]

# Exhaustive search scored by F1 with 10-fold cross-validation.
grid_search = GridSearchCV(pipeline, params_grid, scoring='f1', cv=10)
grid_search.fit(X, y)

CPU times: user 1min 51s, sys: 316 ms, total: 1min 52s
Wall time: 1min 51s


In [34]:
# Report the best mean cross-validated F1 and the parameters that achieved it.
# Single-argument print(...) works on both Python 2 and Python 3.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.935834740277
{'vect__ngram_range': (1, 2), 'clf__penalty': 'l2', 'vect__binary': False, 'vect__max_df': 0.2}


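Actually, it makes no difference here whether CountVectorizer is fit once on the whole dataset before cross-validation or refit inside the pipeline on each fold; the scores are identical: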

In [32]:
# First line: vectorizer fitted once on all messages before cross-validation.
# Second line: vectorizer refit within each fold as part of the pipeline.
# Single-argument print(...) works on both Python 2 and Python 3.
print(cv_score(LR(), X_transformed, y, scoring='f1', cv=10).mean())
print(cv_score(pipeline, X, y, scoring='f1', cv=10).mean())

0.932640298361
0.932640298361


### 11

Линейные и байесовские модели неплохо справляются с классификацией текстов. На данной выборке линейная модель показала себя несколько лучше. Учёт биграмм и триграмм может быть полезен для модели.