In [26]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [27]:
df = pd.read_csv('./datasets/smsspamcollection/smsspamcollection/SMSSpamCollection',sep='\t', names = ['label','message'])
df['label'] = df['label'].map({'spam':1, 'ham':0})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
df.loc[df['label']==1,:]

Unnamed: 0,label,message
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."
12,1,URGENT! You have won a 1 week FREE membership ...
15,1,"XXXMobileMovieClub: To use your credit, click ..."
19,1,England v Macedonia - dont miss the goals/team...
34,1,Thanks for your subscription to Ringtone UK yo...
42,1,07732584351 - Rodger Burns - MSG = We tried to...


In [29]:
df['message'].str.contains('[0-9]{3+}')

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
30      False
31      False
32      False
33      False
34      False
35      False
36      False
37      False
38      False
39      False
40      False
41      False
42      False
43      False
44      False
45      False
46      False
47      False
48      False
49      False
50      False
51      False
52      False
53      False
54      False
55      False
56      False
57      False
58      False
59      False
60      False
61      False
62      False
63      False
64      False
65      False
66      False
67      False
68      False
69      False
70      False
71    

In [30]:
df.loc[df['message'].str.contains('[0-9]{3,}'), 'label'].shape

(684L,)

In [31]:
df.loc[df['message'].str.contains('[A-Z]{3,}'), 'label'].shape #take mean then shape to compare how strong the feature is compare to the accuracy 

(895L,)

In [32]:
df.loc[df['message'].str.contains('www'), 'label'].mean()

0.9696969696969697

In [33]:
df['message'].apply(len)

0       111
1        29
2       155
3        49
4        61
5       148
6        77
7       160
8       158
9       154
10      109
11      136
12      156
13      196
14       35
15      149
16       26
17       81
18       58
19      156
20       41
21       49
22       53
23       88
24       57
25      144
26       30
27      134
28       75
29       64
30      130
31      189
32       29
33       84
34      159
35      123
36       47
37       28
38       27
39      155
40       82
41      142
42      172
43       19
44       72
45       32
46       45
47       31
48       67
49      148
50       58
51      124
52       80
53      289
54      120
55       76
56      161
57       34
58       22
59       40
60      108
61       48
62       25
63       56
64      110
65      153
66      122
67      161
68       78
69       34
70       46
71       29
72       45
73       42
74       20
75       43
76       73
77       50
78       42
79       76
80       22
81       32
82       32
83  

In [34]:
def message_length(df):
    return df['message'].apply(len).to_frame()
message_length_tf = FunctionTransformer(message_length, validate=False)


In [35]:
def has_number(df):
    return df['message'].str.contains('[0-9]{3,}').astype(int).to_frame()
has_number_tf = FunctionTransformer(has_number, validate=False)

In [36]:
def has_all_caps(df):
    return df['message'].str.contains('[A-Z]{3,}').astype(int).to_frame()
has_all_caps_tf = FunctionTransformer(has_all_caps, validate=False)

In [37]:
def has_www(df):
    return df['message'].str.contains('www').astype(int).to_frame()
has_www_tf = FunctionTransformer(has_www, validate=False)

In [38]:
def pass_message_column(df):
    return df['message']
pass_message_column_tf = FunctionTransformer(pass_message_column, validate=False)

In [39]:
def change_from_sparse_to_array(sparse_matrix):
    return sparse_matrix.toarray()
change_from_sparse_to_array_tf = FunctionTransformer(change_from_sparse_to_array, validate=False)

In [40]:
vect = CountVectorizer()

In [41]:
vector_pipeline = Pipeline(
    [
        ('pass_message_column',pass_message_column_tf),
        ('vect',vect),
        ('change_from_sparse_to_array',change_from_sparse_to_array_tf)
        
    ]
)


In [42]:
fu = FeatureUnion(
    [
    ('message_length_tf',message_length_tf),
    ('has_number_tf',has_number_tf),
    ('has_all_caps_tf',has_all_caps_tf),
    ('has_www_tf', has_www_tf),
    ('vector_pipeline',vector_pipeline)
    ]
)

In [43]:
fu.fit_transform(df)

array([[111,   0,   0, ...,   0,   0,   0],
       [ 29,   0,   0, ...,   0,   0,   0],
       [155,   1,   0, ...,   0,   0,   0],
       ..., 
       [ 57,   0,   0, ...,   0,   0,   0],
       [125,   0,   0, ...,   0,   0,   0],
       [ 26,   0,   0, ...,   0,   0,   0]], dtype=int64)

In [44]:
lr = LogisticRegression()
ss = StandardScaler()

In [45]:
pipe = Pipeline(
    [
        ('fu',fu),
        ('ss',ss),
        ('lr',lr)
        
    ]
)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(df, df['label'])

In [47]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('fu', FeatureUnion(n_jobs=1,
       transformer_list=[('message_length_tf', FunctionTransformer(accept_sparse=False,
          func=<function message_length at 0x0000000025BE7E48>,
          inv_kw_args=None, inverse_func=None, kw_args=None, pass_y=False,
          validate=False)), ('has_nu...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [48]:
pipe.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
params = {
    'fu__vector_pipeline__vect__stop_words':[None,'english']
}
gs = GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train)
print gs.best_score_
print gs.best_params_
gs.score(X_test,y_test)

0.992581957406
{'fu__vector_pipeline__vect__stop_words': 'english'}


0.98923187365398424

In [50]:
import spacy


In [51]:
nlp = spacy.load('en')