In [94]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [77]:
df = pd.read_csv('./datasets/smsspamcollection/smsspamcollection/SMSSpamCollection', sep ='\t', names=['label','message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [79]:
df['message len'] = df['message'].apply(len)
df.corr()

Unnamed: 0,label,message len
label,1.0,0.383587
message len,0.383587,1.0


In [78]:
df['label'] = df['label'].map({'spam':1, 'ham':0})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
1 - df['label'].mean() #baseline accuracy

0.8659368269921034

In [15]:
corpus = ['See spot','See spot run', 'The baby wolf looked for the wolves']
cv = CountVectorizer()
dtm = cv.fit_transform(corpus)

In [12]:
dtm.toarray()

array([[0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 2, 1, 1]], dtype=int64)

In [14]:
dtm_df = pd.DataFrame(dtm.toarray() , columns=cv.get_feature_names())
dtm_df

Unnamed: 0,baby,for,looked,run,see,spot,the,wolf,wolves
0,0,0,0,0,1,1,0,0,0
1,0,0,0,1,1,1,0,0,0
2,1,1,1,0,0,0,2,1,1


In [117]:
words = ['for','and','the','you','to',"don't"]

#X_dtm = vect.fit_transform(df['message'])
#dtm_df = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
#dtm_df.head()
vect = CountVectorizer() #can do Tfidvectorizer 
lr = LogisticRegression()
pipe = Pipeline([('vect',vect),('lr',lr)])
params = {
    'vect__ngram_range':[(1,1),(1,2)],
    'lr__penalty':['l2'],
    'vect__lowercase':[True,False],
    'vect__max_df':[.2],
    'vect__min_df':[0,.1],
    'lr__solver':['newton-cg','liblinear','lbfgs','sag'],
    'lr__max_iter' :range(100,150)
    
   
}
gs = GridSearchCV(pipe, param_grid=params, cv = 3)
gs.fit(df['message'], df['label'])
gs.best_score_

0.98330940416367552

In [111]:
gs.best_params_

{'lr__max_iter': 1000,
 'lr__penalty': 'l2',
 'lr__solver': 'sag',
 'vect__lowercase': True,
 'vect__max_df': 0.2,
 'vect__min_df': 0,
 'vect__ngram_range': (1, 1)}

In [34]:
#dtm_df['label'] = df['label']
#dtm_df.shape

In [33]:
#dtm_df.groupby('label').mean().iloc[0, :].sort_values(ascending = False)

In [119]:
amazon = './datasets/sentiment/sentiment/amazon_cells_labelled.txt'

In [166]:
amazon_df = pd.read_csv(amazon, sep='\t', names=['Review','Liked'])
amazon_df

Unnamed: 0,Review,Liked
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [167]:
amazon_df.Liked.mean() #baseline accuracy

0.5

In [153]:
ama = conv.fit_transform(amazon_df['Review'])
ama_df = pd.DataFrame(ama.toarray(), columns=conv.get_feature_names())
ama_df.columns

Index([u'10', u'100', u'11', u'12', u'13', u'15', u'15g', u'18', u'20',
       u'2000',
       ...
       u'wrongly', u'year', u'years', u'yell', u'yes', u'yet', u'you', u'your',
       u'z500a', u'zero'],
      dtype='object', length=1847)

In [176]:
conv = TfidfVectorizer()
lr = LogisticRegression()
pipe = Pipeline([('conv',conv),('lr',lr)])

In [177]:
params = {
    'conv__ngram_range':[(1,1),(1,2),(1,3),(2,3)],
    'lr__penalty':['l2'],
    'conv__lowercase':[True,False],
    'conv__max_df':[.1,.2],
    'conv__stop_words':[None,'english'],
    'conv__strip_accents':['ascii','unicode',None],
    'conv__binary':[False],
    'lr__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag'],
    
    
}
gs = GridSearchCV(pipe, param_grid=params, cv = 10)
gs.fit(amazon_df['Review'], amazon_df['Liked'])
gs.best_score_

0.82799999999999996

In [178]:
gs.best_params_

{'conv__binary': False,
 'conv__lowercase': True,
 'conv__max_df': 0.2,
 'conv__ngram_range': (1, 3),
 'conv__stop_words': None,
 'conv__strip_accents': 'ascii',
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}