# TFIDF

In [1]:
# IPython setting: switch the tab completer into its "greedy" mode
%config IPCompleter.greedy=True

### Import packages

In [2]:
import numpy as np  
import pandas as pd 
## nltk
from nltk.tokenize import RegexpTokenizer 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
## sklearn
from sklearn.metrics import average_precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
## classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
## pipeline


### Data

In [3]:
# Read the training CSV that sits next to this notebook
train = pd.read_csv('./emails.train.csv')
# Handle on the 'text' column of the training frame
text = train['text']

### Define methods 

In [4]:
# --- Text preprocessing tools shared by the cells below ---

# Tokenizer: keeps runs of word characters that contain no digits
# (letters and underscores), so numbers are dropped
re_tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b')

# Stop words: NLTK's English list plus two extra tokens ('_' and 'subject')
extras = {'_', 'subject'}
mystopwords = set(stopwords.words('english')) | extras

# Word normalizers
ps = PorterStemmer()  # Porter stemmer
# stemmer = SnowballStemmer("english")  # alternative stemmer, currently disabled
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer used by lemma()

### Lemmatizing words

In [24]:
## First, lemmatize and create list of words
def lemma(df, lemmatize=None):
    """Lemmatize every whitespace-separated word of ``df['text']`` in place.

    Each entry of the ``'text'`` column is split on whitespace, every word is
    passed through ``lemmatize``, and the results are re-joined with single
    spaces. The column is overwritten on ``df`` itself; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. Any index is accepted.
    lemmatize : callable, optional
        Function mapping one word to its normalized form. Defaults to the
        notebook-level ``lemmatizer.lemmatize``.

    Raises
    ------
    AttributeError
        If a ``'text'`` entry is not a string (e.g. a missing value).
    """
    if lemmatize is None:
        # Resolved at call time so the notebook-level lemmatizer is only
        # required when no custom callable is supplied.
        lemmatize = lemmatizer.lemmatize

    # Assign the whole column positionally. Writing row by row with
    # df.loc[i, ...] and an enumerate counter only works for a default
    # 0..n-1 index; on any other index it overwrites the wrong rows or
    # appends new ones.
    df['text'] = [
        ' '.join(lemmatize(word) for word in line.split())
        for line in df['text']
    ]


# Rewrite train['text'] in place with its lemmatized form (lemma returns nothing)
lemma(train)

### Split set for cross validation

In [38]:
## data
# Reload both CSVs so this cell always starts from the text on disk,
# then lemmatize each frame's 'text' column in place.
train = pd.read_csv('./emails.train.csv')
test  = pd.read_csv('./emails.test.csv')

lemma(train)
lemma(test)


# Hold out 25% of the training rows for validation. The seed is fixed so the
# same rows are held out on every Restart & Run All.
subtrain_X, subval_X = train_test_split(train, test_size=0.25, random_state=42)

# Labels ('spam' column) of the full training frame and of the test frame
Y_train = train['spam']
Y_test = test['spam']

### Tfidf and feature extraction

In [29]:
# Learn the TF-IDF vocabulary from the training text only (capped at 2000
# terms), then project both frames into that same feature space.
vectorizer = TfidfVectorizer(
    stop_words=sorted(mystopwords),   # scikit-learn expects a list here, not a set
    tokenizer=re_tokenizer.tokenize,
    token_pattern=None,               # unused once a tokenizer is given; None avoids the warning
    max_features=2000,
)
vectorizer.fit(train['text'])

# .toarray() returns plain ndarrays; .todense() returns np.matrix, which
# recent scikit-learn estimators refuse as input.
X_train = vectorizer.transform(train['text']).toarray() ## features
X_test = vectorizer.transform(test['text']).toarray() ## features


### Model 1: KNN Classifier

In [30]:
## Grid search over the KNN hyper-parameters

knn = KNeighborsClassifier()

# Candidate neighbour counts and Minkowski powers (1 or 2)
params = dict(n_neighbors=[1, 2, 5], p=[1, 2])

# n_jobs=-1 runs the candidate fits in parallel
grid_search = GridSearchCV(estimator=knn, param_grid=params, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Winning combination and its cross-validated score
print(grid_search.best_params_)
print(grid_search.best_score_)


{'n_neighbors': 5, 'p': 2}
0.96891320567


#### Best parameters

In [9]:
# Take the best-scoring estimator found by the grid search
estimator = grid_search.best_estimator_ 

#### Prediction

In [10]:
# Predict a label for every row of the test feature matrix
Y_pred = estimator.predict(X_test)

#### Write predictions to CSV

In [11]:
# Save the KNN predictions: one row per test id with its predicted label
pd.DataFrame(
    dict(id=test['id'], spam=Y_pred)
).to_csv('predictions.knn.csv', index=False)

### Model 2 Decision Tree

In [31]:
# Single decision tree on the TF-IDF features. The seed pins the tree's
# internal randomness so the printed accuracy is the same on every run.
dtree = tree.DecisionTreeClassifier(random_state=42)

dtree.fit(X_train, Y_train)

Y_pred = dtree.predict(X_test)

# Accuracy of the predictions against the test labels
print(accuracy_score(Y_test, Y_pred))


# Save the predictions: one row per test id with its predicted label
pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('predictions.dtree.csv', index=False)



0.946104276508


### Model 3 Random Forest Classifier

In [32]:
## First, grid search to look for the optimal parameters

# Seed the forest so the grid-search scores and the chosen parameters are
# the same on every run.
rforest = RandomForestClassifier(random_state=42)
params = {
    'n_estimators': [10, 20, 50],
    'max_depth': [1, 2, 10]
}

# n_jobs=-1 runs the candidate fits in parallel
grid_search = GridSearchCV(rforest, params, n_jobs=-1)

grid_search.fit(X_train, Y_train)

# Winning combination and its cross-validated score
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 10, 'n_estimators': 20}
0.922904750062


#### Best parameters

In [14]:
# Take the best-scoring estimator found by the grid search
estimator = grid_search.best_estimator_ 

#### Prediction

In [15]:
# Predict a label for every row of the test feature matrix
Y_pred = estimator.predict(X_test)

#### Write predictions to CSV

In [16]:
# Save the random-forest predictions: one row per test id with its predicted label
pd.DataFrame(
    dict(id=test['id'], spam=Y_pred)
).to_csv('predictions.rforest.csv', index=False)

### Model 4 SVM 

In [33]:
# Support-vector classifier with scikit-learn's default settings
svm = SVC().fit(X_train, Y_train)
Y_pred = svm.predict(X_test)

# Accuracy of the predictions against the test labels
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

# Save the predictions: one row per test id with its predicted label
pd.DataFrame(
    dict(id=test['id'], spam=Y_pred)
).to_csv('predictions.svm.csv', index=False)

0.756883421207


### Model 5 Multinomial Naive Bayes

In [34]:
# Multinomial naive Bayes on the TF-IDF features
nb = MultinomialNB().fit(X_train, Y_train)
Y_pred = nb.predict(X_test)

# Accuracy of the predictions against the test labels
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

# Save the predictions: one row per test id with its predicted label
pd.DataFrame(
    dict(id=test['id'], spam=Y_pred)
).to_csv('predictions.nb.csv', index=False)

0.976567076743


### Model 6 Gaussian Process Classifier 

In [35]:
# Gaussian-process classifier with scikit-learn's default settings
gp = GaussianProcessClassifier().fit(X_train, Y_train)
Y_pred = gp.predict(X_test)

# Accuracy of the predictions against the test labels
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

# Save the predictions: one row per test id with its predicted label
pd.DataFrame(
    dict(id=test['id'], spam=Y_pred)
).to_csv('predictions.gp.csv', index=False)

0.9589923843


In [None]:
# Predictions