In [None]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and is closed before this function returns.
    """
    # SECURITY: pickle.load can execute arbitrary code during deserialization;
    # only load files that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Dataframe
path_df = "pickles/df.pickle"
df = load_pickle(path_df)

# features_train
path_features_train = "pickles/features_train.pickle"
features_train = load_pickle(path_features_train)

# labels_train
path_labels_train = "pickles/labels_train.pickle"
labels_train = load_pickle(path_labels_train)

# features_test
path_features_test = "pickles/features_test.pickle"
features_test = load_pickle(path_features_test)

# labels_test
path_labels_test = "pickles/labels_test.pickle"
labels_test = load_pickle(path_labels_test)



In [None]:
# Sanity check: show the dimensions of the training and test feature matrices.
for feature_matrix in (features_train, features_test):
    print(feature_matrix.shape)

# Simple training

In [None]:
# Baseline model: linear-kernel SVM with hand-picked class weights
# (class 1 is weighted three times as heavily as class 0).
baseline_class_weight = {0: 0.25, 1: 0.75}
svc = svm.SVC(kernel='linear', class_weight=baseline_class_weight)
svc.fit(features_train, labels_train)

In [None]:
# Predict labels for the test features with the model fitted in the previous cell.
svc_pred = svc.predict(features_test)

In [None]:
# Accuracy on the data the model was fitted on versus accuracy on the test split.
train_accuracy = accuracy_score(labels_train, svc.predict(features_train))
test_accuracy = accuracy_score(labels_test, svc_pred)
print("The training accuracy is: ", train_accuracy)
print("The test accuracy is: ", test_accuracy)

# Per-class metrics on the test split.
print("Classification report")
print(classification_report(labels_test, svc_pred))


In [None]:
# Confusion matrix on the test split, computed with normalize='true'.
conf_matrix = confusion_matrix(labels_test, svc_pred, normalize='true')

class_ticks = [0, 1]
heatmap_options = dict(annot=True,
                       xticklabels=class_ticks,
                       yticklabels=class_ticks,
                       cmap="Blues")

plt.figure(figsize=(12.8, 6))
sns.heatmap(conf_matrix, **heatmap_options)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# More complex training

## Randomized search cross-val

In [None]:
import time

# Candidate hyperparameter values sampled by the randomized search.
# C
C = [.0001, .001, .01]

# gamma
gamma = [.0001, .001, .01, .1, 1, 10, 100]

# degree
degree = [1, 2, 3, 4, 5]

# kernel
kernel = ['linear', 'rbf', 'poly']


# Create the random grid
random_grid = {'C': C,
               'kernel': kernel,
               'gamma': gamma,
               'degree': degree,
               }


# First create the base model to tune
svc = svm.SVC(class_weight='balanced')

# Definition of the random search.
# random_state is fixed so that the same 50 candidate combinations are drawn
# on every run; without it the search result is not reproducible.
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='balanced_accuracy',
                                   cv=3,
                                   verbose=1,
                                   random_state=42)

# Time the whole search (wall-clock seconds).
t = time.time()
# Fit the random search model
random_search.fit(features_train, labels_train)
print('Finished search grid in', time.time()-t)


In [None]:


# Report the best candidate found by the randomized search.
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
# random_search is configured with scoring='balanced_accuracy', so best_score_
# is a mean balanced accuracy, not a plain accuracy.
print("The mean balanced accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)



In [None]:


# Create the parameter grid based on the results of random search
C = [.00001, .0001, .001]
degree = [3, 4, 5]
gamma = [0.1, 1, 10]

# NOTE(review): both sub-grids use the linear kernel, for which SVC ignores
# `degree` and `gamma`, so the same three values of C are evaluated repeatedly.
# Confirm whether 'poly' / 'rbf' were intended for these sub-grids.
param_grid = [
  {'C': C, 'kernel':['linear'], 'degree':degree},
  {'C': C, 'kernel':['linear'], 'gamma':gamma}
]

# Create a base model
svc = svm.SVC(class_weight='balanced')

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
# The seed is what makes the shuffled splits, and therefore the scores, repeatable.
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=42)

# Instantiate the grid search model
# NOTE(review): this search scores with 'accuracy', while the randomized search
# above scores with 'balanced_accuracy' — confirm the difference is intended.
grid_search = GridSearchCV(estimator=svc,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)



In [None]:
# Summarize the best grid-search candidate and its mean cross-validated score.
print("The best hyperparameters from Grid Search are:",
      grid_search.best_params_,
      "",
      "The mean accuracy of a model with these hyperparameters is:",
      grid_search.best_score_,
      sep="\n")

### Apply best params

In [None]:
# Refit a model with the hyperparameters selected by the grid search instead of
# hard-coded copies of them, so this cell stays in sync when the search is re-run.
# NOTE(review): the search itself ran with class_weight='balanced'; the explicit
# weights below are kept from the original cell — confirm this is intended.
best_params = dict(grid_search.best_params_)
svc = svm.SVC(class_weight={0: 0.25, 1: 0.75}, **best_params)
svc.fit(features_train, labels_train)
svc_pred = svc.predict(features_test)

# Accuracy on the data the model was fitted on versus accuracy on the test split.
print("The training accuracy is: ", accuracy_score(labels_train, svc.predict(features_train)))
print("The test accuracy is: ", accuracy_score(labels_test, svc_pred))

print("Classification report")
print(classification_report(labels_test, svc_pred))

In [None]:
# Confusion matrix of the refitted model on the test split (normalize='true').
conf_matrix = confusion_matrix(labels_test, svc_pred, normalize='true')

tick_labels = [0, 1]
plt.figure(figsize=(12.8, 6))
sns.heatmap(
    conf_matrix,
    cmap="Blues",
    annot=True,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.title('Confusion matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Prediction

## Pre-processing functions

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Tokens dropped during cleaning: single punctuation characters plus a few
# whitespace escapes and ellipsis variants.
punctuation = [*",.?!(){}[]-_\"'\\;:+*<>@#§^$%&|/", '\n', '\r', '\t', '...', '..']

# NLTK's English stop words, extended with two extra words.
stop_words = {*stopwords.words('english'), "request", "edit"}

lemmatizer = WordNetLemmatizer()

# First letter of a POS tag -> WordNet part-of-speech constant.
tag_dict = dict(J=wn.ADJ, N=wn.NOUN, V=wn.VERB, R=wn.ADV)

def extract_wnpostag_from_postag(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first letter of ``tag`` is used (case-insensitively) as the key
    into ``tag_dict``. Returns ``None`` when that letter has no entry, or when
    ``tag`` is empty.
    """
    # tag[:1] instead of tag[0]: an empty tag yields '' (-> None) rather than IndexError.
    return tag_dict.get(tag[:1].upper(), None)

def lemmatize_tupla_word_postag(tupla):
    """Lemmatize one POS-tagged token.

    ``tupla`` is a ``(word, pos_tag)`` pair such as ``('guitar', 'NN')``.
    Returns the lemma when the tag maps to a WordNet part of speech,
    otherwise the word unchanged.
    """
    wn_pos = extract_wnpostag_from_postag(tupla[1])
    if wn_pos is None:
        return tupla[0]
    return lemmatizer.lemmatize(tupla[0], wn_pos)

def correspondance_miswrite(word):
    """Map a known apostrophe-less contraction to its proper spelling.

    Returns "i'm" for "im" and "i've" for "ive". Any other word is returned
    unchanged (previously the function fell through and returned ``None``).
    """
    corrections = {"im": "i'm", "ive": "i've"}
    return corrections.get(word, word)

def clean_text(sentence):
    """Normalize raw text into a space-separated string of lemmas.

    Steps: lowercase, tokenize, POS-tag, lemmatize each token, then drop every
    token found in ``punctuation`` or ``stop_words``.
    """
    tokens = word_tokenize(sentence.lower())
    # nltk.pos_tag yields (word, tagString) pairs, e.g. ('and', 'CC')
    lemmas = map(lemmatize_tupla_word_postag, nltk.pos_tag(tokens))
    kept = [
        lemma for lemma in lemmas
        if not (lemma in punctuation or lemma in stop_words)
    ]
    return ' '.join(kept)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Restore the pickled vectorizer (presumably the TfidfVectorizer fitted when the
# training features were built — TODO confirm).
vectorizer_path = 'pickles/vectorizer.pickle'
with open(vectorizer_path, 'rb') as data:
    vectorizer = pickle.load(data)


## Input

In [None]:
# Sample request used to demo the prediction pipeline.
# Adjacent string literals are concatenated; each fragment that continues a
# paragraph ends with an explicit space, so sentences are never glued together
# (the previous backslash continuation produced "yet.I am").
text = (
    "Just started my new job and my paycheck hasn't rolled in yet. "
    "I am down to my last dollar now. Would love a pizza in these trying times. "
    "I have held strong for 3 months.\n\n"
    "I do also intent to pay-it-forward when I can afford it in a couple of months. "
    "Much appreciated!\n\n"
    "Edit: I failed to mention I am in Toronto! "
    "Nearby pizza chains include 241, Dominoes and Pizza Pizza."
)

In [None]:
# Clean the raw text with clean_text and wrap the result in a one-element list,
# because the next cell passes it to vectorizer.transform as a collection of documents.
inputt = [clean_text(text)]

In [None]:
# Vectorize the cleaned text with the loaded vectorizer and convert the result
# with .toarray() (presumably to match the format of the training features — TODO confirm).
features_input = vectorizer.transform(inputt).toarray()

In [None]:
# Predicted label for the sample text. `svc` here is the most recently fitted
# model, i.e. the one trained in the "Apply best params" cell.
svc.predict(features_input)