In [None]:
import pickle
import numpy as np
from sklearn import svm
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time

In [None]:


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Locations of the four pickled train/test objects.
path_features_train = "pickles/features_train.pickle"
path_labels_train = "pickles/labels_train.pickle"
path_features_test = "pickles/features_test.pickle"
path_labels_test = "pickles/labels_test.pickle"

# Read them in the same order as before: train features/labels, then test.
features_train = _read_pickle(path_features_train)
labels_train = _read_pickle(path_labels_train)
features_test = _read_pickle(path_features_test)
labels_test = _read_pickle(path_labels_test)



In [None]:
# Show the shapes of the train and test feature matrices, one per line.
print(features_train.shape, features_test.shape, sep="\n")

# Simple training

Best hyperparameters found so far:

- Text only: C=1.5, kernel=linear
- Text + len + votes: C=1, kernel=linear

In [None]:


def train_svm(C, ker):
    """Fit a class-weighted SVC and report its performance on the test split.

    The data is not passed in: the function reads the module-level
    ``features_train``, ``labels_train``, ``features_test`` and
    ``labels_test`` at call time, so it uses whatever those names are bound
    to when it is invoked.

    Parameters
    ----------
    C : float
        Regularization parameter forwarded to ``svm.SVC``.
    ker : str
        Kernel name forwarded to ``svm.SVC`` (e.g. ``'linear'``, ``'rbf'``,
        ``'poly'``).

    Returns
    -------
    None
        The fit time, test accuracy and classification report are printed,
        and a row-normalized confusion matrix is drawn as a heatmap.
    """
    # Class 1 is weighted three times as heavily as class 0.
    svc = svm.SVC(C=C, kernel=ker, class_weight={0: 0.25, 1: 0.75})

    # Time the fit alone so the "Training time" figure does not also include
    # prediction; perf_counter is a monotonic clock meant for intervals.
    start = time.perf_counter()
    svc.fit(features_train, labels_train)
    fit_seconds = time.perf_counter() - start

    svc_pred = svc.predict(features_test)
    print("Training time", fit_seconds)
    print("The test accuracy is: ", accuracy_score(labels_test, svc_pred))

    print("Classification report")
    print(classification_report(labels_test, svc_pred))

    # normalize='true' scales each row (actual class) to sum to 1.
    conf_matrix = confusion_matrix(labels_test, svc_pred, normalize='true')
    fig, ax = plt.subplots(figsize=(12.8, 6))
    sns.heatmap(conf_matrix,
                annot=True,
                xticklabels=[0, 1],
                yticklabels=[0, 1],
                cmap="Blues",
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion matrix')
    plt.show()
    # Release the figure so repeated calls (parameter sweeps) do not pile up
    # open figures.
    plt.close(fig)

In [None]:
# Single run with C=1 and a linear kernel.
train_svm(C=1.0, ker='linear')

# Previous training runs

In [None]:
# Try every combination of the candidate C values and kernels.
for C in (1, 1.5, 2):
    for ker in ('poly', 'rbf', 'linear'):
        # print() stringifies C itself, so no explicit str() is needed.
        print("\n\n>>>> TRAINIG WITH PARAMS ", C, ker)
        train_svm(C=C, ker=ker)

# Test varying the number of features

In [None]:
import pickle
# Restore the pickled `df` object from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("pickles/df.pickle", mode="rb") as data:
    df = pickle.load(data)

# Leave the 'cleaned_all' column as the cell's displayed value.
df["cleaned_all"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Hold-out split of the cleaned text and the integer-cast target column.
# The fixed random_state keeps the split identical between runs.
# NOTE: this rebinds the module-level labels_train / labels_test that were
# loaded from the pickles earlier; re-run the loading cell to restore them.
X_train, X_test, labels_train, labels_test = train_test_split(
    np.array(df['cleaned_all']),
    np.array(df['requester_received_pizza']).astype(int),
    test_size=0.15,
    random_state=8,
)

# Sweep the TF-IDF vocabulary size. Each iteration rebinds the module-level
# features_train / features_test, which is the data train_svm reads when it
# is called.
for feats in [100, 200, 500, 1000, 2000]:
    print("\n\n>>>> TRAINIG WITH NUMBER FEATURES ", feats)
    vectorizer = TfidfVectorizer(stop_words=None,
                                 lowercase=False,
                                 max_features=feats)

    # Learn the vocabulary from the training text only, then apply the same
    # vocabulary to the test text.
    features_train = vectorizer.fit_transform(X_train).toarray()
    features_test = vectorizer.transform(X_test).toarray()

    # Fit, report and plot through the shared helper rather than a pasted
    # copy of its body; C=1 with a linear kernel is the setting this sweep
    # has always used.
    train_svm(1, "linear")
    