In [1]:
# Imports

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the dataset from data.csv (comma-separated), resolved relative to the working directory

data = pd.read_csv(filepath_or_buffer="data.csv", sep=",")

In [3]:
# Define function for model training

def run(model, trainData, trainLabels, testData, testLabels, cv=5):
    """Fit ``model``, print cross-validation and test scores, and return it.

    The model is first fitted on the training data. ``cross_val_score`` is
    then run on the same training data, and finally ``model.score`` is
    evaluated on the test data. All three results are printed; the printed
    labels call them "Accuracy", which holds when the model's default score
    is accuracy.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    trainData, trainLabels
        Features and labels used for fitting and for cross-validation.
    testData, testLabels
        Features and labels used only for the final ``model.score`` call.
    cv
        Forwarded unchanged to ``cross_val_score`` as its ``cv`` argument.
        Defaults to 5, the value that used to be hard-coded here.

    Returns
    -------
    The same ``model`` object that was passed in, after ``fit`` was called.
    """
    model.fit(trainData, trainLabels)

    # Cross-validate on the training portion only; the test data is not used here.
    cvScores = cross_val_score(model, trainData, trainLabels, cv=cv)
    print("CV Accuracy: {0}".format(cvScores))
    print("Mean CV Accuracy: {0}".format(np.mean(cvScores)))

    testScore = model.score(testData, testLabels)
    print("Test Accuracy: {0}".format(testScore))

    return model

In [4]:
# Split rows 50/50 into train and test sets: every column from index 1 onward
# is kept as the corpus, the "Category" column is the label. The seed is fixed
# so the split is repeatable.

trainCorpus, testCorpus, trainLabels, testLabels = train_test_split(
    data.iloc[:, 1:], data["Category"], random_state=10, test_size=0.5
)

In [5]:
# Create a TF-IDF text vectorizer, constructed with no arguments (library defaults)
tv = TfidfVectorizer()

In [6]:
# Fit the vectorizer on the training messages only, then reuse that fitted
# vectorizer to transform the test messages.
trainVec = tv.fit_transform(trainCorpus["Message"])
testVec = tv.transform(testCorpus["Message"])

In [7]:
# C and gamma come from the grid search in the tuning cell further down

svm = SVC(
    C=140,
    gamma=6e-3,
    kernel="rbf",
    tol=1e-3,
    max_iter=4000,
    random_state=24,
)

In [8]:
# Train and evaluate the SVM; the returned model is the cell's displayed value
run(
    model=svm,
    trainData=trainVec,
    trainLabels=trainLabels,
    testData=testVec,
    testLabels=testLabels,
)

CV Accuracy: [0.9874552  0.98204668 0.98384201 0.98384201 0.97845601]
Mean CV Accuracy: 0.9831283823349614
Test Accuracy: 0.9820531227566404


SVC(C=140, gamma=0.006, max_iter=4000, random_state=24)

In [None]:
# Grid-search C and gamma for an SVC, once per macro-averaged metric, and
# report each winning configuration against the held-out test vectors.

tunedParams = [
    {
        "C": [130, 135, 140, 145, 150, 155, 160, 165, 170],
        "gamma": [4e-3, 4.5e-3, 5e-3, 5.5e-3, 6e-3],
    }
]
scores = ["precision", "recall"]

for score in scores:
    print(f"# Tuning hyper-params for {score}\n")

    clf = GridSearchCV(SVC(), tunedParams, scoring=f"{score}_macro")
    clf.fit(trainVec, trainLabels)

    print("Best params found on development set:\n")
    print(clf.best_params_, end="\n\n")
    print("Grid scores on development set:\n")

    # One line per grid point: mean CV score with a +/- two-standard-deviation band
    means = clf.cv_results_["mean_test_score"]
    stds = clf.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
        print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")
    print()

    print("Detailed classification report:\n")
    print("The model is trained on the full development set")
    print("The scores are computed on the full evaluation set\n")

    yTrue, yPred = testLabels, clf.predict(testVec)
    print(classification_report(yTrue, yPred), end="\n\n")

# Tuning hyper-params for precision

