# SVM

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the complete tweet file so its layout can be inspected below.
tweets = pd.read_csv(filepath_or_buffer="Data/all_tweets.csv")

In [3]:
# Preview the first five rows (5 is also the default for head()).
tweets.head(n=5)

Unnamed: 0,tweet,label
0,Good luck to all Fury-Haney players playing th...,0
1,@user @user @user awe!!! #cnn so bias and does...,0
2,@user don't leave me,3
3,Odd watching #Antifa extremists going full spe...,0
4,"Really.....#Jumanji 2....w/ The Rock, Jack Bla...",0


## Imports

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from IPython.display import clear_output
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import time

# Hyper-parameter tuning

Given the number of parameters we are going to try and the size of the data, sklearn's grid search is slower than a manual grid search in which we save the parameters and their scores to a CSV file. We will perform 4 different grid searches, one for each method of handling the imbalanced data.

In [5]:
def grid_search_svc(C, gammas, kernels, stopwords, ngrams, X_train, y_train, folds = 3):
    """Manually grid-search a TF-IDF -> StandardScaler -> SVC pipeline (no resampling).

    Every combination of the supplied values is evaluated with
    ``cross_validate`` using the ``'f1_weighted'`` scorer, and progress is
    printed after each combination.

    Parameters
    ----------
    C : sequence
        Values passed to ``SVC(C=...)``.
    gammas : sequence
        Values passed to ``SVC(gamma=...)``.
    kernels : sequence
        Values passed to ``SVC(kernel=...)``.
    stopwords : sequence
        Values passed to ``TfidfVectorizer(stop_words=...)``. ``None`` is
        recorded as the string ``"None"`` in the result.
    ngrams : sequence of int
        Upper n-gram bounds; the vectorizer uses ``ngram_range=(1, ng)``.
    X_train, y_train : objects exposing ``.values`` (e.g. pandas DataFrames)
        Training documents and labels. Both are flattened with
        ``.values.ravel()`` -- assumes each holds a single column; TODO confirm.
    folds : default 3
        Passed to ``cross_validate`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, with columns ``regularization``, ``gamma``,
        ``kernel``, ``stopwords``, ``n_grams``, ``err`` and ``time``.
        Despite its name, ``err`` is the mean of the cross-validated weighted
        F1 scores (higher is better). ``time`` is the mean of the per-fold
        ``fit_time`` values reported by ``cross_validate``.

    Notes
    -----
    NOTE(review): ``gamma`` is presumably ignored by the ``'linear'`` kernel,
    so those rows repeat the same fit for every gamma -- confirm against the
    SVC documentation before trimming the grid.
    """
    from itertools import product
    from sklearn.pipeline import Pipeline

    columns = ["regularization", "gamma", "kernel", "stopwords", "n_grams", "err", "time"]

    # Variables for the status
    tot = len(C)*len(gammas)*len(kernels)*len(stopwords)*len(ngrams)
    t0 = time.time()

    # The flattened inputs do not depend on the grid point, so build them once.
    docs = X_train.values.ravel()
    labels = y_train.values.ravel()

    # One dict per evaluated combination
    records = []

    # product() advances its right-most argument fastest, so combinations are
    # visited as stopwords > ngrams > C > gammas > kernels (outer to inner).
    grid = product(stopwords, ngrams, C, gammas, kernels)
    for i, (st, ng, c, gamma, kernel) in enumerate(grid, start=1):
        pipe = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=st, ngram_range=(1, ng))),
            # Scale without centering the features
            ('ss', StandardScaler(with_mean=False)),
            ('svc', SVC(kernel=kernel, C=c, gamma=gamma, random_state=1492)),
        ])

        # Cross validation
        scores = cross_validate(pipe, docs, labels, scoring='f1_weighted', cv=folds)

        records.append({
            "regularization": c,
            "gamma": gamma,
            "kernel": kernel,
            "stopwords": st if st is not None else "None",
            "n_grams": ng,
            "err": np.mean(scores['test_score']),
            "time": np.mean(scores['fit_time']),
        })

        # Print status: extrapolate the total run time from the average time
        # per combination seen so far.
        elapsed_hours = (time.time() - t0)/(60*60)
        total_hours = (tot/i)*elapsed_hours
        clear_output(wait=True)
        print(i/tot*100,"% done")
        print("Estimated remaining time:", round(total_hours - elapsed_hours, 3), "hours")
        print(elapsed_hours,"elapsed hours")

    # Building from records lets pandas infer a dtype per column; stacking the
    # value lists as rows and transposing leaves every column as object.
    return pd.DataFrame(records, columns=columns)

In [6]:
def grid_search_svc_RUS(C, gammas, kernels, stopwords, ngrams, X_train, y_train, folds = 3):
    """Manually grid-search a TF-IDF -> StandardScaler -> RandomUnderSampler -> SVC pipeline.

    Every combination of the supplied values is evaluated with
    ``cross_validate`` using the ``'f1_weighted'`` scorer, and progress is
    printed after each combination. The undersampler is a step of an
    imbalanced-learn ``Pipeline`` that is handed to ``cross_validate`` whole.

    Parameters
    ----------
    C : sequence
        Values passed to ``SVC(C=...)``.
    gammas : sequence
        Values passed to ``SVC(gamma=...)``.
    kernels : sequence
        Values passed to ``SVC(kernel=...)``.
    stopwords : sequence
        Values passed to ``TfidfVectorizer(stop_words=...)``. ``None`` is
        recorded as the string ``"None"`` in the result.
    ngrams : sequence of int
        Upper n-gram bounds; the vectorizer uses ``ngram_range=(1, ng)``.
    X_train, y_train : objects exposing ``.values`` (e.g. pandas DataFrames)
        Training documents and labels. Both are flattened with
        ``.values.ravel()`` -- assumes each holds a single column; TODO confirm.
    folds : default 3
        Passed to ``cross_validate`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, with columns ``regularization``, ``gamma``,
        ``kernel``, ``stopwords``, ``n_grams``, ``err`` and ``time``.
        Despite its name, ``err`` is the mean of the cross-validated weighted
        F1 scores (higher is better). ``time`` is the mean of the per-fold
        ``fit_time`` values reported by ``cross_validate``.
    """
    from itertools import product
    from imblearn.pipeline import Pipeline
    from imblearn.under_sampling import RandomUnderSampler

    columns = ["regularization", "gamma", "kernel", "stopwords", "n_grams", "err", "time"]

    # Variables for the status
    tot = len(C)*len(gammas)*len(kernels)*len(stopwords)*len(ngrams)
    t0 = time.time()

    # The flattened inputs do not depend on the grid point, so build them once.
    docs = X_train.values.ravel()
    labels = y_train.values.ravel()

    # One dict per evaluated combination
    records = []

    # product() advances its right-most argument fastest, so combinations are
    # visited as stopwords > ngrams > C > gammas > kernels (outer to inner).
    grid = product(stopwords, ngrams, C, gammas, kernels)
    for i, (st, ng, c, gamma, kernel) in enumerate(grid, start=1):
        pipe = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=st, ngram_range=(1, ng))),
            # Scale without centering the features
            ('ss', StandardScaler(with_mean=False)),
            # Random undersampling, seeded for repeatability
            ('us', RandomUnderSampler(random_state=1492)),
            ('svc', SVC(kernel=kernel, C=c, gamma=gamma, random_state=1492)),
        ])

        # Cross validation
        scores = cross_validate(pipe, docs, labels, scoring='f1_weighted', cv=folds)

        records.append({
            "regularization": c,
            "gamma": gamma,
            "kernel": kernel,
            "stopwords": st if st is not None else "None",
            "n_grams": ng,
            "err": np.mean(scores['test_score']),
            "time": np.mean(scores['fit_time']),
        })

        # Print status: extrapolate the total run time from the average time
        # per combination seen so far.
        elapsed_hours = (time.time() - t0)/(60*60)
        total_hours = (tot/i)*elapsed_hours
        clear_output(wait=True)
        print(i/tot*100,"% done")
        print("Estimated remaining time:", round(total_hours - elapsed_hours, 3), "hours")
        print(elapsed_hours,"elapsed hours")

    # Building from records lets pandas infer a dtype per column; stacking the
    # value lists as rows and transposing leaves every column as object.
    return pd.DataFrame(records, columns=columns)

In [7]:
def grid_search_svc_ROS(C, gammas, kernels, stopwords, ngrams, X_train, y_train, folds = 3):
    """Manually grid-search a TF-IDF -> StandardScaler -> RandomOverSampler -> SVC pipeline.

    Every combination of the supplied values is evaluated with
    ``cross_validate`` using the ``'f1_weighted'`` scorer, and progress is
    printed after each combination. The oversampler is a step of an
    imbalanced-learn ``Pipeline`` that is handed to ``cross_validate`` whole.

    Parameters
    ----------
    C : sequence
        Values passed to ``SVC(C=...)``.
    gammas : sequence
        Values passed to ``SVC(gamma=...)``.
    kernels : sequence
        Values passed to ``SVC(kernel=...)``.
    stopwords : sequence
        Values passed to ``TfidfVectorizer(stop_words=...)``. ``None`` is
        recorded as the string ``"None"`` in the result.
    ngrams : sequence of int
        Upper n-gram bounds; the vectorizer uses ``ngram_range=(1, ng)``.
    X_train, y_train : objects exposing ``.values`` (e.g. pandas DataFrames)
        Training documents and labels. Both are flattened with
        ``.values.ravel()`` -- assumes each holds a single column; TODO confirm.
    folds : default 3
        Passed to ``cross_validate`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, with columns ``regularization``, ``gamma``,
        ``kernel``, ``stopwords``, ``n_grams``, ``err`` and ``time``.
        Despite its name, ``err`` is the mean of the cross-validated weighted
        F1 scores (higher is better). ``time`` is the mean of the per-fold
        ``fit_time`` values reported by ``cross_validate``.
    """
    from itertools import product
    from imblearn.pipeline import Pipeline
    from imblearn.over_sampling import RandomOverSampler

    columns = ["regularization", "gamma", "kernel", "stopwords", "n_grams", "err", "time"]

    # Variables for the status
    tot = len(C)*len(gammas)*len(kernels)*len(stopwords)*len(ngrams)
    t0 = time.time()

    # The flattened inputs do not depend on the grid point, so build them once.
    docs = X_train.values.ravel()
    labels = y_train.values.ravel()

    # One dict per evaluated combination
    records = []

    # product() advances its right-most argument fastest, so combinations are
    # visited as stopwords > ngrams > C > gammas > kernels (outer to inner).
    grid = product(stopwords, ngrams, C, gammas, kernels)
    for i, (st, ng, c, gamma, kernel) in enumerate(grid, start=1):
        pipe = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=st, ngram_range=(1, ng))),
            # Scale without centering the features
            ('ss', StandardScaler(with_mean=False)),
            # Random oversampling, seeded for repeatability. The step keeps its
            # original name 'os'; the sampler is built inline so that no local
            # variable shadows the standard-library module of the same name.
            ('os', RandomOverSampler(random_state=1492)),
            ('svc', SVC(kernel=kernel, C=c, gamma=gamma, random_state=1492)),
        ])

        # Cross validation
        scores = cross_validate(pipe, docs, labels, scoring='f1_weighted', cv=folds)

        records.append({
            "regularization": c,
            "gamma": gamma,
            "kernel": kernel,
            "stopwords": st if st is not None else "None",
            "n_grams": ng,
            "err": np.mean(scores['test_score']),
            "time": np.mean(scores['fit_time']),
        })

        # Print status: extrapolate the total run time from the average time
        # per combination seen so far.
        elapsed_hours = (time.time() - t0)/(60*60)
        total_hours = (tot/i)*elapsed_hours
        clear_output(wait=True)
        print(i/tot*100,"% done")
        print("Estimated remaining time:", round(total_hours - elapsed_hours, 3), "hours")
        print(elapsed_hours,"elapsed hours")

    # Building from records lets pandas infer a dtype per column; stacking the
    # value lists as rows and transposing leaves every column as object.
    return pd.DataFrame(records, columns=columns)

In [8]:
def grid_search_svc_SMOTE(C, gammas, kernels, stopwords, ngrams, X_train, y_train, folds = 3):
    """Manually grid-search a TF-IDF -> StandardScaler -> SMOTE -> SVC pipeline.

    Every combination of the supplied values is evaluated with
    ``cross_validate`` using the ``'f1_weighted'`` scorer, and progress is
    printed after each combination. SMOTE is a step of an imbalanced-learn
    ``Pipeline`` that is handed to ``cross_validate`` whole.

    Parameters
    ----------
    C : sequence
        Values passed to ``SVC(C=...)``.
    gammas : sequence
        Values passed to ``SVC(gamma=...)``.
    kernels : sequence
        Values passed to ``SVC(kernel=...)``.
    stopwords : sequence
        Values passed to ``TfidfVectorizer(stop_words=...)``. ``None`` is
        recorded as the string ``"None"`` in the result.
    ngrams : sequence of int
        Upper n-gram bounds; the vectorizer uses ``ngram_range=(1, ng)``.
    X_train, y_train : objects exposing ``.values`` (e.g. pandas DataFrames)
        Training documents and labels. Both are flattened with
        ``.values.ravel()`` -- assumes each holds a single column; TODO confirm.
    folds : default 3
        Passed to ``cross_validate`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, with columns ``regularization``, ``gamma``,
        ``kernel``, ``stopwords``, ``n_grams``, ``err`` and ``time``.
        Despite its name, ``err`` is the mean of the cross-validated weighted
        F1 scores (higher is better). ``time`` is the mean of the per-fold
        ``fit_time`` values reported by ``cross_validate``.
    """
    from itertools import product
    from imblearn.pipeline import Pipeline
    # Import the sampler this function actually uses, so the function does not
    # depend on a module-level SMOTE import being executed first.
    from imblearn.over_sampling import SMOTE

    columns = ["regularization", "gamma", "kernel", "stopwords", "n_grams", "err", "time"]

    # Variables for the status
    tot = len(C)*len(gammas)*len(kernels)*len(stopwords)*len(ngrams)
    t0 = time.time()

    # The flattened inputs do not depend on the grid point, so build them once.
    docs = X_train.values.ravel()
    labels = y_train.values.ravel()

    # One dict per evaluated combination
    records = []

    # product() advances its right-most argument fastest, so combinations are
    # visited as stopwords > ngrams > C > gammas > kernels (outer to inner).
    grid = product(stopwords, ngrams, C, gammas, kernels)
    for i, (st, ng, c, gamma, kernel) in enumerate(grid, start=1):
        pipe = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=st, ngram_range=(1, ng))),
            # Scale without centering the features
            ('ss', StandardScaler(with_mean=False)),
            # SMOTE oversampling, seeded for repeatability
            ('sm', SMOTE(random_state=1492)),
            ('svc', SVC(kernel=kernel, C=c, gamma=gamma, random_state=1492)),
        ])

        # Cross validation
        scores = cross_validate(pipe, docs, labels, scoring='f1_weighted', cv=folds)

        records.append({
            "regularization": c,
            "gamma": gamma,
            "kernel": kernel,
            "stopwords": st if st is not None else "None",
            "n_grams": ng,
            "err": np.mean(scores['test_score']),
            "time": np.mean(scores['fit_time']),
        })

        # Print status: extrapolate the total run time from the average time
        # per combination seen so far.
        elapsed_hours = (time.time() - t0)/(60*60)
        total_hours = (tot/i)*elapsed_hours
        clear_output(wait=True)
        print(i/tot*100,"% done")
        print("Estimated remaining time:", round(total_hours - elapsed_hours, 3), "hours")
        print(elapsed_hours,"elapsed hours")

    # Building from records lets pandas infer a dtype per column; stacking the
    # value lists as rows and transposing leaves every column as object.
    return pd.DataFrame(records, columns=columns)

In [9]:
# Training split: the targets and the predictors are stored in separate CSV files.
y_train = pd.read_csv(filepath_or_buffer="Data/y_train.csv")
X_train = pd.read_csv(filepath_or_buffer="Data/X_train.csv")

## Original data 

In [10]:
# Search space: 2 x 2 x 11 x 5 x 4 = 880 pipeline configurations.
stopwords = [None, "english"]
ngrams = [1, 2]
C = [0.01, 0.1, 0.25, 0.5, 0.75, 1, 1.1, 1.5, 2, 2.5, 3]
gamma = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Baseline search on the data as-is (no resampling step in the pipeline).
results_og = grid_search_svc(
    C=C,
    gammas=gamma,
    kernels=kernels,
    stopwords=stopwords,
    ngrams=ngrams,
    X_train=X_train,
    y_train=y_train,
)

100.0 % done
Estimated remaining time: 0.0 hours
1.2830339633756214 elapsed hours


In [11]:
# Persist the grid so the long-running search does not have to be repeated.
# to_csv does not create missing parent directories, so create the target
# folder first rather than failing after the search has already finished.
Path("Grid-Search").mkdir(parents=True, exist_ok=True)
results_og.to_csv("Grid-Search/SVM_OG.csv", index=False)

In [12]:
# Best configurations first. The "err" column actually holds the mean
# cross-validated weighted F1, so larger values are better.
results_og.sort_values(by="err", ascending=False)

Unnamed: 0,regularization,gamma,kernel,stopwords,n_grams,err,time
495,0.25,0.010,sigmoid,english,1,0.664961,0.586093
715,0.25,0.010,sigmoid,english,2,0.657151,0.806045
515,0.50,0.010,sigmoid,english,1,0.655616,0.526541
735,0.50,0.010,sigmoid,english,2,0.652907,0.737041
535,0.75,0.010,sigmoid,english,1,0.646878,0.493064
...,...,...,...,...,...,...,...
813,1.50,0.010,poly,english,2,0.191386,1.215224
673,0.01,0.010,poly,english,2,0.191386,1.262700
677,0.01,0.100,poly,english,2,0.191386,1.212605
817,1.50,0.100,poly,english,2,0.191386,1.226379


## Data undersampling

In [14]:
# Search space: 2 x 2 x 11 x 5 x 4 = 880 pipeline configurations.
stopwords = [None, "english"]
ngrams = [1, 2]
C = [0.01, 0.1, 0.25, 0.5, 0.75, 1, 1.1, 1.5, 2, 2.5, 3]
gamma = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Same search with a random-undersampling step in the pipeline.
results_us = grid_search_svc_RUS(
    C=C,
    gammas=gamma,
    kernels=kernels,
    stopwords=stopwords,
    ngrams=ngrams,
    X_train=X_train,
    y_train=y_train,
    folds=3,
)

100.0 % done
Estimated remaining time: 0.0 hours
0.2923159169488483 elapsed hours


In [15]:
# Persist the grid so the long-running search does not have to be repeated.
# to_csv does not create missing parent directories, so create the target
# folder first rather than failing after the search has already finished.
Path("Grid-Search").mkdir(parents=True, exist_ok=True)
results_us.to_csv("Grid-Search/SVM_US.csv", index=False)

In [16]:
# Best configurations first. The "err" column actually holds the mean
# cross-validated weighted F1, so larger values are better.
results_us.sort_values(by="err", ascending=False)

Unnamed: 0,regularization,gamma,kernel,stopwords,n_grams,err,time
475,0.10,0.01000,sigmoid,english,1,0.619260,0.149054
695,0.10,0.01000,sigmoid,english,2,0.616036,0.239906
715,0.25,0.01000,sigmoid,english,2,0.601831,0.227167
35,0.10,0.01000,sigmoid,,1,0.598248,0.211632
495,0.25,0.01000,sigmoid,english,1,0.598088,0.139618
...,...,...,...,...,...,...,...
181,2.50,0.00001,poly,,1,0.093302,0.210083
521,0.75,0.00001,poly,english,1,0.093302,0.148649
5,0.01,0.00010,poly,,1,0.093302,0.226817
201,3,0.00001,poly,,1,0.093302,0.210097


## Data oversampling

In [17]:
# Search space: 2 x 2 x 11 x 5 x 4 = 880 pipeline configurations.
stopwords = [None, "english"]
ngrams = [1, 2]
C = [0.01, 0.1, 0.25, 0.5, 0.75, 1, 1.1, 1.5, 2, 2.5, 3]
gamma = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Same search with a random-oversampling step in the pipeline.
results_os = grid_search_svc_ROS(
    C=C,
    gammas=gamma,
    kernels=kernels,
    stopwords=stopwords,
    ngrams=ngrams,
    X_train=X_train,
    y_train=y_train,
)

100.0 % done
Estimated remaining time: 0.0 hours
2.294724991387791 elapsed hours


In [18]:
# Persist the grid so the long-running search does not have to be repeated.
# to_csv does not create missing parent directories, so create the target
# folder first rather than failing after the search has already finished.
Path("Grid-Search").mkdir(parents=True, exist_ok=True)
results_os.to_csv("Grid-Search/SVM_OS.csv", index=False)

In [19]:
# Best configurations first. The "err" column actually holds the mean
# cross-validated weighted F1, so larger values are better.
results_os.sort_values(by="err", ascending=False)

Unnamed: 0,regularization,gamma,kernel,stopwords,n_grams,err,time
475,0.10,0.01000,sigmoid,english,1,0.651351,1.766510
695,0.10,0.01000,sigmoid,english,2,0.647209,2.304471
691,0.10,0.00100,sigmoid,english,2,0.639899,2.833049
31,0.10,0.00100,sigmoid,,1,0.639225,3.489790
471,0.10,0.00100,sigmoid,english,1,0.637826,2.164464
...,...,...,...,...,...,...,...
482,0.25,0.00001,rbf,english,1,0.155178,2.325765
462,0.10,0.00001,rbf,english,1,0.155178,2.341759
42,0.25,0.00001,rbf,,1,0.153187,3.693490
2,0.01,0.00001,rbf,,1,0.153187,3.692264


## SMOTE data

In [21]:
# Search space: 2 x 2 x 11 x 5 x 4 = 880 pipeline configurations.
stopwords = [None, "english"]
ngrams = [1, 2]
C = [0.01, 0.1, 0.25, 0.5, 0.75, 1, 1.1, 1.5, 2, 2.5, 3]
gamma = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Same search with a SMOTE step in the pipeline.
results_sm = grid_search_svc_SMOTE(
    C=C,
    gammas=gamma,
    kernels=kernels,
    stopwords=stopwords,
    ngrams=ngrams,
    X_train=X_train,
    y_train=y_train,
)

100.0 % done
Estimated remaining time: 0.0 hours
2.692610364158948 elapsed hours


In [22]:
# Persist the grid so the long-running search does not have to be repeated.
# to_csv does not create missing parent directories, so create the target
# folder first rather than failing after the search has already finished.
Path("Grid-Search").mkdir(parents=True, exist_ok=True)
results_sm.to_csv("Grid-Search/SVM_SM.csv", index=False)

In [23]:
# Best configurations first. The "err" column actually holds the mean
# cross-validated weighted F1, so larger values are better.
results_sm.sort_values(by="err", ascending=False)

Unnamed: 0,regularization,gamma,kernel,stopwords,n_grams,err,time
475,0.10,0.01000,sigmoid,english,1,0.648361,1.503104
495,0.25,0.01000,sigmoid,english,1,0.647353,1.272846
711,0.25,0.00100,sigmoid,english,2,0.646565,1.973819
715,0.25,0.01000,sigmoid,english,2,0.643317,1.560709
695,0.10,0.01000,sigmoid,english,2,0.637596,1.863010
...,...,...,...,...,...,...,...
641,3,0.00001,poly,english,1,0.105940,2.462685
481,0.25,0.00001,poly,english,1,0.105940,2.468608
521,0.75,0.00001,poly,english,1,0.105940,2.462919
621,2.50,0.00001,poly,english,1,0.105940,2.461200
