# Multilayer perceptron (MLP)

In [1]:
import pandas as pd
import numpy as np

In [2]:
tweets = pd.read_csv("Data/all_tweets.csv")

In [3]:
tweets.head()

Unnamed: 0,tweet,label
0,Good luck to all Fury-Haney players playing th...,0
1,@user @user @user awe!!! #cnn so bias and does...,0
2,@user don't leave me,3
3,Odd watching #Antifa extremists going full spe...,0
4,"Really.....#Jumanji 2....w/ The Rock, Jack Bla...",0


## Bulding the network

In [4]:
import torch
from torch import nn
import torch.nn.functional as F

In [5]:
class MLP_relu(nn.Module):
    def __init__(self, input_dim=10, num_hidden=1, hidden_dim=100, output_dim=4, dropout=0.5):
        """
        input_dim: Number of cells in the input layer
        num_hidden: Number of hidden layers
        hidden_dim: Number of cells in each hidden_layer
        output_dim: Number of cells in the output dimension 
        dropout: 
        """
        # Building the network from here
        super(MLP_relu, self).__init__()
        
        # Class attributes
        self.input_dim = input_dim
        self.num_hidden = num_hidden
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout = nn.Dropout(dropout)
        
        # Hidden layers
        hidden = [nn.Linear(input_dim, hidden_dim) if i==0 else nn.Linear(hidden_dim, hidden_dim) for i in range(num_hidden)]
        self.linears = nn.ModuleList(hidden)
        
        # Output layer
        self.ol = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, data, **kwargs):
        # To float
        X = data.float()
        
        # Hidden layers
        for i, hl in enumerate(self.linears):
            X = self.linears[i](X)
            X = F.relu(X)
            X = self.dropout(X)
        
        # Output layer
        out = self.ol(X)
        out = F.softmax(out, dim = -1)
        
        return out

In [6]:
from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping
torch.manual_seed(1492)

<torch._C.Generator at 0x7fe1b1dd6f70>

In [7]:
# To automatically detect input size
# https://github.com/skorch-dev/skorch/issues/584
class MyNet(NeuralNetClassifier):
    def check_data(self, X, y):
        super().check_data(X, y)
        if self.module_.input_dim != X.shape[1]:
            self.set_params(module__input_dim=X.shape[1])
            self.initialize()

## Imports

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from IPython.display import clear_output
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import time

# Hyper-parameter tuning

Given the number of parameters we are going to try and the size of the data, sklearn's grid-search is slower than a manual grid-search where we save the parameters to da csv file. We will perform 4 different grid-searches, one for each method to handle the imbalanced data. 

In [9]:
def grid_search_mlp(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3):
    from sklearn.pipeline import Pipeline
    # Variables for the status
    tot = len(hidden_layers)*len(hidden_size)*len(learning_rate)*len(stopwords)*len(ngrams)
    i = 0
    t0 = time.time()
    
    # List to save the output
    hidl = []
    hids = []
    lrs = []
    stwds = []
    ngs = []
    err = []
    ttime = []
    
    for st in stopwords:
        for ng in ngrams:
            for hl in hidden_layers: 
                for hs in hidden_size: 
                    for lr in learning_rate:
                        # Vectorizer
                        TFIDF = TfidfVectorizer(stop_words = st,  ngram_range=(1, ng))
                        
                        # Scaler
                        ss = StandardScaler(with_mean=False)
                        
                        # MLP
                        MLP_net = MyNet(
                                MLP_relu(num_hidden=hl, hidden_dim=hs, output_dim = 4, dropout=0.2),
                                max_epochs=50,
                                callbacks=[EarlyStopping()],
                                lr=lr,
                                batch_size = 64,
                                device='cpu',
                                verbose=0
                                )
                        
                        # Pipeline
                        pipe = Pipeline([('tfidf', TFIDF), ('ss', ss), ('mlp', MLP_net)])
                        
                        # Cross validation
                        scores = cross_validate(pipe, X_train.values.ravel(), y_train.values.ravel(), scoring='f1_weighted', cv=folds)
                        
                        # Saving variables
                        hidl.append(hl)
                        hids.append(hs)
                        lrs.append(lr)
                        if st is not None:
                            stwds.append(st)
                        else:
                            stwds.append("None")
                        ngs.append(ng)
                        err.append(np.mean(scores['test_score']))
                        ttime.append(np.mean(scores['fit_time']))
                            
                        # Print status
                        i = i + 1
                        t = time.time()-t0
                        clear_output(wait=True)
                        total = (tot/i)*(t/(60*60))
                        print(i/tot*100,"% done")
                        print("Estimated remaining time:", round(total-t/(60*60),3), "hours")
                        print(t/(60*60),"elapsed hours")

    return pd.DataFrame([hidl, hids, lrs, stwds, ngs, err, ttime], 
                index = ["hidden_layers", "hidden_size", "learning_rate", "stopwords", "ngrams", "err", "time"]).transpose()

In [10]:
def grid_search_mlp_RUS(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3):
    from imblearn.pipeline import Pipeline 
    from imblearn.under_sampling import RandomUnderSampler
    # Variables for the status
    tot = len(hidden_layers)*len(hidden_size)*len(learning_rate)*len(stopwords)*len(ngrams)
    i = 0
    t0 = time.time()
    
    # List to save the output
    hidl = []
    hids = []
    lrs = []
    stwds = []
    ngs = []
    err = []
    ttime = []
    
    for st in stopwords:
        for ng in ngrams:
            for hl in hidden_layers: 
                for hs in hidden_size: 
                    for lr in learning_rate:
                        # Vectorizer
                        TFIDF = TfidfVectorizer(stop_words = st,  ngram_range=(1, ng))
                        
                        # Scaler
                        ss = StandardScaler(with_mean=False)
                        
                        # Undersampling
                        us = RandomUnderSampler(random_state=1492)
                        
                        # MLP
                        MLP_net = MyNet(
                                MLP_relu(num_hidden=hl, hidden_dim=hs, output_dim = 4, dropout=0.2),
                                max_epochs=50,
                                callbacks=[EarlyStopping()],
                                lr=lr,
                                batch_size = 64,
                                device='cpu',
                                verbose=0
                                )
                        
                        # Pipeline
                        pipe = Pipeline([('tfidf', TFIDF), ('ss', ss), ('us', us), ('mlp', MLP_net)])
                        
                        # Cross validation
                        scores = cross_validate(pipe, X_train.values.ravel(), y_train.values.ravel(), scoring='f1_weighted', cv=folds)
                        
                        # Saving variables
                        hidl.append(hl)
                        hids.append(hs)
                        lrs.append(lr)
                        if st is not None:
                            stwds.append(st)
                        else:
                            stwds.append("None")
                        ngs.append(ng)
                        err.append(np.mean(scores['test_score']))
                        ttime.append(np.mean(scores['fit_time']))
                            
                        # Print status
                        i = i + 1
                        t = time.time()-t0
                        clear_output(wait=True)
                        total = (tot/i)*(t/(60*60))
                        print(i/tot*100,"% done")
                        print("Estimated remaining time:", round(total-t/(60*60),3), "hours")
                        print(t/(60*60),"elapsed hours")

    return pd.DataFrame([hidl, hids, lrs, stwds, ngs, err, ttime], 
                index = ["hidden_layers", "hidden_size", "learning_rate", "stopwords", "ngrams", "err", "time"]).transpose()

In [11]:
def grid_search_mlp_ROS(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3):
    from imblearn.pipeline import Pipeline 
    from imblearn.over_sampling import RandomOverSampler 
    # Variables for the status
    tot = len(hidden_layers)*len(hidden_size)*len(learning_rate)*len(stopwords)*len(ngrams)
    i = 0
    t0 = time.time()
    
    # List to save the output
    hidl = []
    hids = []
    lrs = []
    stwds = []
    ngs = []
    err = []
    ttime = []
    
    for st in stopwords:
        for ng in ngrams:
            for hl in hidden_layers: 
                for hs in hidden_size: 
                    for lr in learning_rate:
                        # Vectorizer
                        TFIDF = TfidfVectorizer(stop_words = st,  ngram_range=(1, ng))
                        
                        # Scaler
                        ss = StandardScaler(with_mean=False)
                        
                        # Oversampling
                        os = RandomOverSampler(random_state=1492)
                        
                        # MLP
                        MLP_net = MyNet(
                                MLP_relu(num_hidden=hl, hidden_dim=hs, output_dim = 4, dropout=0.2),
                                max_epochs=50,
                                callbacks=[EarlyStopping()],
                                lr=lr,
                                batch_size = 64,
                                device='cpu',
                                verbose=0
                                )
                        
                        # Pipeline
                        pipe = Pipeline([('tfidf', TFIDF), ('ss', ss), ('os', os), ('mlp', MLP_net)])
                        
                        # Cross validation
                        scores = cross_validate(pipe, X_train.values.ravel(), y_train.values.ravel(), scoring='f1_weighted', cv=folds)
                        
                        # Saving variables
                        hidl.append(hl)
                        hids.append(hs)
                        lrs.append(lr)
                        if st is not None:
                            stwds.append(st)
                        else:
                            stwds.append("None")
                        ngs.append(ng)
                        err.append(np.mean(scores['test_score']))
                        ttime.append(np.mean(scores['fit_time']))
                            
                        # Print status
                        i = i + 1
                        t = time.time()-t0
                        clear_output(wait=True)
                        total = (tot/i)*(t/(60*60))
                        print(i/tot*100,"% done")
                        print("Estimated remaining time:", round(total-t/(60*60),3), "hours")
                        print(t/(60*60),"elapsed hours")

    return pd.DataFrame([hidl, hids, lrs, stwds, ngs, err, ttime], 
                index = ["hidden_layers", "hidden_size", "learning_rate", "stopwords", "ngrams", "err", "time"]).transpose()

In [12]:
def grid_search_mlp_SMOTE(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3):
    from imblearn.pipeline import Pipeline 
    from imblearn.over_sampling import SMOTE 
    # Variables for the status
    tot = len(hidden_layers)*len(hidden_size)*len(learning_rate)*len(stopwords)*len(ngrams)
    i = 0
    t0 = time.time()
    
    # List to save the output
    hidl = []
    hids = []
    lrs = []
    stwds = []
    ngs = []
    err = []
    ttime = []
    
    for st in stopwords:
        for ng in ngrams:
            for hl in hidden_layers: 
                for hs in hidden_size: 
                    for lr in learning_rate:
                        # Vectorizer
                        TFIDF = TfidfVectorizer(stop_words = st,  ngram_range=(1, ng))
                        
                        # Scaler
                        ss = StandardScaler(with_mean=False)
                        
                        # SMOTE
                        sm = SMOTE(random_state=1492)
                        
                        # MLP
                        MLP_net = MyNet(
                                MLP_relu(num_hidden=hl, hidden_dim=hs, output_dim = 4, dropout=0.2),
                                max_epochs=50,
                                callbacks=[EarlyStopping()],
                                lr=lr,
                                batch_size = 64,
                                device='cpu',
                                verbose=0
                                )
                        
                        # Pipeline
                        pipe = Pipeline([('tfidf', TFIDF), ('ss', ss), ('sm', sm), ('mlp', MLP_net)])
                        
                        # Cross validation
                        scores = cross_validate(pipe, X_train.values.ravel(), y_train.values.ravel(), scoring='f1_weighted', cv=folds)
                        
                        # Saving variables
                        hidl.append(hl)
                        hids.append(hs)
                        lrs.append(lr)
                        if st is not None:
                            stwds.append(st)
                        else:
                            stwds.append("None")
                        ngs.append(ng)
                        err.append(np.mean(scores['test_score']))
                        ttime.append(np.mean(scores['fit_time']))
                            
                        # Print status
                        i = i + 1
                        t = time.time()-t0
                        clear_output(wait=True)
                        total = (tot/i)*(t/(60*60))
                        print(i/tot*100,"% done")
                        print("Estimated remaining time:", round(total-t/(60*60),3), "hours")
                        print(t/(60*60),"elapsed hours")

    return pd.DataFrame([hidl, hids, lrs, stwds, ngs, err, ttime], 
                index = ["hidden_layers", "hidden_size", "learning_rate", "stopwords", "ngrams", "err", "time"]).transpose()

In [13]:
X_train = pd.read_csv("Data/X_train.csv")
y_train = pd.read_csv("Data/y_train.csv")

## Original data 

In [15]:
stopwords = [None, "english"]
ngrams = [1, 2]
hidden_layers = [1, 10, 100, 1000]
hidden_size = [1, 10, 100, 1000]
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
results_og = grid_search_mlp(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3)

100.0 % done
Estimated remaining time: 0.0 hours
5.410855841173066 elapsed hours


In [16]:
results_og.to_csv("Grid-Search/MLP_OG.csv", index=False)

In [17]:
results_og.sort_values("err", ascending = False)

Unnamed: 0,hidden_layers,hidden_size,learning_rate,stopwords,ngrams,err,time
11,1,100,0.05,,1,0.592085,7.183023
21,10,1,0.05,,1,0.591318,7.929810
190,10,100,0.01,english,1,0.591157,27.119740
211,100,100,0.05,english,1,0.590522,8.160554
196,10,1000,0.05,english,1,0.590521,8.184138
...,...,...,...,...,...,...,...
109,10,10,1,,2,0.392635,10.439503
144,1000,1,1,,2,0.388884,10.488548
139,100,1000,1,,2,0.384993,10.658334
149,1000,10,1,,2,0.379850,10.475542


## Data undersampling

In [20]:
stopwords = [None, "english"]
ngrams = [1, 2]
hidden_layers = [1, 10, 100, 1000]
hidden_size = [1, 10, 100, 1000]
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
results_us = grid_search_mlp_RUS(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3)

100.0 % done
Estimated remaining time: 0.0 hours
2.526733169224527 elapsed hours


In [21]:
results_us.to_csv("Grid-Search/MLP_US.csv", index=False)

In [22]:
results_us.sort_values("err", ascending = False)

Unnamed: 0,hidden_layers,hidden_size,learning_rate,stopwords,ngrams,err,time
197,10,1000,0.10,english,1,0.462061,3.420971
176,1,1000,0.05,english,1,0.459824,5.049016
192,10,100,0.10,english,1,0.458377,3.811890
61,1000,1,0.05,,1,0.457759,5.919978
51,100,100,0.05,,1,0.457756,5.559218
...,...,...,...,...,...,...,...
118,10,1000,0.50,,2,0.321363,3.959341
94,1,100,1,,2,0.321039,4.359269
149,1000,10,1,,2,0.315730,3.975524
89,1,10,1,,2,0.307146,4.167748


## Data oversampling

In [30]:
stopwords = [None, "english"]
ngrams = [1, 2]
hidden_layers = [1, 10, 100, 1000]
hidden_size = [1, 10, 100, 1000]
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
results_os = grid_search_mlp_ROS(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3)

100.0 % done
Estimated remaining time: 0.0 hours
12.259407487776544 elapsed hours


In [31]:
results_os.to_csv("Grid-Search/MLP_OS.csv", index=False)

In [32]:
results_os.sort_values("err", ascending = False)

Unnamed: 0,hidden_layers,hidden_size,learning_rate,stopwords,ngrams,err,time
170,1,100,0.01,english,1,0.600493,51.495731
195,10,1000,0.01,english,1,0.599556,53.446120
230,1000,100,0.01,english,1,0.598831,53.995280
0,1,1,0.01,,1,0.597562,43.685045
45,100,10,0.01,,1,0.596482,59.614981
...,...,...,...,...,...,...,...
154,1000,100,1,,2,0.336649,43.236211
129,100,10,1,,2,0.330946,37.492469
99,1,1000,1,,2,0.317430,49.395968
124,100,1,1,,2,0.313934,43.729687


## SMOTE data

In [27]:
stopwords = [None, "english"]
ngrams = [1, 2]
hidden_layers = [1, 10, 100, 1000]
hidden_size = [1, 10, 100, 1000]
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
results_sm = grid_search_mlp_SMOTE(hidden_layers, hidden_size, learning_rate, stopwords, ngrams, X_train, y_train, folds = 3)

100.0 % done
Estimated remaining time: 0.0 hours
10.711227911975648 elapsed hours


In [28]:
results_sm.to_csv("Grid-Search/MLP_SM.csv", index=False)

In [29]:
results_sm.sort_values("err", ascending = False)

Unnamed: 0,hidden_layers,hidden_size,learning_rate,stopwords,ngrams,err,time
205,100,10,0.01,english,1,0.601016,52.654019
231,1000,100,0.05,english,1,0.600227,15.264734
66,1000,10,0.05,,1,0.600118,15.023063
11,1,100,0.05,,1,0.600030,13.553564
36,10,1000,0.05,,1,0.599616,23.903095
...,...,...,...,...,...,...,...
154,1000,100,1,,2,0.412521,32.502520
89,1,10,1,,2,0.412465,23.848836
29,10,10,1,,1,0.412335,10.591240
109,10,10,1,,2,0.406074,21.278053
