
## Basic Neural Network Sample
Working with  **skorch**. Consult with the [documentation](https://skorch.readthedocs.io/en/stable/).

# Installing skorch and loading libraries

In [1]:
import subprocess

# Installation on Google Colab
try:
    import google.colab
    subprocess.run(['python', '-m', 'pip', 'install', 'skorch'])
except ImportError:
    pass

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [3]:
#Check if torch works with version verification
print(torch.__version__)

2.1.0


In [4]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [5]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

## Training a classifier and making predictions

In [6]:
with open(f'x_train.txt', encoding = 'utf-8') as f:
    x_train = f.read().splitlines()
with open(f'y_train.txt', encoding = 'utf-8') as f:
    y_train = f.read().splitlines()
with open(f'x_test.txt', encoding = 'utf-8') as f:
    x_test = f.read().splitlines()
with open(f'y_test.txt', encoding = 'utf-8') as f:
    y_test = f.read().splitlines()

In [7]:
import pandas as pd
# combine x_train and y_train into one dataframe
train_df = pd.DataFrame({'text': x_train, 'label': y_train})

#combine x_test and y_test into one dataframe
test_df = pd.DataFrame({'text': x_test, 'label': y_test})

In [8]:
# T: Please use again the train/test data that includes English, German, Dutch, Danish, Swedish and Norwegian, plus 20 additional languages of your choice (the labels can be found in the file labels.csv)
# and adjust the train/test split if needed

##Re-utilizing the saved .csv dataset we build in part 1
# combine x_train and y_train into one dataframe
train_df = pd.DataFrame({'text': x_train, 'label': y_train})
# write train_df to csv with tab as separator
train_df.to_csv('train_df_second.csv', index=False, sep='\t')
# combine x_test and y_test into one dataframe
test_df = pd.DataFrame({'text': x_test, 'label': y_test})
# write test_df to csv with tab as separator
test_df.to_csv('test_df_second.csv', index=False, sep='\t')

In [9]:
## Checking the dataset... it works, still the same a the previous one
train_df.head()

Unnamed: 0,text,label
0,Klement Gottwaldi surnukeha palsameeriti ning ...,est
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
2,भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...,mai
3,"Après lo cort periòde d'establiment a Basilèa,...",oci
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...,tha


In [10]:
## Checking the test set... fine as well
test_df.head()

Unnamed: 0,text,label
0,Ne l fin de l seclo XIX l Japon era inda çconh...,mwl
1,Schiedam is gelegen tussen Rotterdam en Vlaard...,nld
2,"ГIурусаз батальонал, гьоркьор гIарадабиги лъун...",ava
3,ರಾಜ್ಯಶಾಸ್ತ್ರದ ಪಿತಾಮಹೆ ಅರಿಸ್ಟಾಟಲ್. ರಾಜ್ಯಶಾಸ್ತ್ರ...,tcy
4,Halukum adalah kelenjar tiroid nang menonjol d...,bjn


In [11]:
## Apply splitting:
from sklearn.model_selection import train_test_split

#Concatenate first the test and training
whole_df = pd.concat((train_df, test_df))
#Now we create a mixed set for training and testing:
new_train_df, new_test_df = train_test_split(whole_df, test_size=0.2, stratify=whole_df['label'])

In [12]:
##Repeat:
#Full subset with the 6 target languages and the 20 plus
same_languages_filter = ['eng', 'deu', 'nld', 'dan', 'swe', 'nno', 'est',
                      'zea', 'hat', 'tam', 'pan', 'ton', 'bod', 'arg', 
                      'jpn', 'tur', 'pol', 'mkd', 'ace', 'bak', 'hun', 
                      'ukr', 'san', 'nap', 'lug', 'sun']
print(len(same_languages_filter))

26


In [13]:
## Split a test and training set with the new labels
newdataframe_sub_training = new_train_df[new_train_df['label'].isin(same_languages_filter)]
newdataframe_sub_testing = new_test_df[new_test_df['label'].isin(same_languages_filter)]

In [14]:
## T: use your adjusted code to encode the labels here ##
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Fitting the labels:
le_fitted = LabelEncoder().fit(newdataframe_sub_training['label'])

# Encoding the labels for training and test
y_train_sub = le_fitted.transform(newdataframe_sub_training['label'])
y_test_sub = le_fitted.transform(newdataframe_sub_testing['label'])

In [15]:
# Transform both text, re-using character analyzer:
# Changed max features to 10 for speed:
vectorization = CountVectorizer(analyzer='char', ngram_range=(2, 2), max_features=100, binary=True)
X_main = vectorization.fit_transform(newdataframe_sub_training['text'].to_numpy())
X_test = vectorization.transform(newdataframe_sub_testing['text'].to_numpy())

In [16]:
# Reformat the values into np.float32 and int64 for the other kit:
X_main = X_main.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train_sub = y_train_sub.astype(np.int64)
y_test_sub = y_test_sub.astype(np.int64)

In [17]:
# T: In the following, you can find a small (almost) working example of a neural network. Unfortunately, again, the cat messed up some of the code. Please fix the code such that it is executable.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In the following, we define a vanilla neural network with two hidden layers. The output layer should have as many outputs as there are classes. In addition, it should have a nonlinearity function.

In [18]:
class ClassifierModule(nn.Module):
    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(100, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 200)
        self.output = nn.Linear(200, 100)

    def forward(self, X, **kwargs):
      X = self.nonlin(self.dense0(X))
      X = F.relu(self.dense1(X))
      X = self.output(X)
      return X.squeeze(dim=1)

In [19]:
## Vanilla configuration - Baseline
net = NeuralNetClassifier(
    ClassifierModule,
    train_split=None,
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    #device='cuda'  # comment this to train with CPU
)
# we tried to change to lower learning rate but we got Out of memory error. The only thing we could change is the epoch number,
# but after 6 epoch it doesn't seem to improve anymore

In [20]:
net.fit(X_main, y_train_sub)



  epoch    train_loss     dur
-------  ------------  ------
      1        [36m3.0191[0m  2.8903
      2        [36m1.3631[0m  2.8858
      3        [36m0.9827[0m  2.7074
      4        [36m0.8379[0m  2.7412
      5        [36m0.7748[0m  2.8119
      6        [36m0.7395[0m  2.9261
      7        [36m0.7156[0m  3.0651
      8        [36m0.6980[0m  3.0047
      9        [36m0.6840[0m  2.8019
     10        [36m0.6726[0m  2.7664
     11        [36m0.6628[0m  2.8434
     12        [36m0.6544[0m  2.8364
     13        [36m0.6470[0m  2.7809
     14        [36m0.6403[0m  2.9263
     15        [36m0.6343[0m  3.3957
     16        [36m0.6286[0m  2.9422
     17        [36m0.6235[0m  2.8311
     18        [36m0.6187[0m  3.0893
     19        [36m0.6142[0m  2.8021
     20        [36m0.6098[0m  2.7124


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=100, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=200, bias=True)
    (output): Linear(in_features=200, out_features=100, bias=True)
  ),
)

In [21]:
## Final score for baseline:
net.score(X_test, y_test_sub)

## Score is 0.7325 which isn't bad per se
### Required to get at least 0.8...

0.7325

In [40]:
# grid search for hyperparamter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## Replicate Pipeline needed:
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range=(2, 2), max_features=100, binary=True, dtype=np.float32)),
    ('clf', NeuralNetClassifier(
        ClassifierModule,
        criterion=nn.CrossEntropyLoss(),
        lr=0.1,
        #device='cuda'  # comment this to train with CPU
        ))
    ])

In [41]:
## Parameters to check through in the grid for all combinations
## Values are restricted since we are close to 0.80 score needed:
param_grid = {
    "vect__ngram_range": [(1, 1), (2, 2)], # CountVectorizer features
    "clf__module__num_units": [150, 200], # two layer sizes to experiment, just small increments since its almost 0.8
    "clf__module__nonlin": [F.relu, F.tanh], # activation functions
    "clf__optimizer": [torch.optim.Adam, torch.optim.SGD], # optimizers are SGD and ADAM
    "clf__max_epochs": [15, 20], # early stopping for terminate training earlier in order to find best performance at lowest amount of training
}

In [23]:
# Execute the grid check:
## CV at 2 for fastest results and being the minimum k-fold number one can use...
checkgrid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)
grid_best_results = checkgrid.fit(newdataframe_sub_training['text'], y_train_sub)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.4657[0m       [32m0.5457[0m        [35m1.5163[0m  1.9921
      2        [36m1.2959[0m       [32m0.6750[0m        [35m1.1942[0m  1.8810
      3        [36m1.0224[0m       [32m0.6885[0m        [35m1.1840[0m  1.8906
      4        1.0677       [32m0.7587[0m        [35m0.9398[0m  1.8120
      5        [36m0.9158[0m       0.7505        0.9484  1.6686
      6        [36m0.8787[0m       0.7308        0.9922  1.6894
      7        0.9072       0.7303        0.9826  1.7822
      8        0.8837       0.7471        1.0866  1.5959
      9        [36m0.8684[0m       0.7524        2.1568  1.6510
     10        1.0723       [32m0.7678[0m        4.8708  1.6622
     11        1.1232       0.6827        6.8702  1.7058
     12        1.0223       0.7582        2.5807  2.5601
     13        0.9337       0.7538        2.5224  2.9479
     14   

      9        [36m0.8252[0m       [32m0.7000[0m        [35m0.8462[0m  1.4381
     10        [36m0.7897[0m       [32m0.7063[0m        [35m0.8211[0m  1.3302
     11        [36m0.7631[0m       [32m0.7106[0m        [35m0.8041[0m  1.5399
     12        [36m0.7422[0m       [32m0.7135[0m        [35m0.7919[0m  1.5121
     13        [36m0.7250[0m       [32m0.7149[0m        [35m0.7824[0m  1.4155
     14        [36m0.7103[0m       0.7149        [35m0.7753[0m  1.2850
     15        [36m0.6975[0m       [32m0.7192[0m        [35m0.7697[0m  1.3125
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.6867[0m       [32m0.0385[0m        [35m4.0820[0m  1.4881
      2        [36m2.8342[0m       [32m0.2053[0m        [35m3.1777[0m  1.3972
      3        [36m1.9031[0m       [32m0.2740[0m        [35m2.3162[0m  1.2836
      4        [36m1.4500[0m       [32m0.3418[0m   

      2        [36m2.7574[0m       [32m0.1437[0m        [35m3.2862[0m  1.4114
      3        [36m2.2117[0m       [32m0.2861[0m        [35m2.4634[0m  1.4096
      4        [36m1.4969[0m       [32m0.4736[0m        [35m1.5687[0m  1.4647
      5        [36m0.9731[0m       [32m0.6120[0m        [35m1.0661[0m  1.3434
      6        [36m0.7193[0m       [32m0.7389[0m        [35m0.8127[0m  1.3368
      7        [36m0.5878[0m       [32m0.7923[0m        [35m0.6707[0m  1.3613
      8        [36m0.5108[0m       [32m0.8163[0m        [35m0.5862[0m  1.2995
      9        [36m0.4613[0m       [32m0.8404[0m        [35m0.5336[0m  1.5692
     10        [36m0.4271[0m       [32m0.8505[0m        [35m0.4988[0m  1.6626
     11        [36m0.4016[0m       [32m0.8548[0m        [35m0.4730[0m  1.4693
     12        [36m0.3815[0m       [32m0.8587[0m        [35m0.4534[0m  1.3152
     13        [36m0.3649[0m       [32m0.8625[0m        [35m0.4375[0m 

     13        3.2731       0.0385        3.2700  1.9483
     14        3.2730       0.0385        3.2700  1.8767
     15        3.2730       0.0385        3.2699  1.9098
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.6156[0m       [32m0.1976[0m        [35m2.9134[0m  1.3744
      2        [36m2.6548[0m       [32m0.3125[0m        [35m2.3863[0m  1.3273
      3        [36m2.0640[0m       [32m0.6380[0m        [35m1.7142[0m  1.4730
      4        [36m1.3609[0m       [32m0.7937[0m        [35m1.0605[0m  1.3785
      5        [36m0.8757[0m       [32m0.8438[0m        [35m0.7326[0m  1.3221
      6        [36m0.6465[0m       [32m0.8606[0m        [35m0.5780[0m  1.4192
      7        [36m0.5293[0m       [32m0.8659[0m        [35m0.4957[0m  1.3205
      8        [36m0.4616[0m       [32m0.8750[0m        [35m0.4477[0m  1.3752
      9        [36m0.4188[0m       [32m0

      5        2.9256       0.0913        2.8323  1.5029
      6        2.7417       [32m0.1149[0m        2.6881  1.6554
      7        [36m2.6843[0m       [32m0.1293[0m        2.6618  1.5159
      8        [36m2.6825[0m       0.1212        [35m2.6356[0m  1.5484
      9        2.9597       0.0385        3.7363  1.6363
     10        3.3507       0.0385        3.2688  1.5341
     11        3.2747       0.0385        3.2687  1.6672
     12        3.2743       0.0385        3.2683  2.0048
     13        3.2740       0.0385        3.2681  2.5144
     14        3.2739       0.0385        3.2679  2.3003
     15        3.2738       0.0385        3.2678  2.2463
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m4.4944[0m       [32m0.0880[0m        [35m2.9283[0m  1.4967
      2        [36m2.8409[0m       [32m0.0885[0m        [35m2.8226[0m  1.4041
      3        [36m2.8037[0m       [32m0.08

     12        0.8284       0.7168        0.8693  2.7572
     13        0.8632       0.7385        0.8691  3.0915
     14        1.0649       0.6928        1.5053  3.2747
     15        1.3005       0.6587        1.0490  2.9018
     16        1.1185       0.6322        1.1240  3.1351
     17        1.1082       0.6731        0.9675  3.0382
     18        0.9578       0.7188        0.9601  3.2291
     19        0.9077       0.7322        1.1337  3.0073
     20        0.9778       0.7317        1.0410  3.1186
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.5149[0m       [32m0.3625[0m        [35m1.9569[0m  1.7259
      2        [36m1.7500[0m       0.3197        2.1420  1.7117
      3        1.8022       [32m0.4159[0m        [35m1.6262[0m  1.9702
      4        [36m1.6215[0m       [32m0.4351[0m        [35m1.5834[0m  1.7381
      5        [36m1.5675[0m       0.4341        1.5911  1.754

     17        [36m0.3233[0m       [32m0.8788[0m        [35m0.3969[0m  1.3577
     18        [36m0.3157[0m       [32m0.8803[0m        [35m0.3911[0m  1.3862
     19        [36m0.3089[0m       [32m0.8827[0m        [35m0.3856[0m  1.4083
     20        [36m0.3029[0m       [32m0.8832[0m        [35m0.3807[0m  1.3480
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.6456[0m       [32m0.0803[0m        [35m3.2380[0m  1.2495
      2        [36m2.8580[0m       [32m0.3120[0m        [35m2.3858[0m  1.4098
      3        [36m1.9426[0m       [32m0.4558[0m        [35m1.5816[0m  1.3775
      4        [36m1.4041[0m       [32m0.5471[0m        [35m1.2737[0m  1.4610
      5        [36m1.1694[0m       [32m0.6341[0m        [35m1.1232[0m  1.3361
      6        [36m1.0373[0m       [32m0.6591[0m        [35m1.0220[0m  1.3113
      7        [36m0.9439[0m       [32m0.67

     18        2.0894       0.2524        2.0684  3.0586
     19        2.0950       0.2279        2.1491  3.0663
     20        2.0811       0.2466        2.1517  2.9913
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m4.9256[0m       [32m0.1125[0m        [35m2.8326[0m  1.6999
      2        [36m2.6616[0m       [32m0.1327[0m        [35m2.6793[0m  1.5708
      3        [36m2.6338[0m       [32m0.1375[0m        [35m2.5973[0m  1.4455
      4        [36m2.5970[0m       [32m0.1442[0m        2.6113  1.6433
      5        [36m2.5819[0m       0.1413        2.6076  1.5344
      6        [36m2.5685[0m       0.1279        [35m2.5662[0m  1.5506
      7        [36m2.5639[0m       0.1293        2.5816  1.5688
      8        2.5667       0.1226        2.6305  1.6480
      9        [36m2.5596[0m       [32m0.1471[0m        2.5990  1.6499
     10        [36m2.5569[0m       0.1346      

     14        [36m0.7208[0m       [32m0.6721[0m        [35m0.8641[0m  1.2652
     15        [36m0.7075[0m       [32m0.6745[0m        [35m0.8514[0m  1.3454
     16        [36m0.6959[0m       [32m0.6779[0m        [35m0.8406[0m  1.3303
     17        [36m0.6856[0m       [32m0.6880[0m        [35m0.8321[0m  1.3177
     18        [36m0.6765[0m       [32m0.6913[0m        [35m0.8247[0m  1.2393
     19        [36m0.6681[0m       [32m0.6952[0m        [35m0.8190[0m  1.4055
     20        [36m0.6606[0m       [32m0.6971[0m        [35m0.8146[0m  1.3161
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.7187[0m       [32m0.1135[0m        [35m2.8457[0m  1.4022
      2        [36m2.7921[0m       [32m0.1524[0m        2.8529  1.4622
      3        [36m2.6686[0m       [32m0.2005[0m        [35m2.6142[0m  1.5270
      4        [36m2.6622[0m       0.1784        [35

      3        [36m2.0130[0m       [32m0.3500[0m        [35m2.0140[0m  1.3442
      4        [36m1.3484[0m       [32m0.5591[0m        [35m1.2501[0m  1.3347
      5        [36m0.8931[0m       [32m0.7308[0m        [35m0.8693[0m  1.2728
      6        [36m0.6718[0m       [32m0.7957[0m        [35m0.6839[0m  1.4259
      7        [36m0.5562[0m       [32m0.8207[0m        [35m0.5826[0m  1.3857
      8        [36m0.4881[0m       [32m0.8385[0m        [35m0.5212[0m  1.3556
      9        [36m0.4436[0m       [32m0.8447[0m        [35m0.4805[0m  1.2764
     10        [36m0.4121[0m       [32m0.8538[0m        [35m0.4518[0m  1.3822
     11        [36m0.3884[0m       [32m0.8630[0m        [35m0.4301[0m  1.3883
     12        [36m0.3697[0m       [32m0.8697[0m        [35m0.4134[0m  1.2391
     13        [36m0.3544[0m       [32m0.8736[0m        [35m0.3999[0m  1.5138
     14        [36m0.3417[0m       [32m0.8779[0m        [35m0.3888[0m 

      3        [36m3.2805[0m       0.0385        [35m3.2716[0m  1.5497
      4        [36m3.2773[0m       0.0385        [35m3.2695[0m  1.4555
      5        [36m3.2759[0m       0.0385        [35m3.2684[0m  1.4148
      6        [36m3.2751[0m       0.0385        [35m3.2678[0m  1.4860
      7        [36m3.2747[0m       0.0385        [35m3.2674[0m  1.4812
      8        [36m3.2744[0m       0.0385        [35m3.2672[0m  1.4307
      9        [36m3.2742[0m       0.0385        [35m3.2670[0m  1.6399
     10        [36m3.2741[0m       0.0385        [35m3.2669[0m  1.5485
     11        [36m3.2740[0m       0.0385        [35m3.2668[0m  1.4949
     12        [36m3.2740[0m       0.0385        [35m3.2668[0m  2.2313
     13        [36m3.2739[0m       0.0385        [35m3.2668[0m  2.8619
     14        [36m3.2739[0m       0.0385        [35m3.2668[0m  3.1059
     15        [36m3.2739[0m       0.0385        [35m3.2667[0m  2.7649
     16        [36m3.273

      3        [36m1.7157[0m       [32m0.3591[0m        [35m1.8240[0m  1.3844
      4        [36m1.3434[0m       [32m0.4591[0m        [35m1.5046[0m  1.4005
      5        [36m1.1394[0m       [32m0.5346[0m        [35m1.2991[0m  1.3182
      6        [36m1.0048[0m       [32m0.5740[0m        [35m1.1538[0m  1.4973
      7        [36m0.9156[0m       [32m0.6034[0m        [35m1.0547[0m  1.3818
      8        [36m0.8553[0m       [32m0.6231[0m        [35m0.9854[0m  1.3466
      9        [36m0.8125[0m       [32m0.6418[0m        [35m0.9362[0m  1.4185
     10        [36m0.7806[0m       [32m0.6548[0m        [35m0.9005[0m  1.3545
     11        [36m0.7560[0m       [32m0.6649[0m        [35m0.8736[0m  1.4417
     12        [36m0.7363[0m       [32m0.6774[0m        [35m0.8531[0m  1.3541
     13        [36m0.7201[0m       [32m0.6861[0m        [35m0.8373[0m  1.2621
     14        [36m0.7065[0m       [32m0.6894[0m        [35m0.8244[0m 

In [42]:
## Check the best results:
print('Best parameters given by GridSearchCV are:', grid_best_results.best_params_)

Best parameters given by GridSearchCV are: {'clf__max_epochs': 20, 'clf__module__nonlin': <function tanh at 0x000001A4B1031EE0>, 'clf__module__num_units': 200, 'clf__optimizer': <class 'torch.optim.sgd.SGD'>, 'vect__ngram_range': (1, 1)}


In [43]:
# show results of optimized vectorizer and net on test set

optimized_pipe = grid_best_results.best_estimator_
optimized_pipe.score(newdataframe_sub_testing['text'], y_test_sub)

## 0.9126923076923077 DONE ###

0.9126923076923077