# CHALLENGE 4

Steps done: 
- Load the dataset
- NLP Processing: Tokenizing, Cleaning, Normalization
- Transform the text to tf-idf features
- Train a baseline model (Logistic Regression)
- Make a first submission
- Try pre-trained model like Google Electra model and simpletransformers lib

Todo list:
- Try to augment the data with this algo: https://github.com/jasonwei20/eda_nlp

Check if we are using the GPU

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
#initialize seed to be sure that the results can be reproduced
import numpy as np
seed = 7
np.random.seed(seed)

# DATASET CLASS

In [None]:
#Load packages
import os, sys
import numpy as np
import pandas as pd

class Dataset:
    """Class for loading the dataset"""
    def __init__(self):
        self._currPath = os.path.dirname(os.path.abspath("__file__")) #root path of the full the project
    
    def LoadX(self):
        """Load the inputs"""
        self._listComments = pd.read_csv(self._currPath+"/train.txt", header=None)
        commentsNp = np.array(self._listComments)
        self._arrayComments = commentsNp
    
    def LoadY(self):
        """Load the outputs"""
        self._listLabels = pd.read_csv(self._currPath+"/train_labels.txt", header=None)
        labelsNp = np.array(self._listLabels)
        self._arrayLabels = labelsNp
      
    def LoadKaggleTest(self):
        """Load the Kaggle test set"""
        self._listCommentsTest = pd.read_csv(self._currPath+"/test.txt", header=None)
        commentsTestNp = np.array(self._listCommentsTest)
        self._arrayCommentsTest = commentsTestNp
    
    def GetListComments(self):
        """Get the list of inputs"""
        return self._listComments
    
    def GetArrayComments(self):
        """Get the array of inputs"""
        return self._arrayComments
    
    def GetListLabels(self):
        """Get the list of outputs"""
        return self._listLabels
    
    def GetArrayLabels(self):
        """Get the array of outputs"""
        return self._arrayLabels
    
    def GetListCommentsTest(self):
        """Get the Kaggle test set as a list"""
        return self._listCommentsTest

In [None]:
def printList(li):
    """Display the list"""
    print(len(li))
    for i in range(len(li)):
        print(li[0][i])

In [None]:
dataset = Dataset()
dataset.LoadX()
dataset.LoadY()
dataset.LoadKaggleTest()
#printList(dataset.GetListComments())
#printList(dataset.GetListLabels())

# MLTRAINING MODELS

##Electra

Transform the labels into integers, reshape the data so we can use simpletransformers and split them into train, test sets

In [30]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

encoder = preprocessing.LabelEncoder()
X = dataset.GetArrayComments()
Y = encoder.fit_transform(dataset.GetArrayLabels())
Y = np.reshape(Y,(X.shape[0],1))
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.9, random_state=42)

  y = column_or_1d(y, warn=True)


Merge X_train, y_train and X_test, y_test as simpletransformers take dataframe inputs with X and Y combined

In [31]:
train_data = np.concatenate((X_train, y_train), axis=1)
eval_data = np.concatenate((X_test, y_test), axis=1)
train_df = pd.DataFrame(train_data)
eval_df = pd.DataFrame(eval_data)

In [None]:
pip install simpletransformers

Finally train Electra model using simpletransformers lib

In [None]:

from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

model_args = ClassificationArgs()
model_args.reprocess_input_data: True
model_args.num_train_epochs = 1
model_args.learning_rate = 1e-5
model_args.overwrite_output_dir =  True
model_args.max_seq_length: 128
model_args.vocab_size: 52000
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01
model_args.early_stopping_metric = "mae"
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 5
model_args.evaluate_during_training_steps = 500

# Create a ClassificationModel
model = ClassificationModel('electra', 'google/electra-base-discriminator', args=model_args) 

# Train the model
model.train_model(train_df)


Eval the model and print the results

In [26]:
from sklearn.metrics import mean_absolute_error
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=mean_absolute_error)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1750.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=219.0), HTML(value='')))

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8299750855531114, 'tp': 797, 'tn': 804, 'fp': 85, 'fn': 64, 'acc': 0.08514285714285715, 'eval_loss': 0.2658670340839042}





In [28]:
from sklearn.metrics import classification_report

print(model_outputs)
predictionsTest = [x.index(max(x)) for x in model_outputs.tolist()]
print(predictionsTest)
print(y_test.shape)
score = mean_absolute_error(y_test, predictionsTest)
print(score)
print(classification_report(y_test,predictionsTest))

[[ 2.48828125 -3.02148438]
 [ 2.41796875 -2.94140625]
 [-0.48608398  0.54443359]
 ...
 [ 2.25195312 -2.81835938]
 [ 2.68164062 -3.29296875]
 [-0.73291016  0.82519531]]
[0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 

Predict on the Kaggle test set

In [34]:
testKaggle = dataset.GetListCommentsTest()
testKaggle = np.array(testKaggle)
testKaggle = np.reshape(testKaggle,testKaggle.shape[0])
print(testKaggle)
predictions, raw_outputs = model.predict(testKaggle)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


['one reason pixar has endured so well  and been so successful  is that while their films remain technical marvels and visual mosaics  they have a story to match their style . and often very moving style at that affecting  charming and cross  generational . that a lot anime  speaking in broad terms  and a great many other animations fail to match their technical virtuosity with real substance is  i think  and i might be wrong  partly because either the makers aren  t bothered with character and plot and focus far too much on sound and image  or the sheer effort that goes into making some animations is so enormous  so enervating that they don  t have the energy to create a really engaging story .  br    br   that same cannot be said of renaissance . there are flaws in its plot  but i  ll get to that later . those same flaws  however  are not reflected in the visuals  renaissance is nowt short of stunning . the ultra  high contrast images  sometimes so high  contrast that is nothing but 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7501.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=938.0), HTML(value='')))




In [35]:
print(predictions.shape)
np.savetxt("preds7.csv", predictions, delimiter=",",fmt='%s')

(7501,)
