# CHALLENGE 4

Steps done: 
- Load the dataset
- NLP Processing: Tokenizing, Cleaning, Normalization
- Transform the text to tf-idf features
- Train a baseline model (Logistic Regression)
- Make a first submission

Todo list:
- Try a pre-trained model such as Google's ELECTRA, e.g. through the simpletransformers library
- Try to augment the data with this algo: https://github.com/jasonwei20/eda_nlp

Check if we are using the GPU

In [None]:
# Ask TensorFlow for the name of the GPU device it can see.
# NOTE(review): the recorded output below is an empty string, which presumably
# means no GPU was found in that session — confirm.
import tensorflow as tf
tf.test.gpu_device_name()

''

In [None]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): only np.random is seeded here; train_test_split further down is
# given its own fixed random_state instead of this seed.
import numpy as np
seed = 7
np.random.seed(seed)

In [None]:
# Notebook shell escape: print the version of the installed CUDA compiler.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


# DATASET CLASS

In [None]:
#Load packages
import os, sys
import numpy as np
import pandas as pd

class Dataset:
    """Load the training comments, training labels and Kaggle test comments.

    Every file is read with ``pandas.read_csv(..., header=None)`` from the
    directory stored in ``self._currPath``. Each loader keeps two views of
    what it read: the DataFrame itself (returned by the "List" getters) and a
    NumPy array built from it (returned by the "Array" getters).

    A getter raises ``AttributeError`` if its loader has not been called yet.
    """

    def __init__(self):
        # "__file__" is a quoted literal, not the module attribute, so
        # abspath() resolves it against the current working directory and
        # dirname() yields that directory: the data files are looked up in
        # the working directory.
        self._currPath = os.path.dirname(os.path.abspath("__file__"))

    def _ReadFile(self, fileName):
        """Read ``fileName`` from the data directory.

        Returns:
            A ``(DataFrame, ndarray)`` pair holding the same content.
        """
        frame = pd.read_csv(os.path.join(self._currPath, fileName), header=None)
        return frame, np.array(frame)

    def LoadX(self):
        """Load the inputs (``train.txt``)."""
        self._listComments, self._arrayComments = self._ReadFile("train.txt")

    def LoadY(self):
        """Load the outputs (``train_labels.txt``)."""
        self._listLabels, self._arrayLabels = self._ReadFile("train_labels.txt")

    def LoadKaggleTest(self):
        """Load the Kaggle test set (``test.txt``)."""
        self._listCommentsTest, self._arrayCommentsTest = self._ReadFile("test.txt")

    def GetListComments(self):
        """Get the inputs as read by pandas (a DataFrame)."""
        return self._listComments

    def GetArrayComments(self):
        """Get the inputs as a NumPy array."""
        return self._arrayComments

    def GetListLabels(self):
        """Get the outputs as read by pandas (a DataFrame)."""
        return self._listLabels

    def GetArrayLabels(self):
        """Get the outputs as a NumPy array."""
        return self._arrayLabels

    def GetListCommentsTest(self):
        """Get the Kaggle test set as read by pandas (a DataFrame)."""
        return self._listCommentsTest

    def GetArrayCommentsTest(self):
        """Get the Kaggle test set as a NumPy array."""
        return self._arrayCommentsTest

In [None]:
def printList(li):
    """Print ``len(li)``, then ``li[0][i]`` for every ``i`` below that length.

    NOTE(review): the ``li[0][i]`` indexing presumably targets the
    single-column DataFrames returned by the Dataset getters (column 0,
    row i) — confirm before passing a plain list.
    """
    count = len(li)
    print(count)
    for position in range(count):
        entry = li[0][position]
        print(entry)

In [None]:
# Build the dataset wrapper and read the training comments, the training
# labels and the Kaggle test comments; each Load* call stores its result on
# the object for the getters used further down.
dataset = Dataset()
dataset.LoadX()
dataset.LoadY()
dataset.LoadKaggleTest()
# Uncomment to dump the loaded comments / labels:
#printList(dataset.GetListComments())
#printList(dataset.GetListLabels())

# NLP PREPROCESSING
Tokenizing the Text: split each sentence into its individual words

Cleaning Text Data: Removing Stopwords, Lexicon 

Normalization: a way of processing words that reduces them to their roots

Tokenizing

In [None]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters (one string) that spacy_tokenizer filters tokens against
punctuations = string.punctuation

# spaCy's English stop-word collection, also used by spacy_tokenizer.
# NOTE(review): `nlp` is not referenced anywhere else in the visible code, and
# the "en" shortcut name presumably only resolves on older spaCy releases —
# confirm before upgrading spaCy.
nlp = spacy.load("en")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# English language object that spacy_tokenizer calls on each sentence.
# NOTE(review): English() presumably builds a bare pipeline (tokenizer only,
# no tagger/parser/NER/vectors); the recorded token output still contains
# plurals such as 'programs' and 'teachers', so lemmatization looks
# ineffective here — confirm.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Turn a sentence into a filtered list of lowercase token strings.

    The sentence is run through the module-level ``parser``. Each token is
    replaced by its lowercased, stripped lemma, except tokens whose lemma is
    the "-PRON-" placeholder, which keep their own lowercase text. Results
    found in ``stop_words`` or in ``punctuations`` are dropped.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()

        # `punctuations` is a string, so this is a substring test
        if text in stop_words or text in punctuations:
            continue
        kept.append(text)

    return kept

Cleaning: strip leading and trailing whitespace and convert the text to lowercase

In [None]:
from sklearn.base import TransformerMixin
# Pipeline-compatible wrapper around clean_text
class predictors(TransformerMixin):
    """Stateless transformer that applies clean_text to every input text."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding clean_text(text) for each text in X."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def get_params(self, deep=True):
        """Report an empty parameter mapping."""
        return dict()

# Text normalisation helper
def clean_text(text):
    """Return ``text`` with surrounding whitespace stripped, then lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

Apply functions to our dataset

In [None]:
# Clean every training comment (column 0 of the loaded comments)
my_doc = dataset.GetListComments()

clean_txt = [clean_text(sentence) for sentence in my_doc[0]]
print(clean_txt[0], clean_txt[1])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned 

In [None]:
# Tokenize every cleaned comment into its list of word tokens
token_list = list(map(spacy_tokenizer, clean_txt))
print(token_list[0], token_list[1])

['bromwell', 'high', 'cartoon', 'comedy', 'ran', 'time', 'programs', 'school', 'life', 'teachers', 'years', 'teaching', 'profession', 'lead', 'believe', 'bromwell', 'high', 's', 'satire', 'closer', 'reality', 'teachers', 'scramble', 'survive', 'financially', 'insightful', 'students', 'right', 'pathetic', 'teachers', 'pomp', 'pettiness', 'situation', 'remind', 'schools', 'knew', 'students', 'saw', 'episode', 'student', 'repeatedly', 'tried', 'burn', 'school', 'immediately', 'recalled', 'high', 'classic', 'line', 'inspector', 'm', 'sack', 'teachers', 'student', 'welcome', 'bromwell', 'high', 'expect', 'adults', 'age', 'think', 'bromwell', 'high', 'far', 'fetched', 'pity', 'isn', 't'] ['story', 'man', 'unnatural', 'feelings', 'pig', 'starts', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'insane', 'violent', 'mob', 'crazy', 'chantings', 's', 'singers', 'unfortunately', 'stays', 'absurd', 'time', 'general', 'narrative', 'eventua

In [None]:
# Collapse each token list back into one ", "-separated string per comment
joined_docs = [', '.join(tokens) for tokens in token_list]
X = np.array(joined_docs)

# TF-IDF

TF-IDF (term frequency–inverse document frequency) is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Encode the labels as integers. The label array is built from a one-column
# DataFrame, so flatten it to 1-D first; passing the column vector makes
# scikit-learn emit the `column_or_1d` conversion warning.
encoder = preprocessing.LabelEncoder()
Y = encoder.fit_transform(dataset.GetArrayLabels().ravel())

# Hold out 10% of the training data for local evaluation
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

print(X_train.shape)

# Bag-of-words counts followed by TF-IDF weighting, fitted on the train split only
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
)

X_train = pipe.fit_transform(X_train.ravel())
X_test = pipe.transform(X_test)

# The Kaggle comments must go through the same cleaning and tokenizing as the
# training comments before vectorizing; otherwise the model is applied to
# text preprocessed differently from what it was trained on.
kaggle_docs = [
    ', '.join(spacy_tokenizer(clean_text(comment)))
    for comment in dataset.GetListCommentsTest()[0]
]
X_testKaggle = pipe.transform(kaggle_docs)


  y = column_or_1d(y, warn=True)


(15750,)


# ML TRAINING MODELS

## Baseline: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report
from collections import Counter

def LR():
    """Fit a default LogisticRegression on the module-level train split.

    Prints the sorted class counts of ``y_train``, the classification report
    for the held-out split and the mean absolute error of the predictions.

    Returns:
        tuple: the fitted classifier and its predictions for ``X_test``.
    """
    class_counts = Counter(y_train)
    print(sorted(class_counts.items()))

    model = LogisticRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(classification_report(y_test, predictions))
    print(mean_absolute_error(y_test, predictions))
    return model, predictions

In [None]:
# Train and evaluate the baseline; keep the fitted model and its held-out predictions
logreg, ypred = LR()

[(0, 7866), (1, 7884)]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       884
           1       0.89      0.89      0.89       866

    accuracy                           0.89      1750
   macro avg       0.89      0.89      0.89      1750
weighted avg       0.89      0.89      0.89      1750

0.112


In [None]:
# Predict the Kaggle test set with the baseline model and write the result to preds.csv.
# NOTE(review): preds holds the LabelEncoder-encoded classes, not the original
# label values — confirm this is the format the submission expects.
preds = logreg.predict(X_testKaggle)
#print(preds)
np.savetxt("preds.csv", preds, delimiter=",",fmt='%s')