In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import urllib
import string
import re
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB # Doesn't work for Word2Vec because of negative values in word vectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV

from gensim.models import Word2Vec, Phrases

from tqdm import tqdm
tqdm.pandas()

# Logging to display info regarding training of models especially Word2Vec
import logging
# Timestamped INFO-level records, so that the training progress of the models gets displayed
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [3]:
# Read the training data and drop row 2994, the only entry which has neither
# "body" nor "title" in its webpageDescription
train = pd.read_csv('dataset/train_data.csv').drop(index=2994)

test = pd.read_csv('dataset/test_data.csv')

# Stack train and test rows into one frame with a fresh 0..n-1 index
merged_data = pd.concat([train, test], ignore_index=True)

# Parse every webpageDescription from its JSON string form
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(json.loads)

### Filling in webpageDescription

    Use the body key value if non-empty
    Else use the title key value
    Else use the url key value
    Else just fill it with 'unknown'

In [3]:
def use_body_key(x):
    '''
        Picks the text that represents a webpage from its parsed webpageDescription

        The first of the "body", "title" and "url" values that is present and not blank
        (after stripping whitespace) is returned unchanged
        If none of them qualifies, the placeholder 'unknown' is returned

        Parameters
        -----------------------
        x: dict obtained by parsing webpageDescription; keys may be missing or hold None

        Returns
        -----------------------
        The selected value, or 'unknown'
    '''
    # Fields are tried in priority order; get() makes a missing key behave like a None value
    for key in ('body', 'title', 'url'):
        value = x.get(key)
        # strip() function is used to ensure that whitespace-only values are treated as blank
        if value is not None and len(value.strip()) > 0:
            return value

    return 'unknown'

# Replace each parsed description dict by the single text chosen by use_body_key
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(use_body_key)

# Count the descriptions that are still missing after the fill
missing_descriptions = merged_data['webpageDescription'].isna().sum()
print(missing_descriptions)

0


In [4]:
def preparing_data_for_training(dataset, random_state=42, test_size=0.3):
    '''
        Builds a hold-out split from the labelled rows of the dataset

        Keeps only the rows whose label is not missing
        Uses label as the target and removes label & id from the features
        Applies train_test_split on the result
        No encoding or feature scaling is applied here

        Parameters
        -----------------------------
        dataset: DataFrame containing label and id columns along with the feature columns

        random_state: Passed to train_test_split as random_state

        test_size: Passed to train_test_split as test_size (default 0.3)

        Returns
        -----------------------------
        X_train, X_test, y_train, y_test
    '''
    labelled_rows = dataset[dataset['label'].notna()]

    X = labelled_rows.drop(['label', 'id'], axis=1)
    y = labelled_rows['label']

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def preparing_data_for_final_submission(dataset):
    '''
        Separates the combined data (train + test) back into its two parts

        Rows with a label are treated as training data
        Rows with a missing label are treated as test data
        No encoding or feature standardization is applied here

        Parameters
        ----------------
        dataset: DataFrame containing label and id columns along with the feature columns

        Returns
        ----------------
        X_train, y_train, X_test
        X_train has label & id removed; X_test has only label removed, so it still contains id
    '''
    is_unlabelled = dataset['label'].isna()

    train_data = dataset[~is_unlabelled]
    test_data = dataset[is_unlabelled]

    X_train = train_data.drop(['label', 'id'], axis=1)
    y_train = train_data['label']

    # Do not drop "id" from X_test
    X_test = test_data.drop(['label'], axis=1)

    return X_train, y_train, X_test

def generate_csv_submission(test, y_final_pred, output_file_name='submission.csv'):
    '''
        Writes a CSV file made of an id column followed by a label column

        Parameters
        -----------------------
        test: Test data that contains id column

        y_final_pred: Values stored in the label column, e.g. the predict_proba() output
        for given model and test data

        output_file_name: Name of submission output file
    '''
    # Start from the id column, then attach the predictions next to it
    submission_df = pd.DataFrame({"id": test["id"]})
    submission_df["label"] = y_final_pred

    # index=False leaves the DataFrame index out of the written file
    submission_df.to_csv(output_file_name, index=False)

word2vec requires a single sentence as input and a sentence is treated as a list of words, so this function returns a list of words

Removing stopwords and numbers can be detrimental to the learning process, so they're not removed here

In [5]:
def preprocess_webpage_description(description, remove_stopwords=False, no_empty_lists=False):
    '''
        Converts a raw webpage description into a list of lowercase words

        Parameters
        -----------------------
        description: Raw webpage description (a single string, may contain HTML)

        remove_stopwords: If True, NLTK's English stopwords are removed from the result

        no_empty_lists: If True and no words are left after preprocessing, ['unknown'] is
        returned instead of an empty list

        Returns
        -----------------------
        List of words
    '''
    # 1. Remove HTML
    # The parser is named explicitly so that the result does not depend on which
    # parsers happen to be installed
    text = BeautifulSoup(description, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter or a digit with a space
    # (raw string, so that \d is not treated as an invalid string escape)
    text = re.sub(r"[^a-zA-Z\d]", " ", text)

    # 3. Convert to lower case, split into individual words
    words = text.lower().split()

    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. A description without any letter or digit ends up with no words at all
    if no_empty_lists and len(words) == 0:
        return ['unknown']

    return words

## Approach 1: Averaging word vectors to get feature vector

Reference: https://www.kaggle.com/c/word2vec-nlp-tutorial/overview/part-2-word-vectors

Word2Vec expects single sentences, each one as a list of words. In other words, the input format is a list of lists.

It is not at all straightforward how to split a paragraph into sentences. There are all kinds of gotchas in natural language. English sentences can end with "?", "!", """, or ".", among other things, and spacing and capitalization are not reliable guides either. For this reason, we'll use NLTK's punkt tokenizer for sentence splitting.

In [5]:
# Load NLTK's English punkt tokenizer, used below for splitting descriptions into sentences
# NOTE(review): presumably requires the punkt resource to be downloaded beforehand — confirm
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
# Define a function to split a description into parsed sentences
def description_to_sentences(description, tokenizer, remove_stopwords=False):
    '''
        Breaks a description into sentences and each sentence into words

        Parameters
        -----------------------
        description: Raw webpage description (a single string)

        tokenizer: Sentence tokenizer whose tokenize() method splits a text into sentences

        remove_stopwords: Forwarded to preprocess_webpage_description

        Returns
        -----------------------
        List of sentences, where each sentence is a list of words
    '''
    # Sentence boundaries are looked for in the whitespace-trimmed description
    # Empty sentences are skipped, all the others are converted into lists of words
    return [
        preprocess_webpage_description(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(description.strip())
        if len(sentence_text) > 0
    ]

Apply this function to every description in the dataset and collect all of the resulting sentences in one flat list, i.e. a list of sentences where each sentence is itself a list of words. Each call already returns a list of sentences, which is why the for loop below uses += instead of the append() method

For example,

    A = [[1,2],[2,3]]
    B = [[3,4],[4,5]]
    A.append(B) = [[1,2],[2,3],[[3,4],[4,5]]] => Not what we want
    A += B => A = [[1,2],[2,3],[3,4],[4,5]] => What we want

This only happens when you're trying to join 2D lists or even higher dimensional lists

# Skip these sections if Approach 1 model is already trained

In [125]:
sentences = []  # Initialize an empty list of sentences

# += (rather than append) keeps sentences a flat list in which every item is one list of words
for desc in tqdm(merged_data['webpageDescription']):
    sentences += description_to_sentences(desc, tokenizer)


100%|█████████████████████████████████████| 7394/7394 [00:05<00:00, 1404.70it/s]


In [126]:
# Show the first sentence to check what the preprocessing produces
print(sentences[0])

['polyvore', 'is', 'the', 'best', 'place', 'to', 'discover', 'or', 'start', 'fashion', 'trends']


### Training Word2Vec model

References for understanding the various parameters of the model:

- https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca
- https://jalammar.github.io/illustrated-word2vec/


    Architecture: Architecture options are skip-gram or continuous bag of words (CBOW). gensim defaults to CBOW (sg=0);
    this notebook selects skip-gram (sg=1), which typically produces better results.
    FIND OUT WHAT WORKS BEST
    
    Training algorithm: Hierarchical softmax or negative sampling (gensim's default)
    FIND OUT WHAT WORKS BEST
    
    Downsampling of frequent words: The Google documentation recommends values between .00001 and .001
    FIND OUT WHAT WORKS BEST
    
    Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better 
    models. Reasonable values can be in the tens to hundreds; we used 300
    
    Context / window size: How many words of context should the training algorithm take into account? 10 seems to 
    work well for hierarchical softmax (more is better, up to a point).
    
    Worker threads: Number of parallel processes to run. 
    This is computer-specific, but between 4 and 6 should work on most systems.
    
    Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not 
    occur at least this many times across all documents is ignored. 
    Reasonable values could be between 10 and 100. 

Choosing parameters is not easy, but once we have chosen our parameters, creating a Word2Vec model is straightforward:

In [127]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 1   # Minimum word count (with 1, no word is dropped from the vocabulary)
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# The training code is kept commented out: the model saved under model_name is
# loaded from disk further down with Word2Vec.load instead of being retrained
# sg=1 selects the skip-gram architecture
# w2v_model = Word2Vec(sentences, window=context, workers=num_workers, vector_size=num_features,
#                     min_count=min_word_count, sample=downsampling, sg=1)

# model_name = "300features_1minwords_10context_sg"
# w2v_model.save(model_name)

2021-12-10 01:19:09,352 : INFO : collecting all words and their counts
2021-12-10 01:19:09,354 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-12-10 01:19:09,874 : INFO : PROGRESS: at sentence #10000, processed 2109284 words, keeping 65616 word types
2021-12-10 01:19:10,148 : INFO : collected 85239 word types from a corpus of 3349470 raw words and 15767 sentences
2021-12-10 01:19:10,150 : INFO : Creating a fresh vocabulary
2021-12-10 01:19:10,504 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 85239 unique words (100.0%% of original 85239, drops 0)', 'datetime': '2021-12-10T01:19:10.504407', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}
2021-12-10 01:19:10,505 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 3349470 word corpus (100.0%% of original 3349470, drops 0)', 'datetime': '2

2021-12-10 01:19:57,651 : INFO : EPOCH 3 - PROGRESS: at 14.49% examples, 129059 words/s, in_qsize 8, out_qsize 0
2021-12-10 01:19:58,731 : INFO : EPOCH 3 - PROGRESS: at 18.74% examples, 128165 words/s, in_qsize 8, out_qsize 0
2021-12-10 01:19:59,792 : INFO : EPOCH 3 - PROGRESS: at 24.06% examples, 131163 words/s, in_qsize 8, out_qsize 0
2021-12-10 01:20:00,798 : INFO : EPOCH 3 - PROGRESS: at 29.23% examples, 132657 words/s, in_qsize 7, out_qsize 0
2021-12-10 01:20:01,878 : INFO : EPOCH 3 - PROGRESS: at 34.63% examples, 133405 words/s, in_qsize 8, out_qsize 0
2021-12-10 01:20:02,910 : INFO : EPOCH 3 - PROGRESS: at 40.92% examples, 134660 words/s, in_qsize 7, out_qsize 0
2021-12-10 01:20:03,971 : INFO : EPOCH 3 - PROGRESS: at 46.75% examples, 135124 words/s, in_qsize 7, out_qsize 0
2021-12-10 01:20:04,997 : INFO : EPOCH 3 - PROGRESS: at 51.47% examples, 134455 words/s, in_qsize 7, out_qsize 0
2021-12-10 01:20:06,003 : INFO : EPOCH 3 - PROGRESS: at 57.12% examples, 135752 words/s, in_qsiz

2021-12-10 01:20:56,175 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=85239, vector_size=300, alpha=0.025)', 'datetime': '2021-12-10T01:20:56.175768', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'created'}
2021-12-10 01:20:56,177 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_1minwords_10context_sg', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-12-10T01:20:56.177084', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'saving'}
2021-12-10 01:20:56,178 : INFO : storing np array 'vectors' to 300features_1minwords_10context_sg.wv.vectors.npy
2021-12-10 01:20:56,246 : INFO : storing np array 'syn1neg' to 300features_1minwords_10context_sg.syn1neg.npy
2021-12-10 01:20:56,295 : INFO : not storing attribu

# If Approach 1 model already trained, start from here

### Using Trained Word2Vec model to convert input descriptions into feature vectors

In [7]:
# Load the Word2Vec model stored under the name used by the (commented out) training code
# NOTE(review): gensim model files are pickle based — only load files from a trusted source
w2v_model = Word2Vec.load('300features_1minwords_10context_sg')

2021-12-10 11:37:45,482 : INFO : loading Word2Vec object from 300features_1minwords_10context_sg
2021-12-10 11:37:45,509 : INFO : loading wv recursively from 300features_1minwords_10context_sg.wv.* with mmap=None
2021-12-10 11:37:45,510 : INFO : loading vectors from 300features_1minwords_10context_sg.wv.vectors.npy with mmap=None
2021-12-10 11:37:45,579 : INFO : loading syn1neg from 300features_1minwords_10context_sg.syn1neg.npy with mmap=None
2021-12-10 11:37:45,649 : INFO : setting ignored attribute cum_table to None
2021-12-10 11:37:46,275 : INFO : Word2Vec lifecycle event {'fname': '300features_1minwords_10context_sg', 'datetime': '2021-12-10T11:37:46.275646', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'loaded'}


### Tokenize webpageDescription

So that its word vectors can be individually accessed

In [8]:
# Deep copy, so that the new column is not added to merged_data itself
processed_data = merged_data.copy(deep=True)

# Convert every description into its list of words, with a progress bar
description_column = processed_data['webpageDescription']
processed_data['tokenizedDescription'] = description_column.progress_apply(
    lambda text: preprocess_webpage_description(text, no_empty_lists=True)
)

100%|█████████████████████████████████████| 7394/7394 [00:01<00:00, 5635.60it/s]


### Convert word vectors into feature vector by averaging technique

For a given webpage description, its feature vector is the average of word vectors of all the words in that description

In [6]:
def makeFeatureVec(words, model, num_features):
    '''
        Averages the word vectors of the words of a description that are in the model's vocabulary

        Parameters
        -----------------------
        words: List of words of a single description
        An empty list is treated as ['unknown']

        model: Trained Word2Vec model; its wv attribute is used for the vocabulary lookup
        and for the word vectors

        num_features: Dimensionality of the word vectors of the model

        Returns
        -----------------------
        float32 numpy array of shape (num_features,)
        It is all zeros when none of the words is in the vocabulary
    '''
    # This array will contain the sum of all word vectors for the given description
    featureVec = np.zeros((num_features,), dtype="float32")

    # This counts the number of words from given description whose word vectors are used
    # to compute the overall word embedding for this description
    nwords = 0

    # key_to_index maps every vocabulary word to its position, so membership can be tested
    # directly instead of rebuilding a set of the whole vocabulary on every call
    vocabulary = model.wv.key_to_index

    # There are outlier cases where after doing the preprocessing steps, i.e. after removing
    # non-alphanumeric characters there are no words left, so 'unknown' is used as the sole word
    # Note: This typically happens with a few entries where description contains only Japanese characters and such
    if len(words) == 0:
        words = ['unknown']

    # Loop over each word in the description and if it is in the model's vocabulary,
    # add its feature vector to the total
    for word in words:
        if word in vocabulary:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])

    # Without any in-vocabulary word there is nothing to average: dividing by zero would
    # fill the vector with NaN, so the zero vector is returned instead
    if nwords == 0:
        return featureVec

    # Divide the result by the number of words to get the average
    return np.divide(featureVec, nwords)


def getAvgFeatureVecs(descriptions, model, num_features):
    '''
        Computes the average feature vector of every description

        Parameters
        -----------------------
        descriptions: Sized collection in which each item is the list of words of one description

        model: Trained Word2Vec model, forwarded to makeFeatureVec

        num_features: Dimensionality of the word vectors of the model

        Returns
        -----------------------
        float32 numpy array of shape (len(descriptions), num_features)
    '''
    # One row per description, allocated up front
    feature_matrix = np.zeros((len(descriptions), num_features), dtype="float32")

    # Rows are filled in iteration order while tqdm displays the progress
    row = 0
    for word_list in tqdm(descriptions):
        feature_matrix[row] = makeFeatureVec(word_list, model, num_features)
        row += 1

    return feature_matrix

In [10]:
# num_features = Same as that used for training
# Each row of vectorized_data holds the averaged word vector of one description
vectorized_data = pd.DataFrame(getAvgFeatureVecs(processed_data['tokenizedDescription'], w2v_model, num_features=300))

100%|██████████████████████████████████████| 7394/7394 [00:47<00:00, 154.05it/s]


### Concatenate label and id with vectorized data

So that predictions can be made with labelled data

In [11]:
# axis=1 places label and id next to the feature columns
# NOTE(review): rows are matched on their index — assumes processed_data has a default 0..n-1 index; confirm
modelling_data = pd.concat([processed_data[['label','id']], vectorized_data], axis=1)

In [13]:
# Hold-out split of the labelled rows, used to score the model locally
X_train, X_test, y_train, y_test = preparing_data_for_training(modelling_data)

In [14]:
# Logistic regression with default settings, fitted on the training part of the split
model = LogisticRegression().fit(X_train, y_train)

# Second column of predict_proba for the held-out part of the split
y_pred = model.predict_proba(X_test)[:, 1]

holdout_auc = roc_auc_score(y_test, y_pred)
print(holdout_auc)

0.8647814792525272


In [15]:
# Labelled rows for fitting, unlabelled rows (which still carry id) for the submission
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

lr_model = LogisticRegression().fit(X_train_final, y_train_final)

# id is not a feature, so it is removed before predicting
test_features = X_test_final.drop('id', axis=1)
y_final_pred = lr_model.predict_proba(test_features)[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_lr_300features_1minwords_10context_sg.csv')

### Kaggle Score of 0.87958

## Approach 2: Using Bigrams and Trigrams and then Averaging

Reference: https://www.kaggle.com/alexcherniuk/imdb-review-word2vec-bilstm-99-acc

We give Trigram sentences as input to the Word2Vec model

For this we first fit Phrases identifier on the tokenized web description data which will give us Bigrams and then send this output to another Phrases identifier to give us Trigrams

We then use,

    bigrams[tokenized_description_data]
to give us output where all the bigrams are identified and then we send this to trigrams to give us output with all the trigrams identified

We then use the Trigram identified sentences as input to the Word2Vec model

    trigrams[bigrams[tokenized_description_data]]
    

In [7]:
# Fresh deep copy of the merged data for this approach
processed_data = merged_data.copy(deep=True)

# One list of words per description, computed with a progress bar
description_column = processed_data['webpageDescription']
tokenized_description_data = description_column.progress_apply(
    lambda text: preprocess_webpage_description(text)
)

100%|█████████████████████████████████████| 7394/7394 [00:02<00:00, 3009.56it/s]


# Skip these cells if Approach 2 model already trained

In [17]:
# Fit the bigram identifier on the tokenized descriptions (one list of words per description)
bigrams = Phrases(sentences=tokenized_description_data)

2021-12-10 12:04:40,779 : INFO : collecting all words and their counts
2021-12-10 12:04:40,782 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-12-10 12:04:45,725 : INFO : collected 1126483 token types (unigram + bigrams) from a corpus of 3349471 words and 7394 sentences
2021-12-10 12:04:45,726 : INFO : merged Phrases<1126483 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-12-10 12:04:45,727 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1126483 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 4.95s', 'datetime': '2021-12-10T12:04:45.727713', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'created'}


Example of how the Phrases structure works

Bigrams are marked with _ to connect the two words in the bigram

Similarly for trigrams and so on

In [46]:
# Apply the fitted bigram identifier to a sample sentence given as a list of words
bigrams["space station near the solar system".split()]

['space_station', 'near', 'the', 'solar_system']

In [18]:
# Second Phrases pass, fitted on the sentences in which the bigrams are already identified
trigrams = Phrases(sentences=bigrams[tokenized_description_data])

2021-12-10 12:04:50,341 : INFO : collecting all words and their counts
2021-12-10 12:04:50,344 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-12-10 12:05:00,912 : INFO : collected 1277504 token types (unigram + bigrams) from a corpus of 2988442 words and 7394 sentences
2021-12-10 12:05:00,913 : INFO : merged Phrases<1277504 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-12-10 12:05:00,914 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1277504 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 10.57s', 'datetime': '2021-12-10T12:05:00.914155', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'created'}


In [56]:
num_features = 300  # Word vector dimensionality
context = 10        # Context window size (the commented out call below hard-codes window=10 instead of using it)

# The training code is kept commented out: the model saved under model_name is
# loaded from disk further down with Word2Vec.load instead of being retrained
# trigrams_w2v_model = Word2Vec(
#     sentences = trigrams[bigrams[tokenized_description_data]],
#     vector_size=num_features,
#     min_count=1, window=10, workers=4,
#     sg=1
# )

# model_name = 'trigrams_300features_1minwords_10context_sg'

# trigrams_w2v_model.save(model_name)


2021-12-10 12:31:19,014 : INFO : collecting all words and their counts
2021-12-10 12:31:19,018 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-12-10 12:31:31,731 : INFO : collected 105256 word types from a corpus of 2817928 raw words and 7394 sentences
2021-12-10 12:31:31,732 : INFO : Creating a fresh vocabulary
2021-12-10 12:31:31,987 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 105256 unique words (100.0%% of original 105256, drops 0)', 'datetime': '2021-12-10T12:31:31.987090', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}
2021-12-10 12:31:31,987 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 2817928 word corpus (100.0%% of original 2817928, drops 0)', 'datetime': '2021-12-10T12:31:31.987939', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]

2021-12-10 12:32:19,183 : INFO : EPOCH 3 - PROGRESS: at 25.51% examples, 111632 words/s, in_qsize 7, out_qsize 0
2021-12-10 12:32:20,192 : INFO : EPOCH 3 - PROGRESS: at 30.31% examples, 111053 words/s, in_qsize 7, out_qsize 0
2021-12-10 12:32:21,222 : INFO : EPOCH 3 - PROGRESS: at 35.20% examples, 111656 words/s, in_qsize 7, out_qsize 0
2021-12-10 12:32:22,223 : INFO : EPOCH 3 - PROGRESS: at 41.26% examples, 112529 words/s, in_qsize 8, out_qsize 0
2021-12-10 12:32:23,224 : INFO : EPOCH 3 - PROGRESS: at 46.21% examples, 112268 words/s, in_qsize 8, out_qsize 0
2021-12-10 12:32:24,234 : INFO : EPOCH 3 - PROGRESS: at 51.24% examples, 112871 words/s, in_qsize 7, out_qsize 0
2021-12-10 12:32:25,336 : INFO : EPOCH 3 - PROGRESS: at 56.32% examples, 112410 words/s, in_qsize 7, out_qsize 1
2021-12-10 12:32:26,349 : INFO : EPOCH 3 - PROGRESS: at 61.18% examples, 112920 words/s, in_qsize 7, out_qsize 0
2021-12-10 12:32:27,419 : INFO : EPOCH 3 - PROGRESS: at 66.74% examples, 112884 words/s, in_qsiz

2021-12-10 12:33:14,549 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'trigrams_300features_1minwords_10context_sg', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-12-10T12:33:14.549851', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'saving'}
2021-12-10 12:33:14,551 : INFO : storing np array 'vectors' to trigrams_300features_1minwords_10context_sg.wv.vectors.npy
2021-12-10 12:33:14,609 : INFO : storing np array 'syn1neg' to trigrams_300features_1minwords_10context_sg.syn1neg.npy
2021-12-10 12:33:14,663 : INFO : not storing attribute cum_table
2021-12-10 12:33:14,703 : INFO : saved trigrams_300features_1minwords_10context_sg


# Start from these cells if Approach 2 model already trained

In [9]:
# Load the Word2Vec model stored under the name used by the (commented out) training code
# NOTE(review): gensim model files are pickle based — only load files from a trusted source
trigrams_w2v_model = Word2Vec.load('trigrams_300features_1minwords_10context_sg')

2021-12-11 19:47:29,128 : INFO : loading Word2Vec object from trigrams_300features_1minwords_10context_sg
2021-12-11 19:47:29,183 : INFO : loading wv recursively from trigrams_300features_1minwords_10context_sg.wv.* with mmap=None
2021-12-11 19:47:29,184 : INFO : loading vectors from trigrams_300features_1minwords_10context_sg.wv.vectors.npy with mmap=None
2021-12-11 19:47:29,224 : INFO : loading syn1neg from trigrams_300features_1minwords_10context_sg.syn1neg.npy with mmap=None
2021-12-11 19:47:29,260 : INFO : setting ignored attribute cum_table to None
2021-12-11 19:47:30,493 : INFO : Word2Vec lifecycle event {'fname': 'trigrams_300features_1minwords_10context_sg', 'datetime': '2021-12-11T19:47:30.493473', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-38-generic-x86_64-with-glibc2.29', 'event': 'loaded'}


In [10]:
# index_to_key lists the words of the model's vocabulary, so its length is the vocabulary size
print("Vocabulary size:", len(trigrams_w2v_model.wv.index_to_key))

Vocabulary size: 105256


In [11]:
# Sanity check of the embeddings: the vocabulary entries most similar to "news"
trigrams_w2v_model.wv.most_similar("news")

[('breaking_news', 0.7118445038795471),
 ('local_news', 0.6580551266670227),
 ('world_news', 0.6528180837631226),
 ('news_online', 0.6391521692276001),
 ('daily_news', 0.6368198990821838),
 ('politics', 0.6356375217437744),
 ('dallas_fort_worth', 0.6308645009994507),
 ('uk_news', 0.629724383354187),
 ('cnn', 0.6218330264091492),
 ('headlines', 0.6150000095367432)]

In [12]:
# Sanity check of the embeddings: which entry of the list fits the others the least
trigrams_w2v_model.wv.doesnt_match(['steve_jobs', 'apple', 'banana', 'bill_gates'])

'banana'

In [13]:
# Each row of vectorized_data holds the averaged word vector of one description
# NOTE(review): the commented out training code feeds trigrams[bigrams[...]] to the model,
# while the plain word lists are averaged here, so phrase tokens such as 'breaking_news'
# are never looked up — confirm that this is intended
vectorized_data = pd.DataFrame(getAvgFeatureVecs(tokenized_description_data, trigrams_w2v_model, num_features=300))

100%|██████████████████████████████████████| 7394/7394 [00:53<00:00, 139.36it/s]


In [14]:
# axis=1 places label and id next to the feature columns
# NOTE(review): rows are matched on their index — assumes processed_data has a default 0..n-1 index; confirm
modelling_data = pd.concat([processed_data[['label','id']], vectorized_data], axis=1)

In [24]:
# Hold-out split of the labelled rows, used to score the model locally
X_train, X_test, y_train, y_test = preparing_data_for_training(modelling_data)

# Logistic regression with default settings, fitted on the training part of the split
model = LogisticRegression().fit(X_train, y_train)

# Second column of predict_proba for the held-out part of the split
y_pred = model.predict_proba(X_test)[:, 1]

holdout_auc = roc_auc_score(y_test, y_pred)
print(holdout_auc)

0.8625265675825976


In [25]:
# Labelled rows for fitting, unlabelled rows (which still carry id) for the submission
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

lr_model = LogisticRegression().fit(X_train_final, y_train_final)

# id is not a feature, so it is removed before predicting
test_features = X_test_final.drop('id', axis=1)
y_final_pred = lr_model.predict_proba(test_features)[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_lr_trigrams_300features_1minwords_10context_sg.csv')

### Kaggle Score of 0.88343

### MLP Classifier

Using hyperparameter tuning to try out different architectures as well as tune the parameters

In [28]:
# Search space: network architectures together with the main optimisation settings
# NOTE(review): this grid has 5 * 2 * 2 * 5 * 2 = 200 combinations, while the recorded
# output of the fit reports 400 candidates — the output seems to come from another grid
param_grid = dict(
    hidden_layer_sizes=[(50,50,50), (50,100,50), (100,), (100, 50, 50), (100, 50, 50, 50)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=np.linspace(0.0001, 0.05, num=5),
    learning_rate=['constant','adaptive'],
)

# 5-fold cross-validated search scored by ROC AUC; n_jobs=-1 uses all the processors
mlp_grid_cv = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

In [29]:
# The search is kept commented out; the output recorded below comes from an earlier run of it
# mlp_grid_cv.fit(X_train, y_train)

# Best hyperparameter combination found by the search
# print(mlp_grid_cv.best_params_)

# Hold-out ROC AUC of the best estimator found by the search
# print("ROC AUC Score of Best MLP Classifier Hyperparameter Model:", roc_auc_score(y_test, mlp_grid_cv.best_estimator_.predict_proba(X_test)[:, 1]))

Fitting 5 folds for each of 400 candidates, totalling 2000 fits














[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  12.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=  13.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.8s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=  11.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  14.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  16.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptiv

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  12.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam; total time=  12.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=sgd; total time=  11.5s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd; total time=  11.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=adam; total time=  15.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=  15.5s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=a

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam; total time=  14.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  13.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   8.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd; total time=  12.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  14.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=  15.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  13.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam; total time=   5.5s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  13.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   7.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  15.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  14.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, s

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam; total time=  13.8s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  13.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=  11.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=adam; total time=  15.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  16.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, so

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam; total time=  13.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam; total time=   8.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=  13.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  11.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  14.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  14.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant,













[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam; total time=   6.9s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  13.1s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.5s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd; total time=  12.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=adam; total time=  11.8s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=  15.8s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  16.0s
[CV] EN

[CV] END activation=tanh, alpha=0.03336666666666667, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  16.2s
[CV] END activation=tanh, alpha=0.03336666666666667, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  12.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam; total time=   4.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.6s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  13.3s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   7.0s
[CV

[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam; total time=  12.3s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  13.0s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   5.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   8.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  15.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=   9.8s
[CV] END activati

[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  11.6s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam; total time=  12.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=constant, solver=sgd; total time=  11.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   8.0s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  15.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=   8.7s
[CV] END activation

[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  11.6s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.6s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=  12.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  12.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.7s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=adam; total time=  12.0s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  15.7s
[CV] END ac

[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  11.5s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam; total time=  11.3s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.4s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=constant, solver=sgd; total time=  11.8s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   7.2s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=adam; total time=  13.7s
[CV] END activation=tanh, alpha=0.03891111111111112, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=  15.9s
[CV] END activatio













{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
ROC AUC Score of Best Random Forest Hyperparameter Model: 0.8650614387250608




Best parameters:

    {
        'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50, 50, 50),
        'learning_rate': 'adaptive', 'solver': 'sgd'
    }

In [30]:
# Build the final-submission splits via the notebook's helper, then refit the
# grid-search winner on them and score the test rows.
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

# NOTE(review): this refits best_estimator_ in place, so re-running the
# hold-out evaluation cell afterwards would score a model fitted on
# X_train_final rather than on the hold-out training split.
best_mlp = mlp_grid_cv.best_estimator_
best_mlp.fit(X_train_final, y_train_final)

# 'id' is dropped before prediction; the full frame (with 'id') is still handed
# to generate_csv_submission below.
test_features = X_test_final.drop(columns='id')
# Column 1 of predict_proba -- presumably the positive class; confirm via classes_.
y_final_pred = best_mlp.predict_proba(test_features)[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_mlp_trigrams_300features_1minwords_10context_sg.csv')



[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=  13.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=constant, solver=sgd; total time=  11.9s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd; total time=  12.2s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=adam; total time=  11.0s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  14.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  12.6s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  17.2s
[CV] END



[CV] END activation=relu, alpha=0.016733333333333333, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  12.2s
[CV] END activation=relu, alpha=0.016733333333333333, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  10.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam; total time=  11.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  12.4s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.0s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=  13.5s
[CV] E



[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=   9.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.3s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.9s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  15.1s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=  11.5s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  14.9s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  11.8s




[CV] END activation=relu, alpha=0.016733333333333333, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  13.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  11.4s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.5s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  12.2s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  12.6s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=  13.7s
[CV] END activ



[CV] END activation=relu, alpha=0.016733333333333333, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  15.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=sgd; total time=  11.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.4s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=  13.5s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.2s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd; total time=  12.4s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  15.0s
[CV] END

[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.1s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam; total time=  13.4s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  14.3s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd; total time=  12.3s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  15.0s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=adam; total time=  15.0s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  13.7s
[CV] E



[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  12.6s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  13.1s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  12.9s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=  13.2s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  14.6s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  14.8s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  15.6s
[CV] END a



[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  11.5s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=sgd; total time=  12.4s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam; total time=  13.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=  13.0s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=constant, solver=sgd; total time=  14.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  14.7s
[CV] END activation=relu, alpha=0.02227777777777778, hidden_layer_sizes=(100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  16.2s
[CV] EN



### Kaggle Score of 0.88678 (Best Submission yet)

### Random Forest

In [39]:
# Hold-out evaluation of a Random Forest with hand-picked hyperparameters.
X_train, X_test, y_train, y_test = preparing_data_for_training(modelling_data)

# NOTE(review): no random_state is set, so the score below varies between runs.
rf_hyperparams = dict(n_estimators=1200, min_samples_split=27, min_samples_leaf=3)
# fit() returns the fitted estimator, so construction and training can be chained.
model = RandomForestClassifier(**rf_hyperparams).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for ROC AUC.
y_pred = model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

0.8738385808276643


In [41]:
# Train the tuned Random Forest on the final-submission training split and
# write the Kaggle submission file.
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

# Use the same tuned hyperparameters that were evaluated on the hold-out split.
# The cell previously built a default RandomForestClassifier() while still
# writing to the "rf_tuned" file, so the saved notebook did not reproduce the
# tuned submission its filename describes.
model = RandomForestClassifier(n_estimators=1200, min_samples_split=27, min_samples_leaf=3)
model.fit(X_train_final, y_train_final)

# 'id' is dropped before prediction; the full frame (with 'id') is still handed
# to generate_csv_submission below.
y_final_pred = model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_rf_tuned_trigrams_300features_1minwords_10context_sg.csv')

### Kaggle Score of 0.88265

For comparison, the default (untuned) Random Forest model scored 0.88133.

### SVM

In [44]:
# Hold-out evaluation of an SVM with default hyperparameters.
X_train, X_test, y_train, y_test = preparing_data_for_training(modelling_data)

# probability=True is required for predict_proba to be available on SVC.
# fit() returns the fitted estimator, so construction and training are chained.
model = SVC(probability=True).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for ROC AUC.
y_pred = model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

0.8662936413014497


In [45]:
# Train the SVM on the final-submission training split and write the Kaggle
# submission file.
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

# NOTE(review): the output filename says "tuned", but SVC is created with
# default hyperparameters apart from probability=True -- confirm which is intended.
model = SVC(probability=True).fit(X_train_final, y_train_final)

# 'id' is dropped before prediction; the full frame (with 'id') is still handed
# to generate_csv_submission below.
svm_test_features = X_test_final.drop(columns='id')
y_final_pred = model.predict_proba(svm_test_features)[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_svm_tuned_trigrams_300features_1minwords_10context_sg.csv')

### More MLP Classifier architectures

In [25]:
# Search space for deeper MLP architectures: 6 layer layouts x 2 activations
# x 2 solvers x 4 alpha values x 2 learning-rate schedules = 192 candidates.
param_grid = dict(
    hidden_layer_sizes=[
        (100, 50, 100, 50),
        (100, 50, 100, 50, 50),
        (100, 50, 100, 50, 50, 50),
        (100, 50, 100, 50, 50, 100, 50, 50),
        (100, 100, 100, 100, 100, 100, 100, 100, 100, 50),
        (100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 50),
    ],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    # Four evenly spaced L2 penalties between 0.0001 and 0.05 (inclusive).
    alpha=np.linspace(0.0001, 0.05, num=4),
    learning_rate=['constant', 'adaptive'],
)

# 5-fold cross-validated exhaustive search scored by ROC AUC, using all cores.
# NOTE(review): MLPClassifier has no random_state here, so results vary between runs.
mlp_grid_cv = GridSearchCV(
    estimator=MLPClassifier(max_iter=500),
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

In [26]:
# Recreate the hold-out split; X_test / y_test are kept for the evaluation cell.
X_train, X_test, y_train, y_test = preparing_data_for_training(modelling_data)

# Run the full grid search on the training part only (960 fits per the log below).
mlp_grid_cv.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=constant, solver=sgd; total time=  20.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=adaptive, solver=adam; total time=  22.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=constant, solver=sgd; total time=  44.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  33.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  28.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=constant, solver=adam; total time=  12.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=constant, solver=sgd; total time=  15.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  41.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  38.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  25.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  21.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=constant, solver=adam; total time=  37.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100, 100, 50), learning_rate=constant, solver=sgd; total time

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=constant, solver=adam; total time=  22.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=adaptive, solver=adam; total time=  29.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=constant, solver=adam; total time=  28.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=adaptive, solver=adam; total time=  12.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  28.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  19.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=constant, solver=adam; total time=  29.6s
[CV] END activa

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=constant, solver=sgd; total time=  32.8s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=constant, solver=sgd; total time=  33.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  27.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=constant, solver=adam; total time=  26.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=adaptive, solver=adam; total time=  20.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=constant, solver=sgd; total time=  34.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=adaptive, solver=adam; total time=  28.7s
[

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=constant, solver=adam; total time=  30.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=adaptive, solver=adam; total time=  37.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  22.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=constant, solver=sgd; total time=  37.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=constant, solver=sgd; total time=  32.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  33.2s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100, 100, 50), learning_rate=constant, solver=adam; t

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50), learning_rate=adaptive, solver=sgd; total time=  30.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=constant, solver=sgd; total time=  45.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50), learning_rate=adaptive, solver=adam; total time=  28.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 50), learning_rate=adaptive, solver=sgd; total time=  27.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=constant, solver=sgd; total time=  28.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 50, 100, 50, 50, 100, 50, 50), learning_rate=adaptive, solver=sgd; total time=  30.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100, 100, 50), learning_rate=constant, solver=adam



GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=500), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': array([0.0001    , 0.01673333, 0.03336667, 0.05      ]),
                         'hidden_layer_sizes': [(100, 50, 100, 50),
                                                (100, 50, 100, 50, 50),
                                                (100, 50, 100, 50, 50, 50),
                                                (100, 50, 100, 50, 50, 100, 50,
                                                 50),
                                                (100, 100, 100, 100, 100, 100,
                                                 100, 100, 100, 50),
                                                (100, 100, 100, 100, 100, 100,
                                                 100, 100, 100, 100, 50)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']},
          

In [28]:
# Report the winning hyperparameter combination from the grid search.
print(mlp_grid_cv.best_params_)

# Score the best estimator on the hold-out split; column 1 of predict_proba is
# used as the score for ROC AUC.
best_mlp_holdout_proba = mlp_grid_cv.best_estimator_.predict_proba(X_test)[:, 1]
print("ROC AUC Score of Best MLP Classifier Hyperparameter Model:", roc_auc_score(y_test, best_mlp_holdout_proba))

{'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (100, 50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
ROC AUC Score of Best MLP Classifier Hyperparameter Model: 0.8701184844352692


In [29]:
# Build the final-submission splits via the notebook's helper, then refit the
# winner of the deeper-architecture grid search and score the test rows.
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

# NOTE(review): this refits best_estimator_ in place, so re-running the
# hold-out evaluation cell afterwards would score a model fitted on
# X_train_final rather than on the hold-out training split.
best_mlp = mlp_grid_cv.best_estimator_
best_mlp.fit(X_train_final, y_train_final)

# 'id' is dropped before prediction; the full frame (with 'id') is still handed
# to generate_csv_submission below.
test_features = X_test_final.drop(columns='id')
y_final_pred = best_mlp.predict_proba(test_features)[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_mlp_2_trigrams_300features_1minwords_10context_sg.csv')

