In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import urllib
import string
import re
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB # Doesn't work for Word2Vec because of negative values in word vectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV

from gensim.models import Word2Vec, Phrases

from tqdm import tqdm
tqdm.pandas()

# Logging to display info regarding training of models especially Word2Vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [15]:
# Read the labelled data and leave out row 2994, the only entry which has
# neither "body" nor "title" in its webpageDescription
train = pd.read_csv('dataset/train_data.csv').drop(index=2994)
test = pd.read_csv('dataset/test_data.csv')

# Stack both files into one frame with a fresh 0..n-1 index
merged_data = pd.concat([train, test], ignore_index=True)

# Parse every webpageDescription JSON string into a Python object
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(json.loads)

### Filling in webpageDescription

    Use the body key value if non-empty
    Else use the title key value
    Else use the url key value
    Else just fill it with 'unknown'

In [17]:
def use_body_key(x):
    '''
        Picks the text that represents a webpage from its parsed webpageDescription

        The "body", "title" and "url" keys are tried in that order and the first value that is
        neither None nor made up of whitespace only is returned (as-is, without stripping)
        A key that is missing altogether is handled like an empty value

        Parameters
        -----------------------------
        x: dict obtained from the webpageDescription JSON

        Returns
        -----------------------------
        The selected text, or 'unknown' if none of the three keys holds usable text
    '''
    for key in ('body', 'title', 'url'):
        # get() instead of [] so that a missing key falls through to the next candidate
        value = x.get(key)

        # strip() is used for the emptiness check only, so that blank descriptions don't pass through
        if value is not None and len(value.strip()) > 0:
            return value

    return 'unknown'

# Replace every parsed description by its selected text, then count the entries that are still missing
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(use_body_key)
print(merged_data['webpageDescription'].isna().sum())

0


In [18]:
def preparing_data_for_training(dataset, random_state=42, test_size=0.3):
    '''
        Builds a hold-out split from the labelled rows of the dataset

        Keeps only the rows whose label is present
        Removes label & id from the features because they are not required for training
        Applies train_test_split on the remaining columns
        No encoding or scaling is done here, the remaining columns are used as they are

        Parameters
        -----------------------------
        dataset: DataFrame that contains label, id and the feature columns

        random_state: Passed to train_test_split so that the split is repeatable

        test_size: Passed to train_test_split (default 0.3)

        Returns
        -----------------------------
        X_train, X_test, y_train, y_test
    '''
    labelled_data = dataset[dataset['label'].notna()]

    X = labelled_data.drop(['label', 'id'], axis=1)
    y = labelled_data['label']

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def preparing_data_for_final_submission(dataset):
    '''
        Splits the combined data into what is needed to train on every labelled row
        and to predict on every unlabelled row

        Rows that have a label are treated as training data, rows without a label as test data
        No encoding or scaling is done here

        Returns
        ----------------
        X_train (labelled rows without label & id), y_train (label of the labelled rows) &
        X_test (unlabelled rows without label, id is kept)
    '''
    has_label = dataset['label'].notna()
    labelled_rows = dataset[has_label]
    unlabelled_rows = dataset[~has_label]

    features_train = labelled_rows.drop(columns=['label', 'id'])
    target_train = labelled_rows['label']

    # "id" stays in the unlabelled features
    features_test = unlabelled_rows.drop(columns=['label'])

    return features_train, target_train, features_test

def generate_csv_submission(test, y_final_pred, output_file_name='submission.csv'):
    '''
        Writes a csv file with the two columns id and label (the index is not written)

        Parameters
        -----------------------
        test: Test data that contains id column

        y_final_pred: Values of the label column, one per row of test
                      (e.g. one column of the predict_proba() output for given model and test data)

        output_file_name: Name of submission output file
    '''
    submission_df = pd.DataFrame({"id": test["id"]}).assign(label=y_final_pred)
    submission_df.to_csv(output_file_name, index=False)

Word2Vec expects every input sentence to be a list of words, so the preprocessing function below returns a list of words instead of a single string

Removing stopwords and numbers can be detrimental to the learning process, so they're not removed here

In [19]:
def preprocess_webpage_description(description, remove_stopwords=False, no_empty_lists=False):
    '''
        Converts a raw webpage description into a list of lower-case words

        Parameters
        -----------------------
        description: Raw webpage description (a single string that may contain HTML)

        remove_stopwords: If True, the English stop words of nltk's stopwords corpus are dropped

        no_empty_lists: Not used by this function, it is only kept so that calls
                        which pass it keep working

        Returns
        -----------------------
        List of words (empty if the description contains no English letters or digits)
    '''
    # 1. Remove HTML
    # The parser is named explicitly so that the result does not depend on which optional
    # parsers are installed (without it bs4 picks one on its own and warns about it)
    text = BeautifulSoup(description, "html.parser").get_text()

    # 2. Replace everything that is not a letter or a digit with a space
    # Raw string because "\d" is an invalid escape sequence in a normal string literal
    text = re.sub(r"[^a-zA-Z\d]", " ", text)

    # 3. Convert to lower case, split into individual words
    words = text.lower().split()

    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

### Word2Vec approach: Trigrams data as input sentences

In [20]:
# Deep copy, so that merged_data itself is left untouched by the following steps
processed_data = merged_data.copy(deep=True)

# Turn every description into its list of words (default options, i.e. stop words are kept)
tokenized_description_data = processed_data['webpageDescription'].progress_apply(preprocess_webpage_description)

100%|█████████████████████████████████████| 7394/7394 [00:02<00:00, 2700.03it/s]


In [21]:
# Two chained Phrases() passes:
#   1st pass: Input = tokenized_description_data            => bigrams model
#   2nd pass: Input = descriptions transformed by bigrams   => trigrams model
bigrams = Phrases(tokenized_description_data)
trigrams = Phrases(bigrams[tokenized_description_data])

2021-12-21 14:12:51,138 : INFO : collecting all words and their counts
2021-12-21 14:12:51,140 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-12-21 14:12:55,789 : INFO : collected 1126483 token types (unigram + bigrams) from a corpus of 3349471 words and 7394 sentences
2021-12-21 14:12:55,790 : INFO : merged Phrases<1126483 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-12-21 14:12:55,791 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1126483 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 4.65s', 'datetime': '2021-12-21T14:12:55.791422', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-40-generic-x86_64-with-glibc2.29', 'event': 'created'}
2021-12-21 14:12:55,819 : INFO : collecting all words and their counts
2021-12-21 14:12:55,820 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-12-21 14:13:03,940 : INFO : collected 1277504 

### Training Word2Vec model

In [22]:
# Commenting out to directly use saved model
# num_features = 300
# context = 10

# trigrams_w2v_model = Word2Vec(
#     sentences = trigrams[bigrams[tokenized_description_data]],
#     vector_size=num_features,
#     min_count=1, window=10, workers=4,
#     sg=1
# )

# model_name = 'trigrams_300features_1minwords_10context_sg'

# trigrams_w2v_model.save(model_name)

In [23]:
# Load the saved model from the working directory
# The file name is the same as model_name in the commented-out training cell above
trigrams_w2v_model = Word2Vec.load('trigrams_300features_1minwords_10context_sg')

2021-12-21 14:13:03,977 : INFO : loading Word2Vec object from trigrams_300features_1minwords_10context_sg
2021-12-21 14:13:04,016 : INFO : loading wv recursively from trigrams_300features_1minwords_10context_sg.wv.* with mmap=None
2021-12-21 14:13:04,017 : INFO : loading vectors from trigrams_300features_1minwords_10context_sg.wv.vectors.npy with mmap=None
2021-12-21 14:13:04,137 : INFO : loading syn1neg from trigrams_300features_1minwords_10context_sg.syn1neg.npy with mmap=None
2021-12-21 14:13:04,261 : INFO : setting ignored attribute cum_table to None
2021-12-21 14:13:05,049 : INFO : Word2Vec lifecycle event {'fname': 'trigrams_300features_1minwords_10context_sg', 'datetime': '2021-12-21T14:13:05.049554', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-40-generic-x86_64-with-glibc2.29', 'event': 'loaded'}


### Averaging word vectors to get feature vector

In [24]:
def makeFeatureVec(words, model, num_features):
    '''
        Averages the word vectors of all in-vocabulary words of a single description

        Parameters
        -----------------------
        words: List of words of one description (an empty list is treated as ['unknown'])

        model: Word2Vec model whose word vectors (model.wv) are used

        num_features: Length of the word vectors of the model

        Returns
        -----------------------
        numpy array of shape (num_features,) that contains the average word vector
        If not a single word is in the model's vocabulary, the zero vector is returned
    '''
    # This array will contain the sum of all word vectors for the given description
    featureVec = np.zeros((num_features,), dtype="float32")

    # This counts the number of words from given description whose word vectors are used
    # to compute the overall word embedding for this description
    nwords = 0

    # key_to_index maps every word of the vocabulary to its position, so membership can be
    # tested on it directly and the vocabulary does not have to be copied into a set on each call
    vocabulary = model.wv.key_to_index

    # There are outlier cases where after doing the preprocessing steps, i.e. after removing non-alphanumeric
    # characters there are no words left, so we just use a list containing 'unknown' as the sole word
    # Note: This typically happens with a few entries where description contains only Japanese characters and such
    if len(words) == 0:
        words = ['unknown']

    # Loop over each word in the description and if it is in the model's vocabulary,
    # add its feature vector to the total
    for word in words:
        if word in vocabulary:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])

    # Without any in-vocabulary word there is nothing to average, and dividing by zero
    # would fill the vector with NaN, so the zero vector is returned as it is
    if nwords == 0:
        return featureVec

    # Divide the result by the number of words to get the average
    return np.divide(featureVec, float(nwords))


def getAvgFeatureVecs(descriptions, model, num_features):
    '''
        Calculates the average feature vector of every description

        Parameters
        -----------------------
        descriptions: Collection of descriptions, each one a list of words

        model: Word2Vec model that is handed over to makeFeatureVec

        num_features: Length of the word vectors of the model

        Returns
        -----------------------
        2D float32 numpy array with one row per description and num_features columns
    '''
    # The result is allocated once and then filled row by row
    averaged = np.zeros((len(descriptions), num_features), dtype="float32")

    # tqdm displays a progress bar while the descriptions are processed
    progress = tqdm(descriptions)
    for position, word_list in enumerate(progress):
        averaged[position, :] = makeFeatureVec(word_list, model, num_features)

    return averaged

In [27]:
# num_features = Same as that used for training
# It is read from the loaded model instead of being typed in, so that it cannot get out of sync with the model
vectorized_data = pd.DataFrame(
    getAvgFeatureVecs(
        tokenized_description_data,
        trigrams_w2v_model,
        num_features=trigrams_w2v_model.wv.vector_size
    )
)

100%|███████████████████████████████████████| 7394/7394 [01:23<00:00, 88.29it/s]


### Concatenate label and id with vectorized data

So that the labelled rows can be used for training and the predictions for the unlabelled rows can be matched to their id

In [28]:
# label & id are placed in front of the feature columns, the rows are matched on their index
label_and_id = processed_data[['label', 'id']]
modelling_data = pd.concat([label_and_id, vectorized_data], axis=1)

### Training the best performing MLP architecture, hidden layers = (100, 50, 100, 50), on the vectorized data

In [29]:
# random_state is fixed so that the weight initialisation and the shuffling done by the sgd solver,
# and with them the reported score and the submission, are the same on every run
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50, 100, 50),
    activation='relu',
    learning_rate='adaptive',
    solver='sgd',
    max_iter=500,
    random_state=42
)

In [30]:
# Hold-out evaluation: fit on the training part of the split and score the held-out part
X_train, X_test, y_train, y_test = preparing_data_for_training(modelling_data)

mlp_model.fit(X_train, y_train)

# Only the second column of the predict_proba() output is scored
holdout_scores = mlp_model.predict_proba(X_test)[:, 1]
print("ROC AUC Score of Best MLP Classifier Hyperparameter Model:", roc_auc_score(y_test, holdout_scores))

ROC AUC Score of Best MLP Classifier Hyperparameter Model: 0.8701375292973462




In [32]:
# Fit once more on every labelled row and predict the unlabelled rows for the submission
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(modelling_data)

mlp_model.fit(X_train_final, y_train_final)

# id is only needed for the submission file, so it is left out of the model input
test_features = X_test_final.drop(columns='id')
y_final_pred = mlp_model.predict_proba(test_features)[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'word2vec_mlp_2_trigrams_300features_1minwords_10context_sg.csv')

