In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import re
import string
import urllib
import urllib.parse
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB # Doesn't work for Word2Vec because of negative values in word vectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV

from gensim.models import Word2Vec, Phrases

from tqdm import tqdm
tqdm.pandas()

# Emit INFO-level, timestamped log lines so that long-running model training
# (Word2Vec in particular) reports its progress
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [2]:
# Read the training set and discard row 2994, the only entry whose
# webpageDescription has neither a "body" nor a "title"
train = pd.read_csv('dataset/train_data.csv').drop(index=2994)

test = pd.read_csv('dataset/test_data.csv')

# Stack train on top of test under a fresh 0..n-1 index
merged_data = pd.concat([train, test], ignore_index=True)

# Parse every webpageDescription JSON string into a Python object
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(json.loads)

### Filling in webpageDescription

    Use the body key value if non-empty
    Else use the title key value
    Else use the url key value
    Else just fill it with 'unknown'

In [7]:
def use_body_key(x):
    '''
        Pick the text that will stand in for a webpage description.

        Parameters
        -----------------
        x: dict-like parsed webpageDescription (keys 'body', 'title', 'url')

        Returns
        ----------------
        The first value among 'body', 'title' and 'url' (in that order) that
        is present, not None and not blank; 'unknown' if none qualifies.
        The chosen value is returned as-is (not stripped).
    '''
    for key in ('body', 'title', 'url'):
        # .get() makes a missing key behave like an empty one instead of raising KeyError
        value = x.get(key)
        # strip() is only used for the blank check, so whitespace-only text is skipped
        if value is not None and len(value.strip()) > 0:
            return value

    return 'unknown'

# Collapse each parsed description to a single text value, then print how many
# entries are still missing
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(use_body_key)
print(merged_data['webpageDescription'].isna().sum())

0


### 1. Using URL column to generate websiteName feature

Here is the logic used,
    
    - Extract the domain name out of the URL and create a new categorical column called “websiteName”.
    - There are in total 3372 unique website names in around 7000 entries.
    - We have 19 unique website names that have count > 30 in the dataset, so we keep these website names 
      as they are and combine all the other website names into an "other" category
    - So there will be 20 categories in total in websiteName    

In [8]:
# Generate the websiteName feature
def generate_website_name(urls, count_threshold=30):
    '''
        Build the categorical websiteName feature from a Series of URLs.

        Parameters
        -----------------
        urls: Series of URL strings

        count_threshold: A website name is kept only if it occurs strictly more
                         than this many times; every other name becomes 'other'

        Returns
        ----------------
        Series (same index as urls) holding the network location of each URL,
        or 'other' for the infrequent ones
    '''
    websites = urls.apply(lambda x: urllib.parse.urlparse(x).netloc)

    # Names occurring more than count_threshold times keep their own category
    websites_counts = websites.value_counts()
    frequent_websites = websites_counts[websites_counts > count_threshold].index

    # Vectorised membership test instead of scanning a Python list once per row
    return websites.where(websites.isin(frequent_websites), 'other')

# Derive the websiteName column from the url column
merged_data['websiteName'] = generate_website_name(merged_data['url'])

These are the 20 unique categories in the websiteName feature

In [9]:
# Show how many rows fall into each websiteName category
merged_data['websiteName'].value_counts()

other                        6258
www.insidershealth.com        143
sportsillustrated.cnn.com     109
www.huffingtonpost.com         99
allrecipes.com                 93
bleacherreport.com             86
www.youtube.com                85
blogs.babble.com               62
www.ivillage.com               59
www.foodnetwork.com            57
www.dailymail.co.uk            46
www.epicurious.com             36
www.womansday.com              35
www.bbc.co.uk                  34
www.popsci.com                 33
www.guardian.co.uk             33
www.marthastewart.com          33
www.buzzfeed.com               31
itechfuture.com                31
www.collegehumor.com           31
Name: websiteName, dtype: int64

### 2. Using Chi-Squared test to only retain the useful words in the vocabulary

The Chi-Squared test checks whether each feature in the dataset is independent of the target variable, considering each feature individually. The features must take non-negative values, which is fine in our case because we're using TF-IDF values, which are always non-negative.

It computes the p-value for each feature which tells you how effective that feature is (individually) to predict the target variable.

It takes the Null Hypothesis which assumes that the feature and target variable are independent.

Lower the p-value, the better the feature is.

On the basis of this, the p-score is defined as, 1 - p_value

A p_score of 0.95 is typically considered to be a good indicator of a feature being useful for prediction process.

A p-value of at most 0.05 is conventionally treated as statistically significant. It means that, if the null hypothesis were true (the feature and the target are independent), a test statistic at least this extreme would be observed less than 5% of the time. That is strong evidence against the null hypothesis, so we reject it and accept the alternative hypothesis that the feature and the target are dependent.

In our task, we have around 80,000 words in the entire dataset and we prune it further to around 10-40 thousand using the max_features parameter of the vectorizer object to pick the most frequent words.

But we can prune this even further with the help of this test and only choosing those words which have p-score >= 0.95

In [10]:
def preprocess_webpage_description(description, lemmatize=False):
    '''
        Function to convert a raw webpage description to a string of words
        The input is a single string (webpage description), and 
        the output is a single string (a preprocessed webpage description)

        Parameters
        -----------------
        description: Raw description text, possibly containing HTML markup

        lemmatize: True => words are lemmatized (WordNetLemmatizer),
                   False => words are stemmed (PorterStemmer)

        Returns
        ----------------
        Lower-cased, stop-word-free, stemmed/lemmatized words joined by single spaces
    '''

    # 1. Remove HTML
    #    The parser is named explicitly: without it BeautifulSoup picks whichever
    #    parser happens to be installed and warns about the guess, so the
    #    extracted text could differ from one environment to another
    text = BeautifulSoup(description, "html.parser").get_text()

    # 2. Remove non-letters
    text = re.sub("[^a-zA-Z]", " ", text)

    # 3. Convert to lower case, split into individual words
    words = text.lower().split()

    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))

    # 5. Remove stop words
    words = [w for w in words if w not in stops]

    # 6. Stem or Lemmatize the words
    if lemmatize:
        normalize = WordNetLemmatizer().lemmatize
    else:
        normalize = PorterStemmer().stem
    words = [normalize(word) for word in words]

    # 7. Join the words back into one string separated by space and return the result.
    return " ".join(words)


def generate_vectorized_data(data_input, vectorizer_name='tfidf', lemmatize=False, max_words_in_vocab=None, vocabulary=None):
    '''
        Takes in dataset input, uses webpageDescription column applies NLP preprocessing on it
        Then gives it to the specified vectorizer and returns vectorized data.
        
        Parameters
        -----------------
        data_input: Dataframe of dataset (needs 'webpageDescription' and 'label' columns);
                    it is copied, never modified
        
        vectorizer_name: Can be 'tfidf' or 'count'
        
        lemmatize: True => Data should be lemmatized, False => Data should be stemmed
        
        max_words_in_vocab: Value affects max_features parameter of vectorizer used, if None => all words are used
        
        vocabulary: Custom vocabulary that will be given to the vectorizer as input, if None => vocabulary is determined by the vectorizer

        Returns
        ----------------
        vectorized_data: Dataframe of vectorized data with a fresh 0..n-1 index.
                         Rows are ordered as all labelled rows followed by all
                         unlabelled rows, so it lines up positionally with
                         data_input only when data_input is already in that order.

        vectorizer: The vectorizer fitted on the labelled rows

        Raises
        ----------------
        ValueError: if vectorizer_name is neither 'tfidf' nor 'count'
    '''
    # Validate first so that a typo fails before the slow preprocessing pass
    vectorizer_classes = {'tfidf': TfidfVectorizer, 'count': CountVectorizer}
    if vectorizer_name not in vectorizer_classes:
        raise ValueError("vectorizer_name must be 'tfidf' or 'count', got %r" % (vectorizer_name,))

    data = data_input.copy(deep=True)
    
    print("Cleaning webpage description...")
    # Preprocess each description in the column according to the function described above
    data['webpageDescription'] = data['webpageDescription'].progress_apply(lambda x: preprocess_webpage_description(x, lemmatize))
    
    # Initialize vectorizer according to input parameters
    # (both vectorizers receive the custom vocabulary, as the docstring promises)
    vectorizer = vectorizer_classes[vectorizer_name](max_features=max_words_in_vocab, vocabulary=vocabulary)

    print("Applying vectorizer...")
    
    # Rows with a label form the train part, rows without one the test part
    has_label = data['label'].notna()
    train_data = data[has_label]
    test_data = data[~has_label]

    # Apply vectorizer to the data
    # Fit vectorizer on the train data and then transform the test data (avoids data leakages)
    vectorized_train_data = vectorizer.fit_transform(train_data['webpageDescription']).toarray()
    vectorized_test_data = vectorizer.transform(test_data['webpageDescription']).toarray()
        
    vectorized_webpage_description = np.concatenate((vectorized_train_data, vectorized_test_data))
 
    # Converting data to a DataFrame so that it can be processed later more easily
    vectorized_webpage_description = pd.DataFrame(vectorized_webpage_description)
    print("Finished vectorization")
    
    return vectorized_webpage_description, vectorizer

In [12]:
def preparing_data_for_training(dataset, random_state=42):
    '''
        Split the labelled part of the dataset into train and hold-out sets.

        Only the rows whose 'label' is present are used. The 'label' and 'id'
        columns are removed from the features, and 'label' becomes the target.
        The split is done by train_test_split with test_size = 0.3.
        No encoding or scaling is applied here.

        Parameters
        -----------------------------
        dataset: Dataframe containing 'label', 'id' and the feature columns

        random_state: Seed forwarded to train_test_split

        Returns
        -----------------------------
        X_train, X_test, y_train, y_test
    '''
    labelled_rows = dataset[dataset['label'].notna()]

    features = labelled_rows.drop(columns=['label', 'id'])
    target = labelled_rows['label']

    return train_test_split(features, target, test_size=0.3, random_state=random_state)

In [11]:
# Stem the descriptions and build TF-IDF features capped at 10,000 vocabulary
# words; the fitted vectorizer is kept alongside the feature matrix
vectorized_data, vectorizer = generate_vectorized_data(merged_data, lemmatize=False, max_words_in_vocab=10000, vectorizer_name='tfidf')

Cleaning webpage description...


100%|██████████████████████████████████████| 7394/7394 [00:30<00:00, 245.43it/s]


Applying vectorizer...
Finished vectorization


In [14]:
# Put label/id next to the vectorized columns (pd.concat with axis=1 joins on the row index)
processed_data = pd.concat([merged_data[['label','id']], vectorized_data], axis=1)

X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=69)

# Words of the fitted vocabulary, in feature-column order.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use the new method when it exists and fall back
# to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_words = list(vectorizer.get_feature_names_out())
else:
    vocab_words = vectorizer.get_feature_names()

# Min value of p_score required is p_limit
p_limit = 0.95

In [18]:
# Running the Chi-Squared test and storing the results in a DataFrame and sorting DF by p_score value

# Chi-Squared test of every feature column of X_train against y_train
chi2_stats, p = chi2(X_train, y_train)

# One row per vocabulary word with its p-score (1 - p-value).
# NOTE(review): some scores come out as NaN (see the tail of the output) -
# presumably words that never occur in X_train; confirm.
p_score_per_word = pd.DataFrame({"feature":vocab_words, "score":1-p})

# Sorting in descending order of p-score value
# (the sorted frame is only displayed; p_score_per_word itself stays unsorted)
p_score_per_word.sort_values('score', ascending=False)

Unnamed: 0,feature,score
7196,recip,1.0
2156,cup,1.0
634,bake,1.0
1208,butter,1.0
1952,cook,1.0
...,...,...
9922,yakuza,
9946,yoder,
9976,zetaclear,
9977,zhao,


In [19]:
# Keep the words whose p-score reaches p_limit. The bound is inclusive because
# p_limit is the minimum required p-score; NaN scores compare False and are dropped.
p_score_per_word_filtered = p_score_per_word[p_score_per_word['score'] >= p_limit]

p_score_per_word_filtered

Unnamed: 0,feature,score
87,add,1.000000
322,android,0.962849
386,app,0.970821
481,asid,0.985957
523,athlet,0.970253
...,...,...
9894,wrap,0.967185
9935,yeast,0.976111
9949,yogurt,0.973375
9974,zest,0.978169


    Filtering by p_score value >= 0.95 resulted in only 305 words remaining, we'll create a new feature matrix
    by using only these words as our input vocabulary.
    
    The new vectorized data was then used to train a Logistic Regression model with base parameters and this 
    actually gave us a score of 0.86302 which is the same as our base Logistic Regression model with 10,000 
    words in the vocabulary.
    
    So even though the accuracy didn’t improve, there was a significant improvement in terms of memory used, 
    time required for training and scalability as well.
