In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import urllib
import string
import re
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
tqdm.pandas()

In [4]:
# Load the training split. Row 2994 is discarded because it is the only entry
# that has neither "body" nor "title" in its webpageDescription.
train = pd.read_csv('dataset/train_data.csv').drop(index=2994)
test = pd.read_csv('dataset/test_data.csv')

# Stack train on top of test under a freshly numbered index
merged_data = pd.concat([train, test], ignore_index=True)

# Parse every webpageDescription JSON string into a Python object
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(json.loads)

In [5]:
def use_title_key(x):
    """Return the page title, falling back to the body when there is no title.

    Args:
        x (dict): Parsed webpageDescription. Both the 'title' and the 'body'
            key may be missing or hold None.

    Returns:
        The value stored under 'title', unless that key is missing or None;
        in that case the value stored under 'body' (None if that is missing
        as well).
    """
    # dict.get tolerates a missing key without inserting one, so the caller's
    # dict is left untouched (setdefault would add 'title': None to it).
    title = x.get('title')
    if title is None:
        return x.get('body')

    return title

def use_body_key(x):
    """Return the page body, falling back to the title when there is no body.

    Args:
        x (dict): Parsed webpageDescription. Both the 'body' and the 'title'
            key may be missing or hold None.

    Returns:
        The value stored under 'body', unless that key is missing or None;
        in that case the value stored under 'title' (None if that is missing
        as well).
    """
    # dict.get avoids a KeyError for entries that lack one of the two keys
    body = x.get('body')
    if body is None:
        return x.get('title')

    return body

# Enable exactly ONE of the two assignments below to choose whether the
# webpageDescription column holds the title or the body of each page
# merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(use_title_key)
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(use_body_key)
# Number of rows that ended up with no description at all
print(merged_data['webpageDescription'].isna().sum())

0


Word2Vec is trained on sentences, and it treats each sentence as a list of words, so the preprocessing function below returns a list of words rather than a single string.

Removing stopwords and numbers can be detrimental to the learning process, so they're not removed here

In [6]:
def preprocess_webpage_description(description, remove_stopwords=False):
    """Convert a raw webpage description into a list of lower-cased words.

    Args:
        description (str): Raw description text, possibly containing HTML.
        remove_stopwords (bool): When True, drop NLTK's English stop words
            from the result. Defaults to False.

    Returns:
        list[str]: The words of the description, in their original order.
    """
    # 1. Remove HTML. The parser is named explicitly so that the result does
    #    not depend on which optional parsers happen to be installed, and so
    #    that BeautifulSoup does not warn about having to guess one.
    text = BeautifulSoup(description, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter or a digit with
    #    a space (raw string, so that \d reaches the regex engine without
    #    relying on Python passing through an invalid string escape)
    text = re.sub(r"[^a-zA-Z\d]", " ", text)

    # 3. Convert to lower case, split into individual words
    words = text.lower().split()

    # 4. In Python, searching a set is much faster than searching a list,
    #    so convert the stop words to a set before filtering
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

Word2Vec expects single sentences, each one as a list of words. In other words, the input format is a list of lists.

It is not at all straightforward to split a paragraph into sentences. There are all kinds of gotchas in natural language: English sentences can end with "?", "!", a closing quotation mark, or ".", among other things, and spacing and capitalization are not reliable guides either. For this reason, we'll use NLTK's punkt tokenizer for sentence splitting.

In [3]:
# Load the English punkt sentence tokenizer through NLTK's resource loader.
# NOTE(review): presumably the 'punkt' resource must already be downloaded
# (e.g. via nltk.download('punkt')) — confirm for the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
# Define a function to split a description into parsed sentences
def description_to_sentences(description, tokenizer, remove_stopwords=False):
    """Split a description into sentences, each given as a list of words.

    Args:
        description (str): Text of one webpage description.
        tokenizer: Sentence tokenizer; its tokenize() method is called on the
            stripped description to obtain the raw sentences.
        remove_stopwords (bool): Forwarded to preprocess_webpage_description.

    Returns:
        list: One entry per non-empty raw sentence, each entry being whatever
        preprocess_webpage_description returns for that sentence (a list of
        words), i.e. a list of lists.
    """
    return [
        preprocess_webpage_description(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(description.strip())
        if chunk  # empty sentences are skipped
    ]