In [29]:
import numpy as np
import pandas as pd


import json
import urllib
import string
import re
import nltk
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
tqdm.pandas()

In [130]:
train = pd.read_csv('dataset/train_data.csv')
# Drop the only entry which has neither "body" nor "title" in its webpageDescription
train.drop(index=2994, inplace=True)

test = pd.read_csv('dataset/test_data.csv')

merged_data = pd.concat([train, test], ignore_index=True)

### NLP Processing

Reference: https://www.kaggle.com/c/word2vec-nlp-tutorial/overview/part-1-for-beginners-bag-of-words

In [131]:
def preprocess_webpage_description(description):
    # Function to convert a raw webpage description to a string of words
    # The input is a single string (webpage description), and 
    # the output is a single string (a preprocessed webpage description)

    # 1. Remove HTML
    description_text = BeautifulSoup(description).get_text() 

    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", description_text) 

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    
    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    
    # 6. Stem the words
    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in meaningful_words]
    
    # 7. Join the words back into one string separated by space and return the result.
    return " ".join(stemmed_words)

    
def nlp_preprocessing(webpage_description, vectorizer_name='tfidf', max_words_in_vocab=None):
    
    print("Cleaning webpage description...")
    # Preprocess each description in the column according to the function described above
    cleaned_webpage_description = webpage_description.progress_apply(lambda x: preprocess_webpage_description(x))
    
    # Initialize vectorizer according to input parameters
    if vectorizer_name == "tfidf":
        vectorizer = TfidfVectorizer(max_features=max_words_in_vocab)
    elif vectorizer_name == "count":
        vectorizer = CountVectorizer(max_features=max_words_in_vocab)

    print("Applying vectorizer...")
    # Apply vectorizer to the data
    vectorized_webpage_description = vectorizer.fit_transform(cleaned_webpage_description)
    
    # Converting data to a DataFrame so that it can be processed later more easily
    vectorized_webpage_description = pd.DataFrame(vectorized_webpage_description.toarray())
    print("Finished vectorization")
    
    return vectorized_webpage_description
    

### General Preprocessing

- Does data format fixing
- Fills in missing values
- Generates websiteName feature
- Drops redundant or invalid columns
- Does NLP preprocessing

In [132]:
def preprocessing(dataset_input, max_words_in_vocab=None):
    
    # Columns to drop
    # framebased because its all 0s
    # url because after generating websiteName feature we can drop it
    # others because they are highly correlated with other features in the dataset
    features_to_drop = ['framebased', 'embedRatio', 'AvglinkWithTwoCommonWord', 'AvglinkWithThreeCommonWord', 'url']
    
    # Doing a copy so that the input dataset remains intact
    dataset = dataset_input.copy(deep=True)
    
    # Convert webpageDescription from string to JSON
    dataset['webpageDescription'] = dataset['webpageDescription'].apply(lambda x: json.loads(x))
    
    # Replace webpageDescription by its "body" content, if there's no "body" content, then replace by "title" content
    dataset['webpageDescription'] = dataset['webpageDescription'].apply(lambda x: x['title'] if x['body'] == None else x['body'])

    # Generate the websiteName feature
    dataset['websiteName'] = dataset['url'].apply(lambda x: urllib.parse.urlparse(x).netloc)

    # Only retain those website_names with atleast 30 entries, assign all other website names to 'other' general category
    website_names = dataset['websiteName'].value_counts()
    websitesWithAtleast30Entries = list(website_names[website_names > 30].index)
    dataset['websiteName'] = dataset['websiteName'].apply(lambda x: x if x in websitesWithAtleast30Entries else 'other')

    # Drop the following columns,
    dataset.drop(features_to_drop, axis=1, inplace=True)
    
    # Vectorize the webpageDescription data
    # Specify the name of vectorizer as "tfidf" or "count" for CountVectorizer
    # Can also pass in the maximum words to be retained in vocabulary, otherwise vectorizer will consider all the words in the vocabulary
    # max_words_in_vocab=5000 will consider the 5000 most frequently occurring words in the dataset
    vectorized_data = nlp_preprocessing(dataset['webpageDescription'], vectorizer_name='tfidf', max_words_in_vocab=max_words_in_vocab)
        
    processed_data = pd.concat([dataset, vectorized_data], axis=1)
    
    # CODE THAT REPLACES THE ? VALUES
    
    # Replace all ? values in isNews and isFrontPageNews by new category 'unknown'
    processed_data['isNews'] = processed_data['isNews'].apply(lambda x: 'unknown' if x == '?' else x)
    processed_data['isFrontPageNews'] = processed_data['isFrontPageNews'].apply(lambda x: 'unknown' if x == '?' else x)
    
    # Assign all ? values in alchemy_category to "unknown" category
    processed_data['alchemy_category'] = processed_data['alchemy_category'].apply(lambda x: 'unknown' if x == '?' else x)
    
    # For all ? alchemy_category values we assigned them to "unknown" category
    # and we are 100% confident of this assignment
    # So we substitute alchemy_category_score = 1.0 (100%) for all ? values which correspond to 'unknown' category
    processed_data['alchemy_category_score'] = processed_data['alchemy_category_score'].apply(lambda x: 1.0 if x == '?' else float(x))
        
    return processed_data

In [133]:
processed_data = preprocessing(merged_data, max_words_in_vocab=10000)

Cleaning webpage description...


100%|██████████████████████████████████████| 7394/7394 [00:27<00:00, 267.73it/s]


Applying vectorizer...
Finished vectorization


### Train test split for regular training

This function does the following,
- Takes in the combined processed dataset as input
- Applies get_dummies on the categorical columns
- Removes webpageDescription & id from the data because they are not required for training
- Applies train_test_split with test_size = 0.3
- Applies StandardScaler by fitting on X_train and transforming both X_train & X_test
- Returns X_train, X_test, y_train, y_test

In [134]:
def preparing_data_for_training(dataset, random_state=42):
    train_data = dataset[dataset['label'].isna() == False]
    
    cur_dataset = train_data.copy(deep=True)
    
    numerical_features = ['alchemy_category_score', 'avgLinkWordLength', 'AvglinkWithOneCommonWord',
                          'AvglinkWithFourCommonWord', 'redundancyMeasure', 'frameTagRatio',
                          'tagRatio', 'imageTagRatio', 'hyperlinkToAllWordsRatio',
                          'alphanumCharCount', 'linksCount', 'wordCount',
                          'parametrizedLinkRatio', 'spellingErrorsRatio'
                         ]

    cat_features = ['alchemy_category', 'domainLink', 'isNews', 'isFrontPageNews', 'lengthyDomain', 'websiteName']

    # Get dummies on categorical columns
    cur_dataset = pd.get_dummies(cur_dataset, columns=cat_features, drop_first=True)

    X = cur_dataset.drop(['label', 'webpageDescription', 'id'], axis=1)
    y = cur_dataset['label']
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
        
    scaler = StandardScaler()
    
    # Standard Scaler requires us to save the changes in a copy instead of the original dataframe so that's why these copies are made
    X_train_copy = X_train.copy(deep=True)
    X_test_copy = X_test.copy(deep=True)
    
    scaler = StandardScaler()
    # Feature Standardization
    for feature in numerical_features:
        scaler.fit(X_train_copy[[feature]])
        X_train_copy[feature] = scaler.transform(X_train_copy[[feature]])
        X_test_copy[feature] = scaler.transform(X_test_copy[[feature]])
    
    return X_train_copy, X_test_copy, y_train, y_test

### Train-test Split for final submission

Very similar to the above function with the few changes being,
- There is no actual train_test_split() call here as we use the full train.csv data
- Apply get_dummies and feature standardization on the entire data (train.csv + test.csv)
- Separates out train.csv and test.csv data from this processed data
- Returns X_train (that has been processed from train.csv), y (from train.csv) & X_test (that has been processed from test.csv)

In [135]:
def preparing_data_for_final_submission(dataset):    
    cur_dataset = dataset.copy(deep=True)
    
    numerical_features = ['alchemy_category_score', 'avgLinkWordLength', 'AvglinkWithOneCommonWord',
                          'AvglinkWithFourCommonWord', 'redundancyMeasure', 'frameTagRatio',
                          'tagRatio', 'imageTagRatio', 'hyperlinkToAllWordsRatio',
                          'alphanumCharCount', 'linksCount', 'wordCount',
                          'parametrizedLinkRatio', 'spellingErrorsRatio'
                         ]

    cat_features = ['alchemy_category', 'domainLink', 'isNews', 'isFrontPageNews', 'lengthyDomain', 'websiteName']

    # Get dummies on categorical columns
    cur_dataset = pd.get_dummies(cur_dataset, columns=cat_features, drop_first=True)

    scaler = StandardScaler()
    
    # Feature Standardization
    for feature in numerical_features:
        cur_dataset[feature] = scaler.fit_transform(cur_dataset[[feature]])
    
    train_data = cur_dataset[cur_dataset['label'].isna() == False]
    test_data = cur_dataset[cur_dataset['label'].isna() == True]
    
    X_train = train_data.drop(['label', 'webpageDescription', 'id'], axis=1)
    y_train = train_data['label']
    # Do not drop "id" from X_test
    X_test = test_data.drop(['label', 'webpageDescription'], axis=1)
    
    return X_train, y_train, X_test

### Example use-case of preparing_data_for_training() for model training

In [136]:
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=69)

In [137]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(roc_auc_score(y_test, y_pred))

0.796701978279803


### Example use-case of creating final submission using preparing_data_for_final_submission()

In [138]:
X_train_final, y_train_final, test = preparing_data_for_final_submission(processed_data)

In [142]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train_final, y_train_final)

# Drop 'id' before sending for training
y_final_pred = model.predict(test.drop('id', axis=1))

In [143]:
# Preparing file to be submitted
submission_df = pd.DataFrame()
submission_df["id"] = test["id"]
submission_df["label"] = y_final_pred
submission_df.to_csv("Submission_1.csv", index=False)