In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import json
import urllib
import string
import re
import nltk
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm
tqdm.pandas()

### Utility Functions

In [2]:
def preprocess_webpage_description(description, lemmatize=False):
    '''
        Function to convert a raw webpage description to a string of words
        The input is a single string (webpage description), and 
        the output is a single string (a preprocessed webpage description)
    '''

    # 1. Remove HTML
    words = BeautifulSoup(description).get_text() 

    # 2. Remove non-letters        
    words = re.sub("[^a-zA-Z]", " ", words) 

    # 3. Convert to lower case, split into individual words
    words = words.lower().split()                             
    
    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    
    # 5. Remove stop words
    words = [w for w in words if not w in stops]   
    
    # 6. Stem or Lemmatize the words
    if lemmatize == False:
        porter = PorterStemmer()
        words = [porter.stem(word) for word in words]
    else:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    
    # 7. Join the words back into one string separated by space and return the result.
    return " ".join(words)


def generate_vectorized_data(data_input, vectorizer_name='tfidf', lemmatize=False, max_words_in_vocab=None, vocabulary=None):
    '''
        Takes in dataset input, uses webpageDescription column applies NLP preprocessing on it
        Then gives it to the specified vectorizer and returns vectorized data.
        
        Parameters
        -----------------
        data_input: Dataframe of dataset
        
        vectorizer_name: Can be 'tfidf' or 'count'
        
        lemmatize: True => Data should be lemmatized, False => Data should be stemmed
        
        max_words_in_vocab: Value affects max_features parameter of vectorizer used, if None => all words are used
        
        vocabulary: Custom vocabulary that will be given to the vectorizer as input, if None => vocabulary is determined by the vectorizer

        Returns
        ----------------
        vectorized_data: Dataframe of vectorized data
        vectorizer: Vectorizer that was used to fit the training data on
    '''
    data = data_input.copy(deep=True)
    
    print("Cleaning webpage description...")
    # Preprocess each description in the column according to the function described above
    data['webpageDescription'] = data['webpageDescription'].progress_apply(lambda x: preprocess_webpage_description(x, lemmatize))
    
    # Initialize vectorizer according to input parameters
    if vectorizer_name == "tfidf":
        vectorizer = TfidfVectorizer(max_features=max_words_in_vocab, vocabulary=vocabulary)
    elif vectorizer_name == "count":
        vectorizer = CountVectorizer(max_features=max_words_in_vocab)

    print("Applying vectorizer...")
    
    train_data = data[data['label'].isna() == False]
    test_data = data[data['label'].isna() == True]

    # Apply vectorizer to the data
    # Fit vectorizer on the train data and then transform the test data (avoids data leakages)
    vectorized_train_data = vectorizer.fit_transform(train_data['webpageDescription']).toarray()
    vectorized_test_data = vectorizer.transform(test_data['webpageDescription']).toarray()
        
    vectorized_webpage_description = np.concatenate((vectorized_train_data, vectorized_test_data))
 
    # Converting data to a DataFrame so that it can be processed later more easily
    vectorized_webpage_description = pd.DataFrame(vectorized_webpage_description)
    print("Finished vectorization")
    
    return vectorized_webpage_description, vectorizer

def preprocessing(dataset_input, vectorized_data, features_to_use=[]):
    '''
        Takes in dataset and the vectorized data, concatenates them
        It then takes a susbet of the features specified in 'features_to_use' parameter list
    
        Parameters
        --------------------
        dataset_input: Original input data
        
        vectorized_data: webpage description data that is output of generate_vectorized_data()
        
        features_to_use: Which features to use for training, features_to_use = [] => Use all features
    
        Returns
        --------------------
        processed_data: Concatenated dataframe of original data and vectorized_data by taking subset of features_to_use
    
    '''
    if features_to_use == []:
        features_to_use = dataset_input.columns
    
    # Required features that must always be present
    if 'id' not in features_to_use:
        features_to_use.append('id')
        
    if 'label' not in features_to_use:
        features_to_use.append('label')
    
    # Doing a copy so that the input dataset remains intact
    dataset = dataset_input.copy(deep=True)
    dataset = dataset[features_to_use]
    
    processed_data = pd.concat([dataset, vectorized_data], axis=1)
        
    return processed_data

def preparing_data_for_training(dataset, random_state=42):
    '''
        Takes in the dataset as input which is the output of the preprocessing() function call
        Applies get_dummies on the categorical columns
        Removes webpageDescription & id from the data because they are not required for training
        Applies train_test_split with test_size = 0.3
        Applies StandardScaler by fitting on X_train and transforming both X_train & X_test
        
        Returns
        -----------------------------
        X_train, X_test, y_train, y_test
    '''
    train_data = dataset[dataset['label'].isna() == False]
    
    cur_dataset = train_data.copy(deep=True)
    
    temp_numeric_features = ['alchemy_category_score', 'avgLinkWordLength', 'AvglinkWithOneCommonWord',
                          'AvglinkWithFourCommonWord', 'redundancyMeasure', 'frameTagRatio',
                          'tagRatio', 'imageTagRatio', 'hyperlinkToAllWordsRatio',
                          'alphanumCharCount', 'linksCount', 'wordCount',
                          'parametrizedLinkRatio', 'spellingErrorsRatio'
                         ]
    numeric_features = []
    
    temp_cat_features = ['alchemy_category', 'domainLink', 'isNews', 'isFrontPageNews', 'lengthyDomain', 'websiteName']
    cat_features = []
    
    # Only consider those numeric and categorical features which are actually present in the dataset, i.e. being used for training
    for feature in cur_dataset.columns:
        if feature in temp_cat_features:
            cat_features.append(feature)
        elif feature in temp_numeric_features:
            numeric_features.append(feature)
    
    # Get dummies on categorical columns
    cur_dataset = pd.get_dummies(cur_dataset, columns=cat_features, drop_first=True)

    X = cur_dataset.drop(['label', 'webpageDescription', 'id'], axis=1)
    y = cur_dataset['label']
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
        
    scaler = StandardScaler()
    
    # Standard Scaler requires us to save the changes in a copy instead of the original dataframe so that's why these copies are made
    X_train_copy = X_train.copy(deep=True)
    X_test_copy = X_test.copy(deep=True)
    
    scaler = StandardScaler()
    # Feature Standardization
    for feature in numeric_features:
        scaler.fit(X_train_copy[[feature]])
        X_train_copy[feature] = scaler.transform(X_train_copy[[feature]])
        X_test_copy[feature] = scaler.transform(X_test_copy[[feature]])
    
    return X_train_copy, X_test_copy, y_train, y_test

def preparing_data_for_final_submission(dataset):        
    '''
        Apply get_dummies and feature standardization on the entire data (train.csv + test.csv)
        Separates out train.csv and test.csv data from this processed data
        Returns X_train (that has been processed from train.csv), y_train (from train.csv) & X_test (that has been processed from test.csv)
        
        Returns
        ----------------
        X_train, y_train, X_test
    '''
    cur_dataset = dataset.copy(deep=True)
    
    temp_numeric_features = ['alchemy_category_score', 'avgLinkWordLength', 'AvglinkWithOneCommonWord',
                          'AvglinkWithFourCommonWord', 'redundancyMeasure', 'frameTagRatio',
                          'tagRatio', 'imageTagRatio', 'hyperlinkToAllWordsRatio',
                          'alphanumCharCount', 'linksCount', 'wordCount',
                          'parametrizedLinkRatio', 'spellingErrorsRatio'
                         ]

    temp_cat_features = ['alchemy_category', 'domainLink', 'isNews', 'isFrontPageNews', 'lengthyDomain', 'websiteName']

    numeric_features = []
    cat_features = []
    
    # Only consider those numeric and categorical features which are actually present in the dataset
    for feature in cur_dataset.columns:
        if feature in temp_cat_features:
            cat_features.append(feature)
        elif feature in temp_numeric_features:
            numeric_features.append(feature)
    
    # Get dummies on categorical columns
    cur_dataset = pd.get_dummies(cur_dataset, columns=cat_features, drop_first=True)

    scaler = StandardScaler()
    
    # Feature Standardization
    for feature in numeric_features:
        cur_dataset[feature] = scaler.fit_transform(cur_dataset[[feature]])
    
    train_data = cur_dataset[cur_dataset['label'].isna() == False]
    test_data = cur_dataset[cur_dataset['label'].isna() == True]
    
    X_train = train_data.drop(['label', 'webpageDescription', 'id'], axis=1)
    y_train = train_data['label']
    # Do not drop "id" from X_test
    X_test = test_data.drop(['label', 'webpageDescription'], axis=1)
    
    return X_train, y_train, X_test

def generate_csv_submission(test, y_final_pred, output_file_name='submission.csv'):
    '''
        Parameters
        -----------------------
        test: Test data that contains id column
        
        y_final_pred: predict_proba() output for given model and test data
        
        output_file_name: Name of submission output file
    '''
    submission_df = pd.DataFrame()
    submission_df["id"] = test["id"]
    submission_df["label"] = y_final_pred
    submission_df.to_csv(output_file_name, index=False)

def end_to_end_run(data, model, vectorized_data=None, max_words_in_vocab=None, vocabulary=None, features_to_use=[], lemmatize=True, vectorizer_name='tfidf', output_file_name='submission.csv'):
    '''
        Utility function that does the entire modelling from start to finish
        Vectorizes the data, fit it on the given model and generate submission file
        
        Parameters
        --------------------
        data: Input dataset
        
        model: Model to train data on
        
        vectorized_data: Output of generate_vectorized_data(), if None => this function will call vectorized_data()
        If not None => max_words_in_vocab and vocabulary are ignored
        
        max_words_in_vocab: Used as max_features parameter value for vectorizing
        
        vocabulary: Used as vocabulary parameter value for vectorizing
        
        features_to_use: Which features to use for training
    '''
    if vectorized_data == None:
        vectorized_data, _ = generate_vectorized_data(data, vectorizer_name, lemmatize, max_words_in_vocab, vocabulary)
    
    processed_data = preprocessing(data, vectorized_data, features_to_use)
    
    X_train_final, y_train_final, test = preparing_data_for_final_submission(processed_data)
    model.fit(X_train_final, y_train_final)

    # Drop 'id' before sending for training
    y_final_pred = model.predict_proba(test.drop('id', axis=1))[:, 1]
    
    generate_csv_submission(test, y_final_pred, output_file_name)

### Preprocessing

In [3]:
def use_body_key(x):
    if x['body'] == None:
        return x['title']
    
    return x['body']

# Generate the websiteName feature
def generate_website_name(urls):
    websites = urls.apply(lambda x: urllib.parse.urlparse(x).netloc)

    # Only retain those website_names with atleast 30 entries, assign all other website names to 'other' general category
    websites_counts = websites.value_counts()
    websites_with_atleast_30 = list(websites_counts[websites_counts > 30].index)
    websites = websites.apply(lambda x: x if x in websites_with_atleast_30 else 'other')

    return websites

train = pd.read_csv('dataset/train_data.csv')

# Drop the only entry which has neither "body" nor "title" in its webpageDescription
train.drop(index=2994, inplace=True)

test = pd.read_csv('dataset/test_data.csv')

merged_data = pd.concat([train, test], ignore_index=True)

# Convert webpageDescription from string to JSON
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(lambda x: json.loads(x))

# Fill webpageDescription with 'body' key data, if None fill it with 'title' key data
merged_data['webpageDescription'] = merged_data['webpageDescription'].apply(lambda x: use_body_key(x))

# Replace all ? values in isNews and isFrontPageNews by new category 'unknown'
merged_data['isNews'] = merged_data['isNews'].apply(lambda x: 'unknown' if x == '?' else x)
merged_data['isFrontPageNews'] = merged_data['isFrontPageNews'].apply(lambda x: 'unknown' if x == '?' else x)
    
# Assign all ? values in alchemy_category to "unknown" category
merged_data['alchemy_category'] = merged_data['alchemy_category'].apply(lambda x: 'unknown' if x == '?' else x)
    
# For all ? alchemy_category values we assigned them to "unknown" category
# and we are 100% confident of this assignment
# So we substitute alchemy_category_score = 1.0 (100%) for all ? values which correspond to 'unknown' category
merged_data['alchemy_category_score'] = merged_data['alchemy_category_score'].apply(lambda x: 1.0 if x == '?' else float(x))

merged_data['websiteName'] = generate_website_name(merged_data['url'])

merged_data.drop(['framebased', 'embedRatio', 'AvglinkWithTwoCommonWord', 'AvglinkWithThreeCommonWord', 'url'], axis=1, inplace=True)

In [4]:
chi2_vectorized_data = pd.read_csv('tf-idf-vectorized-data-stemming-305-words.csv')

In [5]:
merged_data.columns

Index(['webpageDescription', 'alchemy_category', 'alchemy_category_score',
       'avgLinkWordLength', 'AvglinkWithOneCommonWord',
       'AvglinkWithFourCommonWord', 'redundancyMeasure', 'frameTagRatio',
       'domainLink', 'tagRatio', 'imageTagRatio', 'isNews', 'lengthyDomain',
       'hyperlinkToAllWordsRatio', 'isFrontPageNews', 'alphanumCharCount',
       'linksCount', 'wordCount', 'parametrizedLinkRatio',
       'spellingErrorsRatio', 'label', 'id', 'websiteName'],
      dtype='object')

In [6]:
features = [
    'webpageDescription',
    'avgLinkWordLength',
    'AvglinkWithOneCommonWord',
    'AvglinkWithFourCommonWord',
    'redundancyMeasure',
    'frameTagRatio',
    'isNews',
    'lengthyDomain',
    'hyperlinkToAllWordsRatio',
    'frameTagRatio',
    'hyperlinkToAllWordsRatio',
    'alphanumCharCount',
    'linksCount',
    'wordCount',
    'spellingErrorsRatio',
    'websiteName'
]

processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)

In [7]:
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

In [8]:
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_pred)

0.8513472335433347

### Tentative list of features selected for now

In [9]:
features = [
    'webpageDescription',
    'avgLinkWordLength',
    'AvglinkWithOneCommonWord',
    'AvglinkWithFourCommonWord',
    'redundancyMeasure',
    'frameTagRatio',
    'isNews',
    'lengthyDomain',
    'hyperlinkToAllWordsRatio',
    'alphanumCharCount',
    'linksCount',
    'wordCount'
]

processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

### Logistic Regression

In [217]:
lr_param_grid = {
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'C' : np.logspace(-4, 4, 20)
}

lr_cv_obj = GridSearchCV(LogisticRegression(max_iter=5000), lr_param_grid, n_jobs=-1, cv=5, verbose=0, scoring='roc_auc')

lr_cv_obj.fit(X_train, y_train)

print(lr_cv_obj.best_params_)

print("ROC AUC Score of Best Logistic Regression Hyperparameter Model:", roc_auc_score(y_test, lr_cv_obj.best_estimator_.predict_proba(X_test)[:, 1]))

{'C': 0.615848211066026, 'penalty': 'l2', 'solver': 'liblinear'}
ROC AUC Score of Best Logistic Regression Hyperparameter Model: 0.8536173811029261


    Slightly better than what we obtained with default parameters 0.8531

### Multinomial Naive Bayes

Uses only webpageDescription as the model throws error for negative values in X

In [232]:
nb_param_grid = {
    'alpha': np.linspace(0, 1, num=50)
}


processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=['webpageDescription'])

X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

nb_cv_obj = GridSearchCV(MultinomialNB(), nb_param_grid, n_jobs=-1, cv=5, verbose=0, scoring='roc_auc')

nb_cv_obj.fit(X_train, y_train)

print(nb_cv_obj.best_params_)

print("ROC AUC Score of Best Multinomial Naive Bayes Hyperparameter Model:", roc_auc_score(y_test, nb_cv_obj.best_estimator_.predict_proba(X_test)[:, 1]))

{'alpha': 0.02040816326530612}
ROC AUC Score of Best Logistic Regression Hyperparameter Model: 0.8510006170535314


### Random Forest

RandomizedSearch with 50 iterations because GridSearch is more time consuming

In [240]:
rf_param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)], 
    'min_samples_split': list(range(2,40)), 
    'min_samples_leaf': list(range(1,25))
}

processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)

X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

rf_cv_obj = RandomizedSearchCV(RandomForestClassifier(), rf_param_grid, n_jobs=-1, cv=5, n_iter=50, verbose=5, scoring='roc_auc')

rf_cv_obj.fit(X_train, y_train)

print(rf_cv_obj.best_params_)

print("ROC AUC Score of Best Random Forest Hyperparameter Model:", roc_auc_score(y_test, rf_cv_obj.best_estimator_.predict_proba(X_test)[:, 1]))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'n_estimators': 1200, 'min_samples_split': 27, 'min_samples_leaf': 3}
ROC AUC Score of Best Random Forest Hyperparameter Model: 0.8542534794963015


### SVM

In [244]:
svm_param_grid = {
    'gamma': ['scale', 'auto'], 
    'shrinking': [True, False],
    'C': np.logspace(-3, 5, 25)
}

processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)

X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

svm_cv_obj = RandomizedSearchCV(SVC(probability=True), svm_param_grid, n_jobs=-1, cv=5, n_iter=50, verbose=5, scoring='roc_auc')

svm_cv_obj.fit(X_train, y_train)

print(svm_cv_obj.best_params_)

print("ROC AUC Score of Best SVM Hyperparameter Model:", roc_auc_score(y_test, svm_cv_obj.best_estimator_.predict_proba(X_test)[:, 1]))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'shrinking': False, 'gamma': 'auto', 'C': 21.54434690031882}
ROC AUC Score of Best SVM Hyperparameter Model: 0.8463714459113219


### Training these tuned models for final submission

### 1. Logistic Regression

In [25]:
processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)
lr_model = LogisticRegression(max_iter=5000, solver='liblinear', penalty='l2', C=0.615848211066026)
lr_model.fit(X_train_final, y_train_final)

y_final_pred = lr_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'lr_tuned_305_words.csv')

    Score of 0.85241 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### 2. Multinomial Naive Bayes

In [23]:
processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=['webpageDescription'])
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

nb_model = MultinomialNB(alpha=0.02040816326530612)

nb_model.fit(X_train_final, y_train_final)

y_final_pred = nb_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'nb_tuned_305_words.csv')

    Score of 0.84355 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### 3. Random Forest

In [26]:
processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

rf_model = RandomForestClassifier(n_estimators=1200, min_samples_split=27, min_samples_leaf=3)
rf_model.fit(X_train_final, y_train_final)

y_final_pred = rf_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'rf_tuned_305_words.csv')

    Score of 0.85201 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### 4. SVM

In [27]:
processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=features)
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

svm_model = SVC(gamma='auto', shrinking=False, C=21.54434690031882, probability=True)
svm_model.fit(X_train_final, y_train_final)

y_final_pred = svm_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'svm_tuned_305_words.csv')

    Score of 0.84953 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### Tuned Random Forest on just webpageDescription column

In [28]:
processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=['webpageDescription'])
X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

rf_model = RandomForestClassifier(n_estimators=1200, min_samples_split=27, min_samples_leaf=3)
rf_model.fit(X_train_final, y_train_final)

y_final_pred = rf_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'rf_tuned_305_words.csv')

    Score of 0.84584 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### Repeating same experiments but with 10000 words vocabulary

In [30]:
vectorized_data, vectorizer = generate_vectorized_data(merged_data, max_words_in_vocab=10000)

processed_data = preprocessing(merged_data, vectorized_data, features_to_use=['webpageDescription'])

Cleaning webpage description...


100%|██████████████████████████████████████| 7394/7394 [00:28<00:00, 260.56it/s]


Applying vectorizer...
Finished vectorization


### 1. Logistic Regression

In [33]:
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

lr_model = LogisticRegression(max_iter=5000, solver='liblinear', penalty='l2', C=0.615848211066026)
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

lr_model.fit(X_train_final, y_train_final)

y_final_pred = lr_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'lr_tuned_10000_words.csv')

0.8563268301477628


    Score of 0.86425 which is better than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### 2. Multinomial NB

In [34]:
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

nb_model = MultinomialNB(alpha=0.02040816326530612)
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

nb_model.fit(X_train_final, y_train_final)

y_final_pred = nb_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'nb_tuned_10000_words.csv')

0.8574136569436298


    Score of 0.85601 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### 3. Random Forest

In [35]:
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

rf_model = RandomForestClassifier(n_estimators=1200, min_samples_split=27, min_samples_leaf=3)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

rf_model.fit(X_train_final, y_train_final)

y_final_pred = rf_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'rf_tuned_10000_words.csv')

0.8520290396056951


    Score of 0.85874 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### 4. SVM

In [36]:
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

svm_model = SVC(gamma='auto', shrinking=False, C=21.54434690031882, probability=True)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

svm_model.fit(X_train_final, y_train_final)

y_final_pred = svm_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'svm_tuned_10000_words.csv')

0.8544071080503902


    Score of 0.86206 which is worse than when we submitted with default parameters and webpageDescription only
    for Logistic Regression (0.86312)

### Trying out MLP Classifier

In [13]:
processed_data = preprocessing(merged_data, chi2_vectorized_data, features_to_use=['webpageDescription'])
X_train, X_test, y_train, y_test = preparing_data_for_training(processed_data, random_state=42)

mlp_model = MLPClassifier(max_iter=5000)
mlp_model.fit(X_train, y_train)

y_pred = mlp_model.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, y_pred))

X_train_final, y_train_final, X_test_final = preparing_data_for_final_submission(processed_data)

mlp_model.fit(X_train_final, y_train_final)

y_final_pred = mlp_model.predict_proba(X_test_final.drop('id', axis=1))[:, 1]

generate_csv_submission(X_test_final, y_final_pred, 'mlp_tuned_10000_words.csv')

0.8117199541907585
