# Classification: Now with Text :D

Kaggle StumbleUpon Competition

https://www.kaggle.com/c/stumbleupon

** Competition **: 
1. Some web pages, such as news articles or seasonal recipes, are only relevant for a short period of time. Others continue to be important for a long time.
2. The goal is to identify pages which pages will be relevant for a short span of time, and which will be relevant for a long span on time and are thus considered "evergreen". 

** Evaluation **: Area under the curve (AUC) 


Import Python Modules 
=================

In [13]:
# quick hack to fix import path
# import sys; sys.path.append('/Users/julianalverio/code/conda/envs/sac/lib/python3.6/site-packages/')

# data manipulation
import pandas as pd
import numpy as np

# plots
%matplotlib inline
import random
import matplotlib
import matplotlib.pyplot as plt
import pylab as pl

# classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# dimensionality reduction
from sklearn.decomposition import PCA

# cross-validation
from sklearn.model_selection import train_test_split
from sklearn import model_selection

# text features
import re
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# model evaluation
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

import os
os.chdir(os.path.join("..", "data"))

In [14]:
os.getcwd()

'/home/evan/Documents/class/urugya/machine_learning_aws/data'

# 1. Data Import

In [15]:
data = pd.read_table("stumbleupon/train.tsv", sep= "\t")

## Using Numerical Features (same as last week)

In [16]:
# Alchemy category, converting to one-hots
df = data['alchemy_category']   # 2K ? values
one_hots = pd.get_dummies(data['alchemy_category'])
df = one_hots
rename_dict = {'?': 'alchemy_cat_?'}
df = df.rename(columns=rename_dict)

# FrameTagRatio, leaving as continuous number
df_var = data['frameTagRatio']
df['frame_tag_ratio'] = df_var



# link word score, 0-100 gaussian, keeping continuous
df['link_word_score'] = data['linkwordscore']


# alchemy category score, with replacing missing values with random
df_var = data['alchemy_category_score']
df_var_temp = df_var.apply(lambda x: np.random.random() if x == '?' else float(x)).astype('float32')
df['alchemy_category_score'] = df_var_temp


# num word in url -- discrete 0-25 to custom binning from looking at the histogram
df_var = data['numwords_in_url']
bins = [0, 6, 8, 13, 25]
df_var_temp = pd.cut(x=df_var, bins=bins, right=True, labels=['num_words_url_bin_0', 'num_words_url_bin_1', 'num_words_url_bin_2', 'num_words_url_bin_3'])
dummies = pd.get_dummies(df_var_temp)
df = pd.concat([df, dummies], axis=1)


# parameterized_link_ratio -- leaving as continuous, right-half gaussian
df['parameterized_link_ratio'] = data['parametrizedLinkRatio']

# spelling errors ratio -- leaving as continuous
df['spelling_errors_ratio'] = data['spelling_errors_ratio']

# embed_ratio -- bimodal continuous binned into 2 bins
df_var = pd.DataFrame(data['embed_ratio'])
df_var = df_var['embed_ratio'].apply(lambda x: 1 if x > -1 else 0)
dummies = pd.get_dummies(df_var)
rename = {0: 'embed_ratio_0', 1: 'embed_ratio_1'}
dummies = dummies.rename(columns=rename)
df = pd.concat([df, dummies], axis=1)


# html_ratio -- leaving continuous
df['html_ratio'] = data['html_ratio']

# lengthy_link_domain
df_var = pd.get_dummies(data['lengthyLinkDomain'])
rename = {0: 'lengthy_link_domain_0', 1: 'lengthy_link_domain_1'}
df_var = df_var.rename(columns=rename)
df = pd.concat([df, df_var], axis=1)

df['labels'] = data['label']


In [32]:
train, val = train_test_split(df, test_size=0.5, train_size=0.5, random_state=234)
val, test = train_test_split(val, test_size=0.5, train_size=0.5, random_state= 675)
train_labels = train['labels']
train = train.drop(['labels'], axis=1, inplace=False)
val_labels = val['labels']
val = val.drop(['labels'], axis=1, inplace=False)
test_labels = test['labels']
test = test.drop(['labels'], axis=1, inplace=False)

In [33]:
train

Unnamed: 0,alchemy_cat_?,arts_entertainment,business,computer_internet,culture_politics,gaming,health,law_crime,recreation,religion,...,num_words_url_bin_1,num_words_url_bin_2,num_words_url_bin_3,parameterized_link_ratio,spelling_errors_ratio,embed_ratio_0,embed_ratio_1,html_ratio,lengthy_link_domain_0,lengthy_link_domain_1
3897,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0.089744,0.066212,1,0,0.267779,0,1
2099,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0.195652,0.135882,0,1,0.297631,0,1
3152,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0.658537,0.117647,0,1,0.262332,1,0
7342,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.058333,0.077273,0,1,0.203461,0,1
1114,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0.558442,0.073171,0,1,0.271748,0,1
3058,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0.108108,0.130435,0,1,0.254782,1,0
116,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0.142857,0.110995,0,1,0.295865,0,1
6021,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0.266904,0.100503,0,1,0.196145,0,1
5583,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0.049180,0.137615,0,1,0.268491,1,0
988,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0.000000,0.033537,0,1,0.181620,1,0


## Last Time : logistic regression with numericla features : AUC=0.71

In [34]:
model = LogisticRegression()
model.fit(train, train_labels)
preds = model.predict_proba(val)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.7141160557790367

# 1 : adding in textual features

In [72]:
# TFIDF add in the text features with tfidf
idf_dtm = TfidfVectorizer(min_df= 10,  max_features= 1000, strip_accents= "unicode",
                          analyzer= "word", token_pattern= r"\w{1,}", ngram_range=(1, 1), 
                          use_idf= 1, smooth_idf= 1, sublinear_tf= 1)
idf_dtm.fit(data["boilerplate"])
data_text = idf_dtm.transform(data["boilerplate"])
train_text, val_text = train_test_split(data_text, test_size=0.5, train_size=0.5, random_state=234)
val_text, test_text = train_test_split(val_text, test_size=0.5, train_size=0.5, random_state= 675)

In [73]:
# concatenating in panda has a bug somehow, reverting to using numpy for now
train_with_text = np.concatenate((np.array(train), train_text.toarray()), axis=1)
val_with_text   = np.concatenate((np.array(val), val_text.toarray()), axis=1)

In [74]:
model = LogisticRegression()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.8602914850150587

In [75]:
# exercise : try some other models
model = GradientBoostingClassifier()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.8563021699908033

In [76]:
# exercise : try some other models
model = svm.SVC(probability=True)
model.fit(train, train_labels)
preds = model.predict_proba(val)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.6785462723100937

# 2 : using bigram features

In [77]:
# TFIDF add in the text features with tfidf
idf_dtm = TfidfVectorizer(min_df= 10,  max_features= 1000, strip_accents= "unicode",
                          analyzer= "word", token_pattern= r"\w{1,}", ngram_range=(1, 2), 
                          use_idf= 1, smooth_idf= 1, sublinear_tf= 1)
idf_dtm.fit(data["boilerplate"])
data_text = idf_dtm.transform(data["boilerplate"])
train_text, val_text = train_test_split(data_text, test_size=0.5, train_size=0.5, random_state=234)
val_text, test_text = train_test_split(val_text, test_size=0.5, train_size=0.5, random_state= 675)

train_with_text = np.concatenate((np.array(train), train_text.toarray()), axis=1)
val_with_text   = np.concatenate((np.array(val), val_text.toarray()), axis=1)

In [78]:
model = LogisticRegression()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.8599474877670367

In [79]:
# exercise : try some other models
model = GradientBoostingClassifier()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.85072859553178

In [None]:
# didn't really help, maybe wiht some search in exercise it would be more helpful