# Classification: Now with Text :D

Kaggle StumbleUpon Competition

https://www.kaggle.com/c/stumbleupon

** Competition **: 
1. Some web pages, such as news articles or seasonal recipes, are only relevant for a short period of time. Others continue to be important for a long time.
2. The goal is to identify pages which pages will be relevant for a short span of time, and which will be relevant for a long span on time and are thus considered "evergreen". 

** Evaluation **: Area under the curve (AUC) 


Import Python Modules 
=================

In [3]:
# quick hack to fix import path
# import sys; sys.path.append('/Users/julianalverio/code/conda/envs/sac/lib/python3.6/site-packages/')

# data manipulation
import pandas as pd
import numpy as np

# plots
%matplotlib inline
import random
import matplotlib
import matplotlib.pyplot as plt
import pylab as pl

# classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# dimensionality reduction
from sklearn.decomposition import PCA

# cross-validation
from sklearn.model_selection import train_test_split
from sklearn import model_selection

# text features
import re
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# model evaluation
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

import os
os.chdir(os.path.join("..", "data"))

In [4]:
os.getcwd()

'/home/ubuntu/machine_learning_aws/data'

# 1. Data Import

In [9]:
! wget -O /home/ubuntu/machine_learning_aws/data/train.tsv "https://www.dropbox.com/s/10ch2yhfk8tyfri/train.tsv?dl=0"
data = pd.read_table("/home/ubuntu/machine_learning_aws/data/train.tsv", sep= "\t")

--2020-01-13 13:38:08--  https://www.dropbox.com/s/10ch2yhfk8tyfri/train.tsv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.1, 2620:100:601c:1::a27d:601
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/10ch2yhfk8tyfri/train.tsv [following]
--2020-01-13 13:38:13--  https://www.dropbox.com/s/raw/10ch2yhfk8tyfri/train.tsv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucb3c76d03568fde68e81c913d07.dl.dropboxusercontent.com/cd/0/inline/AwHu6jPDXSPwttUboRcpgC2IHOjcMBZTawORyg7eCB-tCMSs-E369nHR0eUobWbGQmeGysD-ebRsO50ZWgibvoLfGN5q13if9lYjb1WsQ_KUlpE_j33T-XLCYBePRy3FnFw/file# [following]
--2020-01-13 13:38:14--  https://ucb3c76d03568fde68e81c913d07.dl.dropboxusercontent.com/cd/0/inline/AwHu6jPDXSPwttUboRcpgC2IHOjcMBZTawORyg7eCB-tCMSs-E369nHR0eUobWbGQmeGysD-ebRsO50ZWgibvoLfGN5q13if9lYjb1WsQ_KUlpE_j33T-X

## Using Numerical Features (same as last week)

In [10]:
# Alchemy category, converting to one-hots
df = data['alchemy_category']   # 2K ? values
one_hots = pd.get_dummies(data['alchemy_category'])
df = one_hots
rename_dict = {'?': 'alchemy_cat_?'}
df = df.rename(columns=rename_dict)

# FrameTagRatio, leaving as continuous number
df_var = data['frameTagRatio']
df['frame_tag_ratio'] = df_var



# link word score, 0-100 gaussian, keeping continuous
df['link_word_score'] = data['linkwordscore']


# alchemy category score, with replacing missing values with random
df_var = data['alchemy_category_score']
df_var_temp = df_var.apply(lambda x: np.random.random() if x == '?' else float(x)).astype('float32')
df['alchemy_category_score'] = df_var_temp


# num word in url -- discrete 0-25 to custom binning from looking at the histogram
df_var = data['numwords_in_url']
bins = [0, 6, 8, 13, 25]
df_var_temp = pd.cut(x=df_var, bins=bins, right=True, labels=['num_words_url_bin_0', 'num_words_url_bin_1', 'num_words_url_bin_2', 'num_words_url_bin_3'])
dummies = pd.get_dummies(df_var_temp)
df = pd.concat([df, dummies], axis=1)


# parameterized_link_ratio -- leaving as continuous, right-half gaussian
df['parameterized_link_ratio'] = data['parametrizedLinkRatio']

# spelling errors ratio -- leaving as continuous
df['spelling_errors_ratio'] = data['spelling_errors_ratio']

# embed_ratio -- bimodal continuous binned into 2 bins
df_var = pd.DataFrame(data['embed_ratio'])
df_var = df_var['embed_ratio'].apply(lambda x: 1 if x > -1 else 0)
dummies = pd.get_dummies(df_var)
rename = {0: 'embed_ratio_0', 1: 'embed_ratio_1'}
dummies = dummies.rename(columns=rename)
df = pd.concat([df, dummies], axis=1)


# html_ratio -- leaving continuous
df['html_ratio'] = data['html_ratio']

# lengthy_link_domain
df_var = pd.get_dummies(data['lengthyLinkDomain'])
rename = {0: 'lengthy_link_domain_0', 1: 'lengthy_link_domain_1'}
df_var = df_var.rename(columns=rename)
df = pd.concat([df, df_var], axis=1)

df['labels'] = data['label']


In [11]:
train, val = train_test_split(df, test_size=0.5, train_size=0.5, random_state=234)
val, test = train_test_split(val, test_size=0.5, train_size=0.5, random_state= 675)
train_labels = train['labels']
train = train.drop(['labels'], axis=1, inplace=False)
val_labels = val['labels']
val = val.drop(['labels'], axis=1, inplace=False)
test_labels = test['labels']
test = test.drop(['labels'], axis=1, inplace=False)

## Last Time : logistic regression with numericla features : AUC=0.71

In [12]:
model = LogisticRegression()
model.fit(train, train_labels)
preds = model.predict_proba(val)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.7104607925041362

# 1 : Textual feature with Count Vectorizer (Bag of Words)

- min_df = minimum freuencey cut-off
- max_features = take the top 1000 most common feature
- strip_accents = to handle non english letters
- ngram_range = we are doing bag of word features here

In [13]:
# TFIDF add in the text features with tfidf
unigram_dtm = CountVectorizer(min_df= 10,  max_features= 1000, strip_accents= "unicode",
                          ngram_range=(1, 1))
unigram_dtm.fit(data["boilerplate"])
data_text = unigram_dtm.transform(data["boilerplate"])
train_text, val_text = train_test_split(data_text, test_size=0.5, train_size=0.5, random_state=234)
val_text, test_text = train_test_split(val_text, test_size=0.5, train_size=0.5, random_state= 675)

In [14]:
np.random.choice(unigram_dtm.get_feature_names(), 10)

array(['uses', 'running', 'came', 'stay', 'spinach', 'cupcakes', 'hit',
       'until', 'pan', 'index'], dtype='<U11')

In [15]:
xx = train_text.toarray()
print (xx.shape)
xx

(3697, 1000)


array([[ 2,  2,  0, ...,  2,  3,  0],
       [ 0,  0,  0, ..., 30, 33,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  2,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  7,  1, ...,  0,  0,  0]])

In [16]:
train.shape

(3697, 28)

In [17]:
train_text.toarray().shape

(3697, 1000)

In [18]:
train_with_text = pd.concat([train.reset_index(drop = True), pd.DataFrame(train_text.toarray())], axis=1)
val_with_text = pd.concat([val.reset_index(drop = True), pd.DataFrame(val_text.toarray())], axis=1)
train_with_text.head()

Unnamed: 0,alchemy_cat_?,arts_entertainment,business,computer_internet,culture_politics,gaming,health,law_crime,recreation,religion,...,990,991,992,993,994,995,996,997,998,999
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,3,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,30,33,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,7,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
model = LogisticRegression()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.7942767750375004

## 2. Textual Features with Count Vectorizer (Bi-Gram)
- ngram_range = (2,2) now so we only bi-gram

In [20]:
# TFIDF add in the text features with tfidf
bigram_dtm = CountVectorizer(min_df= 10,  max_features= 1000, strip_accents= "unicode",
                          ngram_range=(2, 2))
bigram_dtm.fit(data["boilerplate"])
data_text = bigram_dtm.transform(data["boilerplate"])
train_text, val_text = train_test_split(data_text, test_size=0.5, train_size=0.5, random_state=234)
val_text, test_text = train_test_split(val_text, test_size=0.5, train_size=0.5, random_state= 675)

train_with_text = pd.concat([train.reset_index(drop = True), pd.DataFrame(train_text.toarray())], axis=1)
val_with_text = pd.concat([val.reset_index(drop = True), pd.DataFrame(val_text.toarray())], axis=1)
train_with_text.head()

Unnamed: 0,alchemy_cat_?,arts_entertainment,business,computer_internet,culture_politics,gaming,health,law_crime,recreation,religion,...,990,991,992,993,994,995,996,997,998,999
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,4,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
np.random.choice(bigram_dtm.get_feature_names(), 10)

array(['few minutes', 'can find', 'the oven', 'used to', 'out the',
       'out there', 'during the', 'over the', 'bottom of', 'in his'],
      dtype='<U20')

In [22]:
model = LogisticRegression()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.7734719618841689

## 3. Textual Features with tfidf

In [23]:
# TFIDF add in the text features with tfidf
idf_dtm = TfidfVectorizer(min_df= 10,  max_features= 1000, strip_accents= "unicode", ngram_range=(1, 2))
idf_dtm.fit(data["boilerplate"])
data_text = idf_dtm.transform(data["boilerplate"])
train_text, val_text = train_test_split(data_text, test_size=0.5, train_size=0.5, random_state=234)
val_text, test_text = train_test_split(val_text, test_size=0.5, train_size=0.5, random_state= 675)

train_with_text = pd.concat([train.reset_index(drop = True), pd.DataFrame(train_text.toarray())], axis=1)
val_with_text = pd.concat([val.reset_index(drop = True), pd.DataFrame(val_text.toarray())], axis=1)
train_with_text.head()

Unnamed: 0,alchemy_cat_?,arts_entertainment,business,computer_internet,culture_politics,gaming,health,law_crime,recreation,religion,...,990,991,992,993,994,995,996,997,998,999
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.036841,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,0.006448,0.005204,0.0,0.0,0.0,0.025413,0.102643,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.11526,0.0,0.073187,0.0,0.0,0.034443,0.0,0.0,0.0
4,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
np.random.choice(idf_dtm.get_feature_names(), 10)

array(['added', 'heart', 'art', 'fact', 'cup', 'pastry', 'cup of', 'five',
       'is the', 'favorite'], dtype='<U13')

In [25]:
model = LogisticRegression()
model.fit(train_with_text, train_labels)
preds = model.predict_proba(val_with_text)[:,1]
score = roc_auc_score(val_labels, preds)
score

0.8622057009938479