In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import json
%matplotlib inline

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 300

## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender and was made available [here](https://www.kaggle.com/c/stumbleupon/download/train.tsv)

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonLinkRatio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonLinkRatio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?
- These are websites that always relevant like recipies or reviews (as opposed to current events)
- Look at some examples

In [2]:
data = pd.read_csv('../../assets/datasets/train.csv', sep='\t', na_values='?')

data['is_news'].fillna(0,inplace=True)
data['alchemy_category'].fillna("unknown",inplace=True)
data['alchemy_category_score'].fillna(0.400001,inplace=True)
data = data.drop("news_front_page", axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7395 entries, 0 to 7394
Data columns (total 26 columns):
url                               7395 non-null object
urlid                             7395 non-null int64
boilerplate                       7395 non-null object
alchemy_category                  7395 non-null object
alchemy_category_score            7395 non-null float64
avglinksize                       7395 non-null float64
commonlinkratio_1                 7395 non-null float64
commonlinkratio_2                 7395 non-null float64
commonlinkratio_3                 7395 non-null float64
commonlinkratio_4                 7395 non-null float64
compression_ratio                 7395 non-null float64
embed_ratio                       7395 non-null float64
framebased                        7395 non-null int64
frameTagRatio                     7395 non-null float64
hasDomainLink                     7395 non-null int64
html_ratio                        7395 non-null float64
image_

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# drop the words
# do dummies of the categoricals
# scale the numericals so we can look at ceofficients
X_norm = pd.get_dummies(data.drop(['label','url','urlid','boilerplate','alchemy_category'], axis = 1))
scaler = MinMaxScaler()
columns = X_norm.columns.values
X_norm = scaler.fit_transform(X_norm)
y = data['label']
# logit regression does regularization allows you to look at the coefficients with logit
logreg = LogisticRegression()
logreg.fit(X_norm,y)

coeffs = pd.DataFrame(logreg.coef_, columns = columns)
coeffs.T


Unnamed: 0,0
alchemy_category_score,-0.599013
avglinksize,0.295322
commonlinkratio_1,0.668745
commonlinkratio_2,0.13677
commonlinkratio_3,1.872509
commonlinkratio_4,-0.768308
compression_ratio,-0.24962
embed_ratio,-0.235
framebased,0.0
frameTagRatio,-2.744576


In [14]:
# Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import chi2

X_norm = pd.DataFrame(X_norm, columns = columns)
selector = SelectKBest(f_classif, k=5)
selected_data = selector.fit_transform(X_norm, y)

kbest_columns = selector.get_support()
kbest_columns = X_norm.columns[kbest_columns]
Xbest = pd.DataFrame(selected_data, columns = kbest_columns)
Xbest.head()

Unnamed: 0,commonlinkratio_2,commonlinkratio_3,frameTagRatio,linkwordscore,non_markup_alphanum_characters
0,0.205882,0.048,0.204241,0.24,0.026083
1,0.28877,0.218182,0.222092,0.4,0.023914
2,0.321705,0.122558,0.163008,0.55,0.010772
3,0.1,0.017,0.215686,0.24,0.013162
4,0.222222,0.125926,0.056044,0.14,0.05786


In [23]:
#Recursive feature elimination
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# Commented out to not run twice since it took time
# svc = SVC(kernel="linear", C=1)
# rfe = RFE(estimator=svc, n_features_to_select=10, step=1)
# rfe.fit(X_norm, y)

rfe_cols = rfe.support_
rfe_cols = X_norm.columns[rfe_cols].values
print "Features to keep with RFE"
print rfe_cols

Features to keep with RFE
['avglinksize' 'commonlinkratio_1' 'commonlinkratio_3' 'commonlinkratio_4'
 'frameTagRatio' 'image_ratio' 'linkwordscore'
 'non_markup_alphanum_characters' 'numberOfLinks' 'spelling_errors_ratio']


In [24]:
# Feature engineering to construct new features
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(
    binary=True,  # Create binary features
    stop_words='english', # Ignore common words such as 'the', 'and'
    max_features=50, # Only use the top 50 most common words
)

# This builds a matrix with a row per website (or data point) and column per word (using all words in the dataset)
X2 = v.fit_transform(data['boilerplate']).todense()
X2 = pd.DataFrame(X2, columns=v.get_feature_names())
X2.head()


Unnamed: 0,10,2011,add,best,body,bowl,com,cup,day,don,easy,food,good,great,high,html,ingredients,just,know,large,let,like,little,ll,love,make,minutes,need,new,people,place,really,recipe,recipes,right,salt,small,sugar,think,time,title,url,use,used,using,ve,want,way,work,world
0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1
1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,1
2,0,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,1,1,1,0,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,1,1


In [31]:
# Train a logit model
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cross_validation import train_test_split, cross_val_score

def modeler(features):
    X = data[features]
    y = data['label']

    logreg = LogisticRegression()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred)
    idx = ['Green', 'Not Green']
    col = ['Predicted Green', 'Predicted not Green']
    cmdf = pd.DataFrame(cm, index=idx, columns=col)
    print cmdf
    print classification_report(y_test, y_pred)


print "WITH SELECT K BEST ================================"
modeler(kbest_columns)

print "WITH RFE ================================"
modeler(rfe_cols)

print "JUST USING BOILERPLATE DUMMIES ================================"
modeler(rfe_cols)

           Predicted Green  Predicted not Green
Green                  731                  467
Not Green              553                  690
             precision    recall  f1-score   support

          0       0.57      0.61      0.59      1198
          1       0.60      0.56      0.57      1243

avg / total       0.58      0.58      0.58      2441

           Predicted Green  Predicted not Green
Green                  726                  472
Not Green              551                  692
             precision    recall  f1-score   support

          0       0.57      0.61      0.59      1198
          1       0.59      0.56      0.57      1243

avg / total       0.58      0.58      0.58      2441

