In [1]:
# Import libraries and set desired options
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
# !pip install eli5
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [2]:
# Directory holding the input files that later cells join onto this prefix
# (train_sessions.csv, test_sessions.csv, site_dic.pkl).
PATH_TO_DATA = '../data/alice/'
# Seed handed to LogisticRegression(random_state=...) below.
SEED = 17
# Cross-validation splitter with 10 splits; passed as `cv` to the training helper.
time_split = TimeSeriesSplit(n_splits=10)
# Baseline classifier reused by every training cell.
logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

In [62]:
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    """Build TF-IDF site features and the companion frames needed downstream.

    Parameters
    ----------
    path_to_train, path_to_test : str
        CSV files indexed by ``session_id`` with columns ``site1..site10`` and
        ``time1..time10``. The training file must also contain ``target``.
    path_to_site_dict : str
        Pickle file with the site-name -> site-id dictionary.
    vectorizer_params : dict
        Keyword arguments forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, vectorizer, train_times, test_times,
        train_sites, test_sites, top_alice_sites)`` where

        * ``X_train`` / ``X_test`` are the vectorizer outputs for the sessions,
        * ``y_train`` is the integer target array (training rows sorted by ``time1``),
        * ``vectorizer`` is the fitted ``TfidfVectorizer``,
        * ``*_times`` / ``*_sites`` are the ``time1..time10`` / ``site1..site10``
          column slices of the train and test frames,
        * ``top_alice_sites`` is a Series with the 15 most frequent site ids
          among sessions whose ``target`` equals 1 (index: site id, value: count).
    """
    times = ['time%s' % i for i in range(1, 11)]
    sites = ['site%s' % i for i in range(1, 11)]

    train_df = pd.read_csv(path_to_train,
                           index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test,
                          index_col='session_id', parse_dates=times)

    # Sort the training sessions by their start time; the test order is left untouched.
    train_df = train_df.sort_values(by='time1')

    # Read the site -> id mapping.
    # NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # Inverse id -> site mapping; id 0 (used to fill missing visits) means "unknown".
    id2site = {v: k for (k, v) in site2id.items()}
    id2site[0] = 'unknown'

    def sessions_to_text(df):
        """Turn each session row into a space-separated string of site names."""
        site_ids = df[sites].fillna(0).astype('int').to_numpy().tolist()
        return [' '.join(id2site[site_id] for site_id in row) for row in site_ids]

    # Sessions are represented with site names rather than ids: less efficient,
    # but it makes the model weights directly readable.
    train_sessions = sessions_to_text(train_df)
    test_sessions = sessions_to_text(test_df)

    # Most visited site ids within the positive-class sessions (NaN visits are
    # dropped by value_counts).
    top_alice_sites = pd.Series(train_df[train_df['target'] == 1][sites].values.flatten()
                               ).value_counts().sort_values(ascending=False).head(15)

    # Tokenization is controlled by the caller through vectorizer_params
    # (e.g. a whitespace-only tokenizer keeps 'mail.google.com' as one token).
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values

    # Visit times and raw site ids are returned for further feature engineering.
    train_times, test_times = train_df[times], test_df[times]
    train_sites, test_sites = train_df[sites], test_df[sites]

    return X_train, X_test, y_train, vectorizer, train_times, test_times, train_sites, test_sites, top_alice_sites

In [63]:
%%time
# Build the TF-IDF site matrices, the target vector and the time/site frames
# that the feature-engineering cells below consume.
X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times, train_sites, test_sites, top_alice_sites = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    # n-grams of 1 to 5 tokens, capped at 50000 features; the tokenizer splits on
    # whitespace only, so a site name containing dots stays a single token.
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)


Wall time: 34.6 s


In [64]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Save predictions as a CSV whose index counts rows from 1.

    predicted_labels: array with one prediction per row (its ``shape[0]`` sets
        the number of rows written).
    out_file: destination path handed to ``DataFrame.to_csv``.
    target: header of the prediction column.
    index_label: header of the index column.
    """
    row_count = predicted_labels.shape[0]
    # Row ids are 1-based: 1, 2, ..., row_count.
    row_ids = np.arange(1, row_count + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)
    
def train_and_predict(model, X_train, y_train, X_test, site_feature_names=None,
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Cross-validate ``model``, refit it on all training data and write test predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict_proba`` and ``coef_``.
    X_train, y_train : training features and labels.
    X_test : test features scored after the final fit.
    site_feature_names : sequence of str, optional
        Names of the site (TF-IDF) columns. When omitted they are read from the
        notebook-level ``vectorizer`` at call time.
    new_feature_names : sequence of str, optional
        Names of the extra columns appended after the site columns.
    cv, scoring : forwarded to ``cross_val_score``.
    top_n_features_to_show : number of weights shown by ``eli5.show_weights``.
    submission_file_name : path of the CSV written with the test probabilities.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores.
    """
    if site_feature_names is None:
        # Resolve the names lazily: a default evaluated in the signature would be
        # frozen at definition time and go stale once the vectorizer is refitted.
        # Newer scikit-learn exposes get_feature_names_out; older releases only
        # have get_feature_names.
        names_getter = getattr(vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = vectorizer.get_feature_names
        site_feature_names = names_getter()

    has_new_features = new_feature_names is not None and len(new_feature_names) > 0

    cv_scores = cross_val_score(model, X_train, y_train, cv=cv,
                                scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)

    # Copy into plain lists so concatenation also works when the names arrive
    # as numpy arrays (where `+` would mean element-wise addition).
    all_feature_names = list(site_feature_names)
    if has_new_features:
        all_feature_names += list(new_feature_names)

    display_html(eli5.show_weights(estimator=model,
                  feature_names=all_feature_names, top=top_n_features_to_show))

    if has_new_features:
        print('New feature weights:')

        # The new features occupy the trailing columns, hence the trailing coefficients.
        print(pd.DataFrame({'feature': new_feature_names,
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))

    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name)

    return cv_scores

In [65]:
%%time
# Baseline run: site TF-IDF features only; predictions are written to subm1.csv.
cv_scores1 = train_and_predict(model=logit,
                               X_train=X_train_sites,
                               y_train=y_train,
                               X_test=X_test_sites,
                               site_feature_names=vectorizer.get_feature_names(),
                               cv=time_split, submission_file_name='subm1.csv')

CV scores [0.83124023 0.65993466 0.85673565 0.92824237 0.84779639 0.88954524
 0.88829128 0.8771044  0.92023038 0.92624125]
CV mean: 0.8625361862232206, CV std: 0.0745567081201155


Weight?,Feature
+5.880,youwatch.org
+5.380,cid-ed6c3e6a5c6608a4.users.storage.live.com
+5.222,fr.glee.wikia.com
+5.114,vk.com
+4.875,www.info-jeunes.net
+4.499,www.banque-chalus.fr
+4.220,www.express.co.uk
+4.147,www.audienceinsights.net
+4.089,www.melty.fr
+4.003,glee.hypnoweb.net


Wall time: 13.3 s


-----------------------------------------

In [66]:
def add_features(times, sites, X_sparse, top_alice_sites):
    """Append hand-made time and site features to a sparse session matrix.

    Parameters
    ----------
    times : DataFrame
        Visit timestamps per session; must contain a datetime column ``time1``.
    sites : DataFrame
        Site ids per session; only ``site1`` is used.
    X_sparse : sparse matrix
        Existing features, one row per session.
    top_alice_sites : Series
        Its index holds the site ids counted as "top" sites.

    Returns
    -------
    (X, feature_names)
        ``X`` is ``X_sparse`` with eight extra columns stacked on the right and
        ``feature_names`` lists their names in column order.

    Notes
    -----
    NOTE(review): the scalers are fitted inside every call, so calling this once
    for train and once for test standardizes the two sets with different
    statistics. Confirm whether that is intended.
    """
    session_start = times['time1']

    # Part-of-day indicators derived from the hour of the first visit.
    hour = session_start.dt.hour
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)
    night = ((hour >= 0) & (hour <= 6)).astype('int').values.reshape(-1, 1)

    # Session length in whole milliseconds (last visit minus first visit).
    # Floor-dividing by a 1 ms Timedelta yields integers regardless of how the
    # installed pandas version treats astype('timedelta64[ms]').
    session_span = times.max(axis=1) - times.min(axis=1)
    durations_ms = (session_span // pd.Timedelta(milliseconds=1)).values.reshape(-1, 1)
    durations = StandardScaler().fit_transform(durations_ms)

    day_of_week = session_start.dt.weekday.astype('int').values.reshape(-1, 1)

    # YYYYMM as a single integer, then standardized with its own scaler.
    year_month = (100 * session_start.dt.year + session_start.dt.month
                  ).astype('int').values.reshape(-1, 1)
    year_month = StandardScaler().fit_transform(year_month)

    # 1 when the first site of the session is one of the top sites, else 0.
    in_alice_top = sites['site1'].isin(top_alice_sites.index
                                       ).astype('int').values.reshape(-1, 1)

    # The stacking order must match feature_names.
    objects_to_hstack = [X_sparse, morning, day, evening, night, durations,
                         in_alice_top, day_of_week, year_month]
    feature_names = ['morning', 'day', 'evening', 'night', 'durations',
                     'in_alice_top', 'day_of_week', 'year_month']

    X = hstack(objects_to_hstack)
    return X, feature_names

In [67]:
%%time
# Extend the train and test site matrices with the engineered features.
# NOTE(review): add_features fits its StandardScaler inside each call, so the
# train and test matrices are standardized with different statistics.
X_train, new_feat_names = add_features(train_times, train_sites, X_train_sites, top_alice_sites)
X_test, _ = add_features(test_times, test_sites, X_test_sites, top_alice_sites)

Wall time: 9.2 s


In [68]:
# Show the names of the engineered features, in the order they were stacked.
new_feat_names

['morning',
 'day',
 'evening',
 'night',
 'durations',
 'in_alice_top',
 'day_of_week',
 'year_month']

In [69]:
%%time

# Number embedded in the submission file name.
i = 1
# Candidate file name for this run's submission.
filename = f'submission_{i}.csv'

def get_next_filename(filename, path):
    """Return a file name that does not yet exist in directory ``path``.

    If ``filename`` is free it is returned unchanged. Otherwise the number at
    the end of its stem is incremented (``submission_1.csv`` ->
    ``submission_2.csv`` -> ...) until an unused name is found. A stem without
    a trailing number gets ``_1``, ``_2``, ... appended. If ``path`` does not
    exist there is nothing to collide with and ``filename`` is returned as is.
    """
    import re  # local import: only this helper needs it

    try:
        taken = set(os.listdir(path))
    except FileNotFoundError:
        return filename

    stem, ext = os.path.splitext(filename)
    trailing_number = re.search(r'(\d+)$', stem)
    if trailing_number:
        prefix = stem[:trailing_number.start()]
        counter = int(trailing_number.group(1))
    else:
        prefix, counter = stem + '_', 0

    # The counter is local: the function no longer relies on (or tries to
    # rebind) a notebook-level variable.
    candidate = filename
    while candidate in taken:
        counter += 1
        candidate = f'{prefix}{counter}{ext}'
    return candidate
    
# Keep the name returned by the helper; calling it without using the result
# leaves `filename` unchanged.
# NOTE(review): the name is checked against ./predictions/alice/ while
# train_and_predict writes submission_file_name relative to the working
# directory. Confirm which directory is intended.
filename = get_next_filename(filename, './predictions/alice/')
cv_scores2 = train_and_predict(model=logit, X_train=X_train, y_train=y_train, 
                               X_test=X_test, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names,
                               cv=time_split, submission_file_name=filename)

CV scores [0.83783436 0.81400389 0.94528804 0.96318223 0.91569796 0.9597666
 0.92653077 0.95361837 0.95777043 0.9699907 ]
CV mean: 0.9243683354386569, CV std: 0.051959739417870875


Weight?,Feature
+5.118,youwatch.org
+5.010,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.977,www.express.co.uk
+4.947,vk.com
+4.708,www.info-jeunes.net
+4.428,www.melty.fr
+4.384,www.audienceinsights.net
+4.383,fr.glee.wikia.com
+4.048,www.banque-chalus.fr
+3.927,api.bing.com


New feature weights:
        feature      coef
0       morning -3.059317
1           day  0.682459
2       evening -2.619846
3         night  0.000000
4     durations -0.260385
5  in_alice_top  0.091017
6   day_of_week -0.383268
7    year_month -0.444323
Wall time: 18.6 s


'submission_1.csv'

In [57]:
# Fold-by-fold comparison: True where the run with the new features scored higher.
cv_scores1 < cv_scores2
# add site features

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

Scored 0.95454 on the leaderboard.