In [283]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix, hstack, vstack
from functools import lru_cache

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Ridge
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.base import BaseEstimator
from sklearn.ensemble import BaggingClassifier

In [None]:
RAND = 123
PATH_TO_DATA = 'data/'
PATH_TO_SUBMIT = 'submissions/'

In [None]:
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id')

In [None]:
print(len(train_df))
print(len(test_df))

In [None]:
with open(PATH_TO_DATA + "site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)
site_by_ind = [None] * (len(site_dict) + 1)
for site, id in site_dict.items():
    site_by_ind[id] = site

In [None]:
times_name = ['time%s' % i for i in range(1, 11)]
train_df[times_name] = train_df[times_name].apply(pd.to_datetime)
test_df[times_name] = test_df[times_name].apply(pd.to_datetime)

In [None]:
sites_name = ['site{}'.format(i) for i in range(1, 11)]
train_df[sites_name] = train_df[sites_name].fillna(0).astype('int')
test_df[sites_name] = test_df[sites_name].fillna(0).astype('int')

In [101]:
train_df = train_df.sort_values(by='time1')
test_df = test_df.sort_values(by='time1')

In [None]:
y = train_df['target']
train_df = train_df.drop('target', axis=1)

In [None]:
train_df.head()

In [None]:
small_df, small_y = train_df[4000:4500], y[4000:4500]
sum(small_y)

In [191]:
def write_to_submission_file(predicted_labels, out_file, index=None,
                             target='target', index_label="session_id"):
    if index is None:
        index = np.arange(1, predicted_labels.shape[0] + 1)
    predicted_df = pd.DataFrame(predicted_labels,
                                index = index,
                                columns=[target])
    predicted_df.to_csv(PATH_TO_SUBMIT + '/' + out_file, index_label=index_label)
    
def submit(model, X, y, X_test, name='submit'):
    for ind in range(1, 1000):
        name_with_ind = name + '_' + str(ind)
        if name_with_ind not in os.listdir(PATH_TO_SUBMIT):
            name = name_with_ind
            break
    print(name)
    X, X_test, transformers = train_test_features(X, y, X_test)
    model.fit(X, y)
    predict = model.predict_proba(X_test)[:,1]
    log(X=X, model=model, status='submit {}'.format(name))
    write_to_submission_file(predict, name, index=X_test.index)

In [63]:
def generete_bag_of_sites(data):
    full_sites = data[sites_name]
    sites_flatten = full_sites.values.flatten()
    full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                    sites_flatten,
                                    range(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]
    return full_sites_sparse

def generete_text_of_sites(data):
    full_sites = data[sites_name]
    return full_sites.apply(lambda x: '#'.join(site_by_ind[site] 
                                               for site in filter(lambda y: y, x)), axis=1)

nano_in_min = 10**9 * 60
bool_to_int = {True: 1, False: 0}

def one_hot_smooth_encode(val, max_val):
    result = np.zeros(max_val)
    ival = int(val)
    result[ival] += 1 - (val - ival)
    result[(ival + 1) % max_val] += (val - ival)
    return pd.Series(result)

def encode_time(data, smooth=3, max_time=24, prefix=''):
    max_val = (max_time + smooth - 1) // smooth
    
    def encode(x):
        return one_hot_smooth_encode(x / smooth, max_val)
    
    result = data.apply(encode)
    result = result.rename(columns={ind: prefix + str(ind) 
                                    for ind in range(max_val)})
    return result

def generate_cat_time_features(data):
    full_times = data[times_name]
    start_time = full_times['time1']
    features = pd.DataFrame(index=data.index)
    
    hours = start_time.apply(lambda x: x.hour + x.minute / 60)
    
    features['is_morning'] = ((hours > 6) & (hours <= 11)).map(bool_to_int)
    features['is_day'] = ((hours > 11) & (hours <= 17)).map(bool_to_int)
    features['is_evning'] = ((hours > 17) & (hours <= 24)).map(bool_to_int)
    features['is_nignt'] = ((hours >= 0) & (hours <= 6)).map(bool_to_int)
    
    smooth_hours = 1
    features = pd.concat([features, 
                          encode_time(hours, 
                                      prefix='smooth{}_hours_'.format(smooth_hours), 
                                      smooth=smooth_hours)], axis=1)
     
    #month = full_times['time1'].apply(lambda x: x.month)
    #features = pd.concat([features, 
    #                      encode_time(month - 1, prefix='smooth2_month_', smooth=4, max_time=12)], axis=1)
    
    dayofweek = pd.get_dummies(start_time.apply(lambda x: x.dayofweek), prefix='day')
    features = pd.concat([features, dayofweek], axis=1)
    
    features['is_day_off'] = (5 <= start_time.apply(lambda x: x.dayofweek)).map(bool_to_int)
    
    features.DESCRIPTION = ' '.join(features.columns)
    return features

def generate_count_time_features(data):
    full_times = data[times_name]
    start_time = full_times['time1']
    features = pd.DataFrame(index=data.index)

    features['month'] = start_time.apply(lambda x: x.month)
    #features['year'] = start_time.apply(lambda x: x.year + x.month / 12)
    features['trand_10000_100_1'] = start_time.apply(lambda x: x.year * 10000 + x.month * 100 + x.day)
    #features['trand_log_10000_100_1'] = start_time.apply(lambda x: np.log(x.year * 10000 + x.month * 100 + x.day))
    #features['trand_1000_100_1'] = start_time.apply(lambda x: x.year * 1000 + x.month * 100 + x.day)
    
    seconds_from_start = train_df[times_name].\
        applymap(lambda x: np.nan if pd.isnull(x) else x.value)
    features['duration'] = (seconds_from_start.max(1) - seconds_from_start.min(1)) / nano_in_min
    #features['sites_count'] = (~full_times.isnull()).sum(1)
    #features['mean_duration'] = features['duration'] / features['sites_count']
    
    features.DESCRIPTION = ' '.join(features.columns)
    return features

def generate_y_prob(text, y, vocab, alpha=10):
    n = len(vocab)
    good_count = np.zeros(n + 1)
    count = np.zeros(n + 1)
    for session, is_good in zip(text, y):
        for site in session.split('#'):
            ind = vocab.get(site, n)
            count[ind] += 1
            if is_good == 1:
                good_count[ind] += 1
    global_mean = good_count.sum() / count.sum()
    prob = (good_count + global_mean * alpha) / (count + alpha)
    return csr_matrix([prob[:-1]])

def generate_all_features(data, y=None, transformers=None, X_test=None):
    description = ''
    X_cat_time = generate_cat_time_features(data)
    X_count_time = generate_count_time_features(data)
    X_text = generete_text_of_sites(data)
    
    description += 'Features_cat_time: {}\n'.format(X_cat_time.DESCRIPTION)
    description += 'Features_count_time: {}\n'.format(X_count_time.DESCRIPTION)
    
    if not transformers:
        transformers = {}
    if 'vecorizer' not in transformers:
        #transformers['vecorizer'] = TfidfVectorizer(max_features=10000,
        #                                            analyzer='word', 
        #                                            token_pattern='[^#]+').fit(X_text)
        #vocab = transformers['vecorizer'].vocabulary_
        #transformers['y_prob'] = generate_y_prob(X_text, y, vocab)
        
        if X_test is not None:
            text_test = generete_text_of_sites(X_test)
            all_text = pd.concat((X_text, text_test))
        else:
            all_text = X_text
        transformers['vecorizer'] = TfidfVectorizer(max_features=8500,
                                                    analyzer='word', 
                                                    token_pattern='[^#]+').fit(all_text)
    X_text_csr = transformers['vecorizer'].transform(X_text)
    description += '{}: {}\n'.format('vecorizer Train+Test', transformers['vecorizer'])
    
    
    if 'TfidfTransformer' not in transformers:
        if X_test is not None:
            cat_test = generate_cat_time_features(X_test)
            all_cat = pd.concat((X_cat_time, cat_test))
        else:
            all_cat = X_cat_time
        transformers['TfidfTransformer'] = TfidfTransformer().fit(X_cat_time)  
    X_cat_time_csr = csr_matrix(transformers['TfidfTransformer'].transform(X_cat_time))
    description += '{}: {}\n'.format('transformer X_cat_time', transformers['TfidfTransformer'])
    
    
    if 'scaler' not in transformers:
        transformers['scaler'] = StandardScaler().fit(X_count_time)
    X_count_time_csr = csr_matrix(
        transformers['scaler'].transform(X_count_time))
    description += '{}: {}\n'.format('scaler for count', transformers['scaler'])

    #X_text_csr = X_text_csr.multiply(transformers['y_prob'])
    #description += 'y_prob\n'
    
    result = hstack((X_cat_time_csr, X_count_time_csr, X_text_csr)).tocsc()
    
    result.DESCRIPTION = description
    result.ANOTATION = X_cat_time.columns.tolist()
    result.ANOTATION += X_count_time.columns.tolist()
    return result, transformers

In [135]:
TEST_SIZE = 0.5
    

def log(file_name='temp_results.txt', last_action_file_name='last_action.txt',
        X=None, model=None, score=None, status=None):
    log_str = 'Status: {}\n'.format(status)
    if score is not None:
        log_str += 'Score = {}\n\n'.format(score)
    if model is not None:
        log_str += 'Model:\n{}\n\n'.format(model)
    if X is not None:
        log_str += 'Features:\n{}\n'.format(X.DESCRIPTION)
    
    with open(file_name, 'a') as fl:
        fl.write('-'*100 + '\n');
        fl.write(log_str)
    
    with open(last_action_file_name, 'w') as fl:
        fl.write(log_str)
        
    return log_str


def train_test_features(X_train, y_train, X_test):
    X_train, transformers = generate_all_features(X_train, y=y_train, X_test=X_test)
    X_test, transformers = generate_all_features(X_test, transformers=transformers)
    return X_train, X_test, transformers


def train_test_features_split(X, y, return_transformers=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=TEST_SIZE)
    X_train, X_test, transformers = train_test_features(X_train, y_train, X_test)
    if return_transformers:
        return X_train, X_test, y_train, y_test, transformers
    else:
        return X_train, X_test, y_train, y_test


def score(model, X, y, X_test=None, y_test=None, write_log=True):
    model.fit(X, y)
    
    if X_test is None and y_test is None: 
        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=TEST_SIZE)
        X_train, X_test, _ = train_test_features(X_train, y_train, X_test)
        
    predict = model.predict_proba(X_test)[:,1]
    auc_score = roc_auc_score(y_test, predict)
    
    if write_log:
        log(model=model, score=auc_score, status='score', X=X)
        
    return auc_score


def cross_score(model, X, y, n_splits=8, n_test=3, write_log=True):
    l = len(y) // n_splits
    auc_score = np.array([score(model, 
                                X[i*l:(i+n_test)*l], y[i*l:(i+n_test)*l],
                                X[(i+n_test+1)*l:], y[(i+n_test+1)*l:],
                                write_log=False)
                          for i in range(n_splits - n_test - 1)])
    if write_log:
        log(model=model, score=auc_score, status='score my fold: {}'.format((n_splits, n_test)), X=X)
    return auc_score

In [68]:
%%time
X, transformers = generate_all_features(train_df, y=y, X_test=test_df)

CPU times: user 1min 37s, sys: 1.17 s, total: 1min 38s
Wall time: 1min 38s


In [119]:
%%time
model = LogisticRegression(C=2, random_state=RAND)
#model = BaggingClassifier(model, n_estimators=20, n_jobs=-1, random_state=RAND)
arr_score = cross_score(model, X, y, n_splits=7, n_test=2)

new_score = arr_score.mean()
print(new_score)
print(arr_score)

print('Diff with last: {}'.format(new_score - last_score))
print('Diff with best: {}'.format(new_score - last_best_score))
last_score = new_score
if last_best_score < new_score:
    last_best_score = new_score

0.943312635078
[ 0.91901113  0.94178664  0.95589899  0.95655378]
Diff with last: 0.014548849201931224
Diff with best: -0.006638073251232335
CPU times: user 4.14 s, sys: 1.29 s, total: 5.43 s
Wall time: 3.17 s


In [None]:
0.949931835196
[ 0.9407109   0.94513859  0.95732408  0.95655378]
Diff with last: 0.0
Diff with best: 0.0
CPU times: user 4.75 s, sys: 1.95 s, total: 6.7 s
Wall time: 3.55 s


In [1]:
new_features = X.ANOTATION
count_features = len(new_features)
if type(model) is LogisticRegression:
    coef = model.coef_[0]
else:
    coef = np.array([m.coef_[0] for m in model.estimators_]).mean(0)
argsort_coef = np.argsort(np.abs(coef))
argsort_features = np.argsort(np.abs(coef[:count_features]))[::-1]

print('Total', len(coef))
for ind_feature in argsort_features:
    name = new_features[ind_feature]
    value = coef[ind_feature]
    num = argsort_coef[ind_feature]
    print('{:7.2f} {:5} {}'.format(value, num, name))

NameError: name 'X' is not defined

In [None]:
submit(model, train_df, y, test_df)