# Experiments with sklearn
### ref: https://www.kaggle.com/c/avazu-ctr-prediction/discussion/12314

### Shuffle the training data, in the ``bash`` shell: 
### -- strip the header: ``tail -40429867 train.csv > train.csv``
### -- shuffle the rows, in place: ``shuf -o train.csv < train.csv``

## Build the model, tune the hyperparameters by cross-validation, archive the model

In [7]:
"""
======================================================
Out-of-core classification of  Avazu data
======================================================
wc count for train.csv 40428968
wc count for test.csv   4577465

Feature engineering
Feature hashing
SGD classifier with partial fit
invscaling with eta0 =4-8
l2 penalty
labels scaled and shifted to -1,1 as vowpal wabbit
"""

# Authors: Elena Cuoco <elena.cuoco@gmail.com>

# Avazu competitition using pandas and sklearn library
import numpy as np
import pandas as pd
from datetime import datetime, date, time
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss
from sklearn.feature_extraction import FeatureHasher
from sklearn import preprocessing 
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier #note: activate the xgboost environment
from lightgbm import LGBMClassifier


###############################################################################
# create matrix of hashed features and target labels vector


# set the number of hash bins
# 2**29 gets same result as 2**27, 2**30 produces memory error
# n_features = 2**27
n_features = 2**16


# build a preprocessing pipeline for feature hashing
preproc = Pipeline([('fh',FeatureHasher( n_features=n_features,input_type='string', non_negative=False))])

def hash_features(data):
    
    # shift and scale target values to [-1,1] instead of [0,1]
    # y_train = data['click'].values + data['click'].values-1
    y_train = data['click'].values
        
    # remove id and click columns
    data = data.drop(['id','click'], axis=1)
    
    # add engineered features related to datetime
    add_engineered_datetime_features = True
    if(add_engineered_datetime_features):    
        data['hour']=data['hour'].map(lambda x: datetime.strptime(str(x),"%y%m%d%H"))
        data['dayoftheweek']=data['hour'].map(lambda x:  x.weekday())
        data['day']=data['hour'].map(lambda x:  x.day)
        data['hour']=data['hour'].map(lambda x:  x.hour)
        
    # convert all features to str
    X_train = np.asarray(data.astype(str))
    
    # hash the features
    X_train = preproc.fit_transform(X_train)
    
    # target labels
    y_train = np.asarray(y_train).ravel()
    
    # return y_train,X_train
    # return y_train, X_train, data
    return y_train, X_train


###################################################################################
# build and train model


# set start time
start = datetime.now()

# file and folder paths
train_file = 'train_shuffled.csv'
model_path = './'

# chunk_size
chunk_size= 3e5

# positive class fraction
# positive_class_fraction = 0.17

# number of training examples
n_rows = chunk_size

# seed for random generator
seed = 99

# option to archive model
archive_model = False

# column names
header = ['id','click','hour','C1','banner_pos','site_id','site_domain','site_category','app_id','app_domain','app_category','device_id',\
        'device_ip','device_model','device_type','device_conn_type','C14','C15','C16','C17','C18','C19','C20','C21']


# list of regularization hyperparameter values
# C_range = [6, 3, 1]
C_range = [1]

# select classifier
classifier = 'lgb'

# loop over hyperparameter values
for hyperparameter in C_range:

    # build classifier
        
    if classifier == 'log':

        # 'sag' solver is recommended for large data sets, does L2 but not L1 regularization
        model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=hyperparameter, \
            fit_intercept=True, intercept_scaling=1, class_weight=None, \
            random_state=seed, solver='sag', max_iter=300, multi_class='ovr', \
            verbose=0, warm_start=False, n_jobs=-1)
        
    elif classifier == 'sgd':
        model = SGDClassifier(loss='log', tol=1.e-3, alpha=.0000001, penalty='l2',\
           shuffle=False,n_jobs=-1,random_state=seed,average=True,\
                          learning_rate='invscaling',power_t=0.5,eta0=4.0)

    elif classifier == 'xgb':
        params = {'learning_rate': 0.1, 'n_estimators': 100, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1, 'tree_method': 'hist'}
        model = XGBClassifier(**params)

    elif classifier == 'lgb':
        
        # set parameters
        params = {}
        params['boosting_type'] = 'gbdt'
        params['objective'] = 'binary'
        params['metric'] = 'binary_logloss'
        # params['num_iterations'] = 100 # default
        params['feature_fraction'] = 1.
        params['bagging_fraction'] = 1.
        params['nthreads'] = 8
        # params['scale_pos_weight'] = 1 #positive_class_fraction
        params['is_unbalance'] = False
        params['max_bin'] = 2^12
        params['n_estimators'] = 300
        #params['categorical_feature'] = ['hour','dayoftheweek','day','C1','banner_pos','site_id','site_domain','site_category','app_id','app_domain','app_category','device_id',\
        #'device_ip','device_model','device_type','device_conn_type','C14','C15','C16','C17','C18','C19','C20','C21']
        
        # parameter grid to use with cross-validation
        param_grid = {}
        param_grid['min_data_in_leaf'] = [30] 
        param_grid['max_depth'] = [-1] 
        param_grid['learning_rate'] = [0.03]
        param_grid['min_data_per_group'] = [5]
        param_grid['num_leaves'] = [100] # <= 2**max_depth
        param_grid['regression_l2'] = [0.]
       
        # build model
        model = LGBMClassifier(**params)
    
    else:
        raise Exception('Must specify classifier!')


    # train classifier: iterate over batches
    
    # initialize
    chunk=0
    
    # set up reader object for data input
    # input data file has previously been randomly shuffled
    reader = pd.read_table(train_file, sep=',', chunksize=chunk_size, names=header,header=None,\
           nrows = n_rows)

    for data in reader:

        # increment chunk counter
        chunk+=1

        # get next batch and preprocess
        #y_train, X_train, data = hash_features(data)
        y_train, X_train = hash_features(data)
        
        # data = data.apply(lambda x: x.astype('category'))
        
        # get positive class fraction
        # positive_class_fraction = sum(y_train==1)/len(y_train)
        
        # fraction of data to use for training -- rest is for evaluation
        train_fraction = 0.9
        
        # dev set for cross-validation: first train_fraction of training set
        y_dev = y_train[:int(train_fraction*n_rows)]
        X_dev = X_train[:int(train_fraction*n_rows),:]
        #X_dev = data.iloc[:int(train_fraction*n_rows),:]
    
        # test set for model evaluation: remaining 1 - train_fraction of training set
        y_test = y_train[int(train_fraction*n_rows):]
        X_test = X_train[int(train_fraction*n_rows):,:]
        # X_test = data.iloc[int(train_fraction*n_rows):,:]
        
        # Tune the hyperparameters by cross-validation
        
        print('Tuning hyper-parameters\n\n')
        clf = GridSearchCV(model, param_grid=param_grid, cv=5,
                           scoring='neg_log_loss')
        clf.fit(X_dev, y_dev)

        print('Best parameters set found on development set\n')
        print(clf.best_params_)
        print('\nGrid scores on development set:\n')
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))

        print('\nDetailed classification report:\n')
        print('The model is trained on the full development set {} % of the training samples'.format(100*train_fraction))
        print('The scores are computed on the full evaluation set {} % of the training samples\n'.format(100*(1-train_fraction)))
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        
        # predict targets
        y_prob = clf.best_estimator_.predict_proba(X_test)

        # estimate log_loss
        logloss = log_loss((y_test+1), y_prob)
    
        # Show progress
        print('elapsed time: %s, chunk: %d, log_loss: %f' % (str(datetime.now() - start), chunk, logloss))
    
        # archive the model file from training
        # !!!!! caution: model_file may be ~ GBs
        archive_model = True
        if(archive_model):
            model_file = model_path+'model-avazu-sgd.pkl'
            joblib.dump(clf.best_estimator_, model_file)

            # archive the preprocessing file from training
            preproc_file=model_path+'model-avazu-preproc.pkl'
            joblib.dump(preproc, preproc_file)


Tuning hyper-parameters


Best parameters set found on development set

{'learning_rate': 0.03, 'max_depth': -1, 'min_data_in_leaf': 30, 'min_data_per_group': 5, 'num_leaves': 100, 'regression_l2': 0.0}

Grid scores on development set:

-0.401 (+/-0.001) for {'learning_rate': 0.03, 'max_depth': -1, 'min_data_in_leaf': 30, 'min_data_per_group': 5, 'num_leaves': 100, 'regression_l2': 0.0}

Detailed classification report:

The model is trained on the full development set 90.0 % of the training samples
The scores are computed on the full evaluation set 9.999999999999998 % of the training samples



  if diff:


             precision    recall  f1-score   support

          0       0.84      0.99      0.91     24913
          1       0.62      0.07      0.13      5087

avg / total       0.80      0.84      0.78     30000

elapsed time: 0:04:07.954182, chunk: 1, log_loss: 0.398216
