In [27]:
import random
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler

%matplotlib inline


### Goal of work sheet
- Optimize linear models for training on brand / device
- V1: Select best data for model (onehotencoded both, separate, or just one)
    - Brands onehot, device label: 2.40215702862 (five seeds: 2.4017680305744418)
    - Separate onehot encoded:  2.39089229204 (five seeds: 2.3901993860642716)
    - Combined onehot encoded: 2.39583615824 (five seeds: 2.3952347531028977)
- V2: Used a couple of creative features based on brand and device model; also included the brand when encoding the device
    - Without scaler: 2.39122949113 (five seeds:)
    - With scaler:

In [28]:
# Directory settings: data_dir points at the original data files,
# feat_dir at the feature files read by open_feature_file.
data_dir, feat_dir = './data_ori/', './data/'

In [43]:
def open_feature_file(fname, samples='train'):
    """Load one feature set from ``feat_dir``.

    Parameters
    ----------
    fname : str
        Either a file name ending in ``'csv'`` or the stem of a pickle file.
    samples : str
        ``'train'`` aligns CSV features to the devices in ``gatrain``; any
        other value aligns them to ``gatest``. For pickles the value is used
        to build the file name ``'<fname>_<samples>.pickle'``.

    Returns
    -------
    For a CSV file, a ``csr_matrix`` with one row per device (in the order of
    the chosen device table) and one column per feature. Devices missing from
    the CSV get zeros. For a pickle, whatever object the file contains
    (presumably a sparse matrix, since callers pass it to ``hstack`` - confirm).
    """
    if fname.endswith('csv'):
        devices = gatrain if samples == 'train' else gatest
        features = pd.read_csv(os.path.join(feat_dir, fname))

        # Left merge keeps the row order of the device table.
        X = devices[['device_id']].merge(features, on='device_id', how='left')
        X = X.drop('device_id', axis=1).fillna(0)

        # Standardise each column whose maximum exceeds 1; the rest are left
        # untouched (presumably indicator features - confirm).
        # The scaler is fitted on that single column only. Fitting on the
        # whole frame returns a 2-D array that cannot be stored in one column.
        # NOTE(review): train and test are scaled with separately fitted
        # statistics because each call fits its own scaler.
        for c in X.columns:
            if X[c].max() > 1:
                column = X[[c]].values.astype(float)
                X[c] = StandardScaler().fit_transform(column).ravel()

        return csr_matrix(X.values)

    # Assume it is a pickle file.
    # NOTE: pickle.load can execute arbitrary code; only open feature files
    # that were generated locally.
    with open(os.path.join(feat_dir, '{}_{}.pickle'.format(fname, samples)), 'rb') as f:
        return pickle.load(f)
            
# Feature sets that are stacked column-wise into the design matrix.
# Entries ending in 'csv' are read as CSV; the others are pickle stems.
feature_files = [
    'features_brand_bag',
    'features_brand_model_bag',
    'features_brand_model.csv',
]

### Loading and preparing data

In [44]:
# Device tables for the train and test sets, read from the configured
# data_dir instead of a second hard-coded copy of the same path.
gatrain = pd.read_csv(os.path.join(data_dir, 'gender_age_train.csv'))
gatest = pd.read_csv(os.path.join(data_dir, 'gender_age_test.csv'))
#train = pd.merge(gatrain, brand, on='device_id', how='inner')

In [45]:
#print train.shape
#train.head()

In [46]:
# Load every feature set and stack them side by side into one CSR matrix,
# first for the train devices and then for the test devices.
train_blocks = [open_feature_file(name) for name in feature_files]
Xtrain = hstack(train_blocks, format='csr')

test_blocks = [open_feature_file(name, 'test') for name in feature_files]
Xtest = hstack(test_blocks, format='csr')

# Target column of the training table.
y = gatrain['group']

In [48]:
# Sanity check: feature matrix and target must have the same number of rows.
# Written with format() so the output is the same on Python 2 and Python 3.
print('X {}'.format(Xtrain.shape))
print('y {}'.format(y.shape))
#X.head()

 X (74645, 1803)
y (74645,)


In [49]:
# Encode the target labels as integers; the fitted encoder is kept so the
# class labels stay available through letarget.classes_.
letarget = LabelEncoder()
y = letarget.fit_transform(y)
n_classes = len(letarget.classes_)

## Logistic regression

In [52]:
# Grid search over the regularisation strength C of a logistic regression,
# scored with 10-fold stratified cross-validation.
rs = 123
kf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=rs)

log = LogisticRegression()
# The original C expression was left half-edited and did not parse. This grid
# is restored from the candidates in the recorded run (0.05, 0.1, 0.2, 0.3).
param_grid = {'C': [0.05, 0.1, 0.2, 0.3], 'penalty': ['l2']}
# The 'log_loss' scorer reports the negated loss (greater_is_better=False),
# so the best score is the one closest to zero.
clf = GridSearchCV(log, param_grid, scoring='log_loss', n_jobs=5, cv=kf, verbose=10)
clf.fit(Xtrain, y)
print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
print("With parameters:")

best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] penalty=l2, C=0.05 ..............................................
[CV] penalty=l2, C=0.05 ..............................................
[CV] penalty=l2, C=0.05 ..............................................
[CV] .................... penalty=l2, C=0.05, score=-2.392085 -  18.4s
[CV] penalty=l2, C=0.05 ..............................................
[CV] .................... penalty=l2, C=0.05, score=-2.394972 -  18.6s
[CV] .................... penalty=l2, C=0.05, score=-2.393203 -  19.0s


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   19.0s


[CV] penalty=l2, C=0.05 ..............................................
[CV] penalty=l2, C=0.05 ..............................................
[CV] .................... penalty=l2, C=0.05, score=-2.389851 -  18.7s
[CV] penalty=l2, C=0.05 ..............................................
[CV] .................... penalty=l2, C=0.05, score=-2.389458 -  18.6s
[CV] penalty=l2, C=0.05 ..............................................
[CV] .................... penalty=l2, C=0.05, score=-2.394062 -  19.1s
[CV] penalty=l2, C=0.05 ..............................................
[CV] .................... penalty=l2, C=0.05, score=-2.393676 -  18.3s
[CV] penalty=l2, C=0.05 ..............................................


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   56.0s


[CV] .................... penalty=l2, C=0.05, score=-2.392188 -  19.0s
[CV] penalty=l2, C=0.1 ...............................................
[CV] .................... penalty=l2, C=0.05, score=-2.396310 -  18.9s
[CV] penalty=l2, C=0.1 ...............................................
[CV] .................... penalty=l2, C=0.05, score=-2.390236 -  19.5s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.390634 -  21.8s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.391585 -  23.0s


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  1.3min


[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.393050 -  22.2s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.388310 -  22.0s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.388095 -  22.4s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.392278 -  23.9s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.392762 -  24.4s
[CV] penalty=l2, C=0.1 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.390790 -  23.8s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.1, score=-2.395926 -  21.3s
[CV] p

[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  2.4min


[CV] ..................... penalty=l2, C=0.1, score=-2.388868 -  21.8s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.391834 -  25.8s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.391251 -  27.5s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.392356 -  27.5s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.387976 -  27.1s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.387883 -  27.7s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.391677 -  27.8s
[CV] penalty=l2, C=0.2 ...............................................


[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  3.4min


[CV] ..................... penalty=l2, C=0.2, score=-2.393337 -  27.0s
[CV] penalty=l2, C=0.2 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.390518 -  27.4s
[CV] penalty=l2, C=0.3 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.397333 -  26.9s
[CV] penalty=l2, C=0.3 ...............................................
[CV] ..................... penalty=l2, C=0.2, score=-2.388844 -  28.2s
[CV] penalty=l2, C=0.3 ...............................................
[CV] ..................... penalty=l2, C=0.3, score=-2.392474 -  30.4s
[CV] penalty=l2, C=0.3 ...............................................
[CV] ..................... penalty=l2, C=0.3, score=-2.392726 -  30.6s
[CV] penalty=l2, C=0.3 ...............................................
[CV] ..................... penalty=l2, C=0.3, score=-2.393019 -  29.0s
[CV] penalty=l2, C=0.3 ...............................................
[CV] .

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  4.8min


[CV] penalty=l2, C=0.3 ...............................................
[CV] ..................... penalty=l2, C=0.3, score=-2.392073 -  28.4s
[CV] penalty=l2, C=0.3 ...............................................


[Parallel(n_jobs=3)]: Done  41 out of  40 | elapsed:  4.9min remaining:   -7.1s


[CV] ..................... penalty=l2, C=0.3, score=-2.394745 -  29.7s
[CV] penalty=l2, C=0.3 ...............................................


[Parallel(n_jobs=3)]: Done  41 out of  40 | elapsed:  5.3min remaining:   -7.7s


[CV] ..................... penalty=l2, C=0.3, score=-2.391605 -  30.4s


[Parallel(n_jobs=3)]: Done  41 out of  40 | elapsed:  5.3min remaining:   -7.8s


[CV] ..................... penalty=l2, C=0.3, score=-2.398917 -  29.8s


[Parallel(n_jobs=3)]: Done  41 out of  40 | elapsed:  5.4min remaining:   -7.9s


[CV] ..................... penalty=l2, C=0.3, score=-2.389977 -  19.4s


[Parallel(n_jobs=3)]: Done  41 out of  40 | elapsed:  5.6min remaining:   -8.2s
[Parallel(n_jobs=3)]: Done  40 out of  40 | elapsed:  5.6min finished


Best score:-2.39122949113 with scorer make_scorer(log_loss, greater_is_better=False, needs_proba=True)
With parameters:
	C: 0.1
	penalty: 'l2'


In [None]:
# Out-of-fold predictions of the selected model, repeated for several
# cross-validation seeds so the estimate is less dependent on one split.
n_folds = 10
scores = {}  # seed -> out-of-fold class-probability matrix (not a scalar score)
for s in [0, 12, 123, 1234, 12345]:
    kf = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=s)
    pred_l = np.zeros((Xtrain.shape[0], n_classes))
    for c, (itrain, itest) in enumerate(kf, 1):
        print('%d / %d' % (c, n_folds))
        clf = LogisticRegression(C=0.1, penalty='l2')
        clf.fit(Xtrain[itrain, :], y[itrain])
        # Each row is predicted by a model that did not see it in training.
        pred_l[itest, :] = clf.predict_proba(Xtrain[itest, :])
    print(log_loss(y, pred_l))
    scores[s] = pred_l

1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39077973471
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39090123898
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39122949113
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39101582789
1 / 10
2 / 10
3 / 10


In [66]:
# Blend the per-seed out-of-fold predictions by averaging them, so each row
# is still a probability distribution when it is scored. A plain sum hands
# log_loss rows that add up to the number of seeds.
# NOTE(review): re-run this cell; the stored output equals the five-seed
# figure listed for "Combined onehot encoded" in the worksheet notes.
log_loss(y, sum(scores.values()) / float(len(scores)))

2.3952347531028977