In [55]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

In [44]:
# Training label table, indexed by device_id so that its row count can be
# used to size the feature matrices loaded below.
gender_age_train = pd.read_csv(
    './input/gender_age_train.csv',
    index_col='device_id'
)

In [45]:
# Sparse feature block stored as raw CSR components ('data', 'indices',
# 'indptr') in an .npz archive (presumably phone-brand features, judging by
# the file name).
# The archive is opened in a `with` block so its file handle is closed as
# soon as the three arrays have been read into memory.
# NOTE(review): assumes the rows are in the same order as gender_age_train —
# confirm against the code that wrote this file.
with np.load('intermediates/brand_train.npz') as nparray:
    brand_train = csr_matrix(
        (nparray['data'], nparray['indices'], nparray['indptr']),
        # one row per training device, 131 feature columns
        shape=(gender_age_train.shape[0], 131))

In [46]:
# Sparse feature block stored as raw CSR components ('data', 'indices',
# 'indptr') in an .npz archive (presumably device-model features, judging by
# the file name).
# The archive is opened in a `with` block so its file handle is closed as
# soon as the three arrays have been read into memory.
# NOTE(review): assumes the rows are in the same order as gender_age_train —
# confirm against the code that wrote this file.
with np.load('intermediates/device_train.npz') as nparray:
    device_train = csr_matrix(
        (nparray['data'], nparray['indices'], nparray['indptr']),
        # one row per training device, 1599 feature columns
        shape=(gender_age_train.shape[0], 1599))

In [47]:
# Sparse feature block stored as raw CSR components ('data', 'indices',
# 'indptr') in an .npz archive (presumably per-app features, judging by the
# file name).
# The archive is opened in a `with` block so its file handle is closed as
# soon as the three arrays have been read into memory.
# NOTE(review): assumes the rows are in the same order as gender_age_train —
# confirm against the code that wrote this file.
with np.load('intermediates/apps_train.npz') as nparray:
    app_train = csr_matrix(
        (nparray['data'], nparray['indices'], nparray['indptr']),
        # one row per training device, 19237 feature columns
        shape=(gender_age_train.shape[0], 19237))

In [48]:
# Sparse feature block stored as raw CSR components ('data', 'indices',
# 'indptr') in an .npz archive (presumably app-label/category features,
# judging by the file name).
# The archive is opened in a `with` block so its file handle is closed as
# soon as the three arrays have been read into memory.
# NOTE(review): assumes the rows are in the same order as gender_age_train —
# confirm against the code that wrote this file.
with np.load('intermediates/label_train.npz') as nparray:
    label_train = csr_matrix(
        (nparray['data'], nparray['indices'], nparray['indptr']),
        # one row per training device, 492 feature columns
        shape=(gender_age_train.shape[0], 492))

In [49]:
# Join the four sparse feature blocks side by side into one design matrix.
# CSR output is requested because the cross-validation code below selects
# subsets of rows from it.
Xtrain = hstack(
    [
        brand_train,
        device_train,
        app_train,
        label_train,
    ],
    format='csr'
)

In [50]:
# Sanity check: expect one row per training device and
# 131 + 1599 + 19237 + 492 = 21459 feature columns.
Xtrain.shape

(74645, 21459)

In [51]:
# Peek at the label table: indexed by device_id, with gender, age and group columns.
gender_age_train.head()

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


In [52]:
# Map the `group` strings (e.g. 'M32-38') to integer class ids.
# `result_encoder` is kept so the ids can be mapped back to group names.
result_encoder = LabelEncoder()
y = result_encoder.fit_transform(gender_age_train['group'])
# Number of distinct groups, i.e. the width of a predicted-probability row.
nclasses = len(result_encoder.classes_)

In [53]:
def score(clf, random_state=0):
    """Return the out-of-fold log loss of `clf` under 5-fold stratified CV.

    Reads the notebook-level globals `Xtrain` (feature matrix), `y` (integer
    class labels) and `nclasses` (number of distinct labels).

    Parameters
    ----------
    clf : estimator providing `fit(X, y)` and `predict_proba(X)`
        The same object is refitted on every fold, so on return it holds the
        model trained on the last fold.
    random_state : int or None, default 0
        Seed for the fold shuffling. With a fixed seed every call uses the
        same folds, so scores are reproducible and different classifiers are
        compared on identical splits. Pass None to get a fresh random split
        on each call.

    Returns
    -------
    float
        `log_loss(y, pred)`, where each row of `pred` was predicted by the
        model fitted on the folds that did not contain that row.
    """
    # NOTE(review): this is the legacy sklearn.cross_validation API (labels
    # passed to the constructor, splitter iterated directly). Newer
    # scikit-learn releases provide StratifiedKFold in sklearn.model_selection
    # with a different call signature.
    kf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=random_state)
    # Out-of-fold predictions: each row is filled in by the fold that holds
    # it out.
    # Assumes predict_proba returns one column per class in encoded-label
    # order, i.e. every class is present in each training fold — TODO confirm.
    pred = np.zeros((y.shape[0], nclasses))
    for itrain, itest in kf:
        clf.fit(Xtrain[itrain, :], y[itrain])
        pred[itest, :] = clf.predict_proba(Xtrain[itest, :])
    return log_loss(y, pred)

In [56]:
# Baseline: logistic regression with C=0.02 and the library's default solver
# and multi-class handling, evaluated by cross-validated log loss.
# (LogisticRegression is imported in the notebook's top import cell.)
score(LogisticRegression(C=0.02))

2.2806541552022601

In [57]:
# Same C as the baseline, but fitted as a single multinomial model with the
# L-BFGS solver instead of the library defaults.
score(
    LogisticRegression(
        C=0.02,
        solver='lbfgs',
        multi_class='multinomial'
    )
)

2.27373502790073