# Train level 0: Bayesian model
Based on:
https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/brand-and-model-based-benchmarks/comments

- V0: Uses 10-fold CV
- V1: Uses a predefined train/validation split (`gender_age_train_cv.csv` / `gender_age_test_cv.csv`) and a combined brand + model estimate

In [63]:
import random
import os
import datetime
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression

%matplotlib inline


In [60]:
# Label embedded in the name of the pickle written at the end of the notebook.
description = 'models_bayes_0_V1'

# Directories: raw input CSVs, derived features, and where this model is stored.
dir_in = 'data_ori'
dir_feat = 'data'
dir_out = 'model_0_bayes'

## Load data

In [4]:
# Read the train and test device tables from the raw input directory.
ga_train_csv = os.path.join(dir_in, 'gender_age_train.csv')
ga_test_csv = os.path.join(dir_in, 'gender_age_test.csv')
gatrain, gatest = pd.read_csv(ga_train_csv), pd.read_csv(ga_test_csv)


In [5]:
# Encode the target `group` labels as integers; `y` holds one code per train row.
group_labels = gatrain.group.values
letarget = LabelEncoder()
letarget.fit(group_labels)
y = letarget.transform(group_labels)
n_classes = len(letarget.classes_)

In [6]:
# Read the device -> brand/model table from the same input directory as the
# other raw files (previously a hard-coded './data_ori/...' path), so that
# changing `dir_in` relocates every input consistently.
phone = pd.read_csv(os.path.join(dir_in, 'phone_brand_device_model.csv'), encoding='utf-8')
# Keep a single row per device so the later left-merges cannot duplicate rows.
phone = phone.drop_duplicates('device_id', keep='first')
phone.head(3)

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4


In [7]:
# Integer-encode the phone brand.
lebrand = LabelEncoder()
lebrand.fit(phone.phone_brand)
phone['brand'] = lebrand.transform(phone.phone_brand)

# Integer-encode the device model; the key is the brand name concatenated
# with the model name, so equal model names under different brands stay apart.
m = phone.phone_brand.str.cat(phone.device_model)
lemodel = LabelEncoder()
lemodel.fit(m)
phone['model'] = lemodel.transform(m)

# Attach the encoded brand/model to every train and test device. A left join
# keeps devices that have no phone record (their brand/model become NaN).
device_features = phone[['device_id', 'brand', 'model']]
Xtrain = gatrain.merge(device_features, how='left', on='device_id')
Xtest = gatest.merge(device_features, how='left', on='device_id')
Xtrain.head(3)

Unnamed: 0,device_id,gender,age,group,brand,model
0,-8076087639492063270,M,35,M32-38,51,843
1,-2897161552818060146,M,35,M32-38,51,843
2,-8260683887967679142,M,35,M32-38,51,843


In [8]:
# Load CV sets
train_cv = pd.read_csv(os.path.join(dir_in, 'gender_age_train_cv.csv'))
test_cv = pd.read_csv(os.path.join(dir_in, 'gender_age_test_cv.csv'))

In [45]:
# `y` is a NumPy array, so `sample_nr` is necessarily used as a row POSITION
# for the targets. Select the feature rows by position as well (`.iloc`)
# instead of by label (`.loc`): the two only coincide while `Xtrain` carries a
# default 0..n-1 index, and rows and targets must never drift apart.
train_rows = train_cv.sample_nr.values
val_rows = test_cv.sample_nr.values

X_train, X_val = Xtrain.iloc[train_rows], Xtrain.iloc[val_rows]
y_train, y_val = y[train_rows], y[val_rows]

In [46]:
# Eyeball ten random training rows as a sanity check on the split.
X_train.sample(n=10)

Unnamed: 0,device_id,gender,age,group,brand,model
1492,-6939393211204340051,F,22,F23-,13,264
42613,4707300503949486411,M,30,M29-31,51,865
21842,-1456321005371064978,M,45,M39+,13,214
12212,-9158734172103205235,F,22,F23-,7,160
27235,3570714957564079372,M,43,M39+,31,730
31731,1585261055807865350,F,37,F33-42,51,859
22800,1886824808792789562,F,41,F33-42,18,493
55268,-9215766592714662253,F,34,F33-42,117,1537
30518,-1516830556977021558,F,24,F24-26,31,730
67501,8724946949735794672,M,23,M23-26,31,665


## Calculate and store model

In [52]:
class GenderAgeGroupProbCombined(object):
    """Blend smoothed per-brand and per-model group frequencies.

    For each of the two key columns named in ``by``, ``fit`` counts how often
    every ``group`` value occurs per key and shrinks those counts towards the
    overall group distribution (the prior) by adding ``prior_weight``
    pseudo-observations of the prior. ``predict_proba`` looks both tables up
    for every row and returns their weighted average.

    Parameters
    ----------
    prior_weight : sequence of two numbers
        Pseudo-count given to the prior for the first and second key column.
    w : sequence of two numbers
        Blending weights of the first and second table in ``predict_proba``.
    by : sequence of two str
        Names of the two key columns (``'brand'`` and ``'model'`` by default).
    """

    def __init__(self, prior_weight=(30,20), w=(1,1.3), by=('brand', 'model')):
        self.prior_weight_brand = prior_weight[0]
        self.prior_weight_model = prior_weight[1]

        self.w_brand = w[0]
        self.w_model = w[1]

        self.by_brand = by[0]
        self.by_model = by[1]

    @staticmethod
    def _smoothed_probs(df, by, prior, prior_weight):
        """Return a (key x group) table of prior-smoothed group probabilities."""
        counts = df.groupby([by, 'group']).size().unstack().fillna(0)
        # Rows with a missing key are dropped by groupby, so a group that only
        # occurs on such rows would be absent here. Give every group of the
        # prior an explicit (zero) count column; otherwise adding the prior
        # below would produce an all-NaN column for it.
        counts = counts.reindex(columns=prior.index, fill_value=0)
        totals = counts.sum(axis=1)
        return counts.add(prior_weight * prior).div(totals + prior_weight, axis=0)

    @staticmethod
    def _lookup(df, by, probs, prior):
        """Return one probability row per row of ``df`` as a NumPy array.

        Keys that are missing or were not seen during ``fit`` get the prior.
        """
        # Reindexing by the key values yields all-NaN rows for unknown keys;
        # fillna with a Series fills each group column with that group's prior.
        found = probs.reindex(df[by].values)
        return found.fillna(prior).values

    def fit(self, df):
        """Estimate the prior and both smoothed tables from ``df``.

        ``df`` must contain a ``group`` column and the two key columns given
        by ``by``. Returns ``self``.
        """
        prior = df['group'].value_counts().sort_index()/df.shape[0]
        self.prior_brand = prior
        self.prior_model = prior.copy()

        self.prob_brand = self._smoothed_probs(df, self.by_brand,
                                               self.prior_brand, self.prior_weight_brand)
        self.prob_model = self._smoothed_probs(df, self.by_model,
                                               self.prior_model, self.prior_weight_model)

        return self

    def predict_proba(self, df):
        """Return an array of shape (len(df), n_groups) of blended probabilities.

        Columns follow the sorted group labels seen during ``fit``. Rows are
        matched by position, so the index of ``df`` (including duplicated
        labels) does not influence the result.
        """
        pred_brand = self._lookup(df, self.by_brand, self.prob_brand, self.prior_brand)
        pred_model = self._lookup(df, self.by_model, self.prob_model, self.prior_model)
        return (pred_brand*self.w_brand + pred_model*self.w_model) / (self.w_brand + self.w_model)

In [53]:
# Bookkeeping values; `n_models` is written to the output metadata as
# 'no_models' and `rs` is recorded as the 'seed' entry of `params`.
n_models = 10
rs = 50

# Hyper-parameters of the combined model: prior pseudo-counts and blending
# weights, each given as a (first key, second key) pair.
params = dict(prior_weight=(30, 20), weight=(1, 1.3), seed=rs)

In [57]:
# Fit on the training part of the split and score on the held-out part.
# The hyper-parameters are passed by keyword because the params key
# ('weight') differs from the constructor argument name ('w').
clf = GenderAgeGroupProbCombined(prior_weight=params['prior_weight'],
                                 w=params['weight']).fit(X_train)
pred = clf.predict_proba(X_val)
cv_score = log_loss(y_val, pred)

models_out = [{'model': clf,
               'score': cv_score,
               'params': params}]
# Function-call form prints the same text on Python 2 and also parses on Python 3.
print(cv_score)

2.38916631078


In [64]:
# File name: <description>_<timestamp>_<cv score>_<second score>.pickle
# NOTE(review): the trailing -1 looks like a placeholder for a score that is
# not available in this notebook - confirm against the consumers of the file.
outputfile = '{}_{}_{:.4f}_{:.4f}.pickle'.format(description,
                                                 datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"),
                                                 cv_score,
                                                 -1)

# Metadata stored next to the fitted model(s).
output = {'script': 'train_model_0_bayesian',
          'features': None,
          'feature_sets': 'phone_brand_device_model.csv',
          'model_params': params,
          'no_models': n_models,
          'cross_validation': {'type': 'gender_age_train_cv.csv'},
          'models': models_out}

# Create the output directory on first use so a fresh checkout does not fail
# at the very last step, after the model has already been trained.
if not os.path.isdir(dir_out):
    os.makedirs(dir_out)

# Classes are pickled by reference: loading this file requires
# GenderAgeGroupProbCombined to be defined or importable in the loading process.
with open(os.path.join(dir_out, outputfile), 'wb') as f:
    pickle.dump(output, f)