In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
%matplotlib inline
#import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

In [2]:
datadir = 'input/'
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
                     index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv(os.path.join(datadir,'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [3]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

In [4]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

Brand features: train shape (74645, 131), test shape (112071, 131)


In [5]:
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [6]:
gatrain['nid_order'] = gatrain.trainrow/74645.0
gatest['nid_order'] = gatest.testrow/112071.0

In [34]:
gatrain['to_group'] = gatrain.gender.astype(str)+'.'+gatrain.age.astype(str)+'.'+gatrain.brand.astype(str)+'.'+gatrain.model.astype(str)

In [36]:
gatrain.to_group.unique().shape

(22286,)

In [50]:
def consecutive(data, stepsize=1):
    return np.split(data, np.where(np.diff(data) != stepsize)[0]+1)

In [79]:
b = (consecutive(gatrain.groupby('to_group').get_group('M.35.51.843').row))

In [89]:
c = []
for a in b:
    if a.shape[0]>1:
        c.append(a)

In [94]:
dfff = gatrain.groupby('to_group').get_group('M.35.51.843')

In [122]:
dfff.ix[c[2].index].age.unique()[0]

35

In [92]:
c[2]

device_id
 7730336322330977347    30249
-7930822997577387097    30250
-2388870181361346627    30251
-1868895392123057059    30252
-5173448531969194156    30253
 3611133061034895936    30254
Name: row, dtype: int64

In [None]:
a = np.array([0, 47, 48, 49, 50, 97, 98, 99])
consecutive(a)

In [None]:
def fn(df):
    

In [49]:
gatrain.groupby('to_group').get_group('M.35.51.843')

Unnamed: 0_level_0,gender,age,group,row,brand,model,nid_order,to_group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-8076087639492063270,M,35,M32-38,0,51,843,0.0,M.35.51.843
-2897161552818060146,M,35,M32-38,1,51,843,1.3e-05,M.35.51.843
-8260683887967679142,M,35,M32-38,2,51,843,2.7e-05,M.35.51.843
-3004255342038425026,M,35,M32-38,1503,51,843,0.020135,M.35.51.843
-3856099721241843282,M,35,M32-38,7834,51,843,0.10495,M.35.51.843
3785807096413589862,M,35,M32-38,19063,51,843,0.255382,M.35.51.843
-6817867161853542033,M,35,M32-38,19064,51,843,0.255396,M.35.51.843
-5912925350327253443,M,35,M32-38,25616,51,843,0.343171,M.35.51.843
-6583754955843663064,M,35,M32-38,27909,51,843,0.37389,M.35.51.843
7730336322330977347,M,35,M32-38,30249,51,843,0.405238,M.35.51.843


In [45]:
gatrain.rename(columns = {'trainrow':'row'},inplace=True)
gatest.rename(columns = {'testrow':'row'},inplace=True)

In [48]:
gatrain.head(3)

Unnamed: 0_level_0,gender,age,group,row,brand,model,nid_order,to_group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-8076087639492063270,M,35,M32-38,0,51,843,0.0,M.35.51.843
-2897161552818060146,M,35,M32-38,1,51,843,1.3e-05,M.35.51.843
-8260683887967679142,M,35,M32-38,2,51,843,2.7e-05,M.35.51.843


In [16]:
gatrain[(gatrain.brand==51)&(gatrain.model==865)&(gatrain.gender=='F')&(gatrain.age==23)].sort('trainrow')

  if __name__ == '__main__':


Unnamed: 0_level_0,gender,age,group,trainrow,brand,model,nid_order
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1514522073062744223,F,23,F23-,1747,51,865,0.023404
6470880409257242515,F,23,F23-,1748,51,865,0.023418
-4419870558975286229,F,23,F23-,2337,51,865,0.031308
5337062131783056211,F,23,F23-,2338,51,865,0.031322
1468360098941880503,F,23,F23-,8435,51,865,0.113002
-7545531996316504574,F,23,F23-,14189,51,865,0.190086
-4592269048894431292,F,23,F23-,20168,51,865,0.270186
-5170075361247382687,F,23,F23-,21862,51,865,0.29288
7307915610732339838,F,23,F23-,28550,51,865,0.382477
1875747693228750338,F,23,F23-,33671,51,865,0.451082


In [137]:
aa = gatest[(gatest.brand==51)&(gatest.model==865)&(gatest.nid_order>0.022404)&(gatest.nid_order<0.024404)].sort('row')

  if __name__ == '__main__':


In [142]:
aa

Unnamed: 0_level_0,row,brand,model,nid_order
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-3623170018309118233,2525,51,865,0.02253
2487686740931125480,2586,51,865,0.023075
5088294672372642064,2615,51,865,0.023333
546580604917490299,2633,51,865,0.023494
-7244054779945689026,2634,51,865,0.023503
-8143541372720077385,2635,51,865,0.023512
6720146428478292796,2636,51,865,0.023521
-7149135054462404365,2637,51,865,0.02353
-8639621067774183392,2693,51,865,0.024029
2995406478677868061,2702,51,865,0.02411


In [134]:
minn = []
indd = []

In [141]:
n = [20, 15, 2, 20]
n.index(min(n))

2

In [57]:
gatest[(gatest.brand==51)&(gatest.model==865)]

Unnamed: 0_level_0,testrow,brand,model,nid_order
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
289797889702373958,6,51,865,0.000054
-402874006399730161,7,51,865,0.000062
9097600484609173263,11,51,865,0.000098
2693830763629647823,39,51,865,0.000348
1204441603415701350,110,51,865,0.000982
-8238621464594527487,114,51,865,0.001017
-909754136142754671,146,51,865,0.001303
6401864917272734079,167,51,865,0.001490
-4853711190831052408,175,51,865,0.001562
7819211934480182530,231,51,865,0.002061


In [None]:
Xtrain = hstack((Xtr_brand, Xtr_model), format='csr')
Xtest =  hstack((Xte_brand, Xte_model), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

In [143]:
bb = aa

In [145]:
bb['gender'] = 'X'
bb['age'] = 0
bb

Unnamed: 0_level_0,row,brand,model,nid_order,gender,age
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-3623170018309118233,2525,51,865,0.02253,X,0
2487686740931125480,2586,51,865,0.023075,X,0
5088294672372642064,2615,51,865,0.023333,X,0
546580604917490299,2633,51,865,0.023494,X,0
-7244054779945689026,2634,51,865,0.023503,X,0
-8143541372720077385,2635,51,865,0.023512,X,0
6720146428478292796,2636,51,865,0.023521,X,0
-7149135054462404365,2637,51,865,0.02353,X,0
-8639621067774183392,2693,51,865,0.024029,X,0
2995406478677868061,2702,51,865,0.02411,X,0


In [155]:
bb.ix[[6720146428478292796,2995406478677868061]].gender
bb.ix[[6720146428478292796,2995406478677868061]].age = 24

In [162]:
bb[bb.index in [6720146428478292796,2995406478677868061]]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [163]:
cc = pd.Series('X',index=bb.index)

In [165]:
dd = pd.Series(0,index=bb.index)

In [169]:
cc[[6720146428478292796,2995406478677868061]] = 'F'
dd[[6720146428478292796,2995406478677868061]] = 24

In [171]:
dd

device_id
-3623170018309118233     0
 2487686740931125480     0
 5088294672372642064     0
 546580604917490299      0
-7244054779945689026     0
-8143541372720077385     0
 6720146428478292796    24
-7149135054462404365     0
-8639621067774183392     0
 2995406478677868061    24
-7537310668694706565     0
-5127371964891533947     0
 8076883275986912427     0
 3207012926517067353     0
dtype: int64

In [172]:
df = pd.DataFrame({'a':[1, 2], 'b':[3, 4]})  # see EDIT below
s = pd.Series({'s1':5, 's2':6})

In [173]:
df

Unnamed: 0,a,b
0,1,3
1,2,4


In [174]:
s

s1    5
s2    6
dtype: int64