In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.datasets import dump_svmlight_file

# Load Data

In [3]:
datadir = 'C:\hudsondata\Machine Learning\TalkingData'
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
                     index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv(os.path.join(datadir,'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [4]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

# One Hot Encode Brand

In [5]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.brand)))
print 'The shape of Xtr_brand is', Xtr_brand.shape, 'the shape of Xte_brand is', Xte_brand.shape

The shape of Xtr_brand is (74645, 131) the shape of Xte_brand is (112071, 131)


In [6]:
groupencoder = LabelEncoder().fit(gatrain['group'])
gatrain['classes'] = groupencoder.transform(gatrain['group'])

# One Hot Encode Model

In [7]:
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.model)))
print 'The shape of Xtr_model is', Xtr_model.shape, 'the shape of Xte_model is', Xte_model.shape

The shape of Xtr_model is (74645, 1667) the shape of Xte_model is (112071, 1667)


# One Hot Encode App ID's 

In [8]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,


In [9]:
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), 
                      shape=(gatrain.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), 
                      shape=(gatest.shape[0],napps))
print 'The shape of Xtr_app is', Xtr_app.shape, 'the shape of Xte_app is', Xte_app.shape

The shape of Xtr_app is (74645, 19237) the shape of Xte_app is (112071, 19237)


In [10]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

# One Hot Encode App Labels

In [11]:
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,
2,-9222956879900151005,126,1,21594.0,
3,-9222956879900151005,138,2,21594.0,
4,-9222956879900151005,147,2,21594.0,


In [12]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), 
                      shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), 
                      shape=(gatest.shape[0],nlabels))
print 'The shape of Xtr_label is', Xtr_label.shape, 'the shape of Xte_label is', Xte_label.shape

The shape of Xtr_label is (74645, 492) the shape of Xte_label is (112071, 492)


In [13]:
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)

# Feature Engineering - Tfidf App Labels

In [15]:
tfidf = TfidfTransformer()
devicelabelstfidf = devicelabels.groupby(['device_id','label'])['size'].agg(['sum']).unstack().fillna(0)
transformedlabels = tfidf.fit_transform(devicelabelstfidf)
transformedlabels = pd.DataFrame(transformedlabels.toarray())
dev_id = devicelabels.groupby('device_id')['size'].size().reset_index()
dev_id.drop(0,1,inplace=True)
transformedlabels = dev_id.join(transformedlabels)
transformedlabels = transformedlabels.merge(gatrain.reset_index()[['trainrow','device_id']], how='left',on='device_id').merge(gatest.reset_index()[['testrow','device_id']], how='left', on='device_id')

In [16]:
f = transformedlabels.dropna(subset=['trainrow'])
f.drop(['testrow','device_id'], axis=1, inplace=True)
f.set_index('trainrow',inplace=True)
f.sort_index(inplace=True)
new_index=np.arange(0,74645)
f = f.reindex(new_index).fillna(0)
Xtr_tfidflabel = csr_matrix(f)
g=transformedlabels.dropna(subset=['testrow'])
g.drop(['trainrow','device_id'], axis=1, inplace=True)
g.set_index('testrow',inplace=True)
g.sort_index(inplace=True)
new_index = np.arange(0,112071)
g = g.reindex(new_index).fillna(0)
Xte_tfidflabel = csr_matrix(g)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
events['hour'] = events['timestamp'].map(lambda x:pd.to_datetime(x).hour)
events['hourbin'] = [1 if ((x>=1)&(x<=6)) else 2 if ((x>=7)&(x<=12)) else 3 if ((x>=13)&(x<=18)) else 4 for x in events['hour']]

# Feature Engineering - Tfidf App Usage Hour

In [20]:
tfidf = TfidfTransformer()
hourbintfidf = events.groupby(['device_id','hourbin'])['hourbin'].agg(['size']).unstack().fillna(0)
hourbintfidf = tfidf.fit_transform(hourbintfidf)
hourbintfidf = pd.DataFrame(hourbintfidf.toarray())
dev_id = events.groupby('device_id').size().reset_index()
dev_id.drop(0,1,inplace=True)
hourbintfidf = dev_id.join(hourbintfidf)
hourbintfidf = hourbintfidf.merge(gatrain.reset_index()[['trainrow','device_id']], how='left',on='device_id').merge(gatest.reset_index()[['testrow','device_id']], how='left', on='device_id')

In [21]:
f = hourbintfidf.dropna(subset=['trainrow'])
f.drop(['testrow','device_id'], axis=1, inplace=True)
f.set_index('trainrow',inplace=True)
f.sort_index(inplace=True)
new_index=np.arange(0,74645)
f = f.reindex(new_index).fillna(0)
Xtr_tfidfhourbin = csr_matrix(f)
g=hourbintfidf.dropna(subset=['testrow'])
g.drop(['trainrow','device_id'], axis=1, inplace=True)
g.set_index('testrow',inplace=True)
g.sort_index(inplace=True)
new_index = np.arange(0,112071)
g = g.reindex(new_index).fillna(0)
Xte_tfidfhourbin = csr_matrix(g)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Stack All Features

In [39]:
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label,Xtr_tfidfhourbin,Xtr_tfidflabel), format='csr')
Xtest =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label,Xte_tfidfhourbin,Xte_tfidflabel), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

All features: train shape (74645, 22023), test shape (112071, 22023)


In [43]:
selector = SelectKBest(chi2, k=8000).fit(Xtrain, y)

In [44]:
Xtrainkb = selector.transform(Xtrain)
Xtestkb = selector.transform(Xtest)

In [45]:
print 'Xtrainkb shape is', Xtrainkb.shape, 'Xtestkb shape is', Xtestkb.shape

Xtrainkb shape is (74645, 8000) Xtestkb shape is (112071, 8000)


In [46]:
def score(clf, random_state = 0):
    kf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=random_state)
    pred = np.zeros((y.shape[0],nclasses))
    for itrain, itest in kf:
        Xtr, Xte = Xtrainkb[itrain, :], Xtrainkb[itest, :]
        ytr, yte = y[itrain], y[itest]
        clf.fit(Xtr, ytr)
        pred[itest,:] = clf.predict_proba(Xte)
        # Downsize to one fold only for kernels
        return log_loss(yte, pred[itest, :])
    print('')
    return log_loss(y, pred)

In [47]:
score(LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs'))

2.2710716974370682