In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import datetime
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.sparse import hstack,csr_matrix
import random
from sklearn.preprocessing import StandardScaler

In [2]:
file_path = 'C:\\hudsondata\\Machine Learning\\Redhat'
people = pd.read_csv(os.path.join(file_path,'people.csv'), parse_dates = ['date'], dtype={'people_id':np.str, 'char_38':np.int})
train = pd.read_csv(os.path.join(file_path,'act_train.csv'), parse_dates = ['date'], dtype={'people_id':np.str,'activity_id':np.str})
test = pd.read_csv(os.path.join(file_path,'act_test.csv'), parse_dates = ['date'], dtype={'people_id':np.str, 'activity_id':np.str})

# Processing Files

In [3]:
def peoplefile(ppl):
    ppl = ppl.replace({'ppl_':'', 'type ':'', 'group ':''},regex=True)
    bools = ppl.select_dtypes(include=['bool']).columns
    ppl[bools] = ppl[bools].astype(int)
    ppl.drop('char_1', 1, inplace=True)
    return ppl

In [4]:
people = peoplefile(people)

In [5]:
def activity(activityfile):
    activityfile = activityfile.replace({'ppl_':'','type ':''},regex=True)
    activityfile.drop('char_10', 1, inplace=True)
    activityfile.fillna(0,inplace=True)
    return activityfile

In [6]:
train = activity(train)
test = activity(test)

In [7]:
train = train.sort_values(by='people_id')
test = test.sort_values(by='people_id')
people = people.sort_values(by='people_id')

In [8]:
people['newcol']=0
people.ix[150570:,'newcol'] = 1
train = train.merge(people, on='people_id',how='left')
test = test.merge(people,on='people_id',how='left')
y = train['outcome']

# Reduce Dimensions for One-Hot-Encoding and  Label Encoding

In [9]:
columns = list(test.columns)
whole = pd.concat([train[columns],test])

In [10]:
categorical=['group_1','activity_category','char_1','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x',
             'char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']

In [11]:
def reduce_dimen(dataset,column,toreplace):
    for index, i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset

In [56]:
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)

In [13]:
for category in categorical:
    le = LabelEncoder().fit(whole[category])
    whole[category] = le.transform(whole[category])

# Doing a LabelKFold Using People ID

In [14]:
from sklearn.cross_validation import LabelKFold

In [15]:
labels = train['people_id']
def score(clf,Xtrain,y):
    scores =[]
    labelkfold = LabelKFold(labels,n_folds=5)
    pred = np.zeros((y.shape[0],2))
    for itrain, itest in labelkfold:
        Xtr, Xte = Xtrain[itrain, :], Xtrain[itest, :]
        ytr, yte = y[itrain], y[itest]
        clf.fit(Xtr, ytr)
        pred[itest,:] = clf.predict_proba(Xte)        
        # Downsize to one fold only for kernels
        rocscore = roc_auc_score(yte, pred[itest, :][:,1])
        scores.append(rocscore)
    return sum(scores) / float(len(scores))

# One-Hot-Encoding Original Categorical Features

In [16]:
enc = OneHotEncoder(handle_unknown='ignore')
enc =enc.fit(whole[categorical])

In [59]:
train = whole.iloc[:2197291]
test = whole.iloc[2197291:]

In [18]:
train_sparse = enc.transform(train[categorical])
test_sparse = enc.transform(test[categorical])

In [185]:
logreg = LogisticRegression(solver='sag')

In [41]:
score(logreg,train_sparse,y)

0.97368368536209027

# Adding Non-Categorical Original Features

In [20]:
not_categorical = []
for category in train.columns:
    if category not in categorical:
        not_categorical.append(category)

In [21]:
toremove = ['people_id','activity_id','date_x','date_y','char_38','newcol']
for char in toremove:
    not_categorical.remove(char)

In [54]:
Xtrain = hstack((train_sparse, train[not_categorical]),format='csr')
Xtest = hstack((test_sparse, test[not_categorical]),format='csr')

In [55]:
score(logreg,Xtrain,y)

0.97369281076812175

# Looking at High Correlation Non-Categorical Original Features

In [22]:
trainwithoutcome = train.join(y)
trainwithoutcome.drop('newcol',1,inplace=True)

In [23]:
correlation =dict()
for char in not_categorical:
    correlation[char] = np.corrcoef(trainwithoutcome[[char,'outcome']].T)[0,1]

In [24]:
correlation = pd.DataFrame(correlation.items())
correlation.columns = ['feature','corr']
correlation.head()

Unnamed: 0,feature,corr
0,char_19,0.279465
1,char_18,0.211444
2,char_31,0.253803
3,char_30,0.210634
4,char_15,0.263966


In [25]:
correlation = correlation.sort_values(by='corr', ascending=False)

In [26]:
highcorrnoncat = list(correlation['feature'][:5])

In [89]:
Xtrain = hstack((train_sparse, train[highcorrnoncat]),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat]),format='csr')

In [90]:
score(logreg,Xtrain,y)

0.97376969836905825

# Looking at High Correlation Categorical Features

In [127]:
correlation2 =dict()
for char in categorical:
    correlation2[char] = np.corrcoef(trainwithoutcome[[char,'outcome']].T)[0,1]

In [128]:
correlation2 = pd.DataFrame(correlation2.items())
correlation2.columns = ['feature','corr']
correlation2.head()

Unnamed: 0,feature,corr
0,group_1,0.366091
1,char_3_x,-0.012976
2,char_2_y,0.351574
3,char_4_x,-0.016641
4,char_4_y,0.003123


In [129]:
correlation2 = correlation2.sort_values('corr',ascending=False)
highcorrcat = list(correlation2['feature'][:5])

In [96]:
enc2 = OneHotEncoder(handle_unknown='ignore')
enc2 =enc.fit(whole[highcorrcat])

In [97]:
train_sparse2 = enc.transform(train[highcorrcat])
test_sparse2 = enc.transform(test[highcorrcat])

In [98]:
Xtrain = hstack((train_sparse2, train[highcorrnoncat]),format='csr')
Xtest = hstack((test_sparse2, test[highcorrnoncat]),format='csr')

In [99]:
score(logreg,Xtrain,y)



0.97373191330553843

In [100]:
print 'After the results, we will continue to use the original categorical features and the high correlation non-categorical ones'

After the results, we will continue to use the original categorical features and the high correlation non-categorical ones


# Adding Standardized char_38 to the mix

In [27]:
stdscaler = StandardScaler()
char38 = stdscaler.fit_transform(whole['char_38'].reshape(-1,1))
char38 = pd.DataFrame(char38)



In [28]:
trainchar38 = char38[:2197291]
testchar38 = char38[2197291:]

In [104]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38),format='csr')

In [105]:
score(logreg,Xtrain,y)

0.97715652872986214

# Feature Engineering 1 - Standardized Count of Activity per Group/Person

Count of Activity per Group

In [29]:
actcount = train.groupby('group_1')['activity_category'].count().reset_index()
actcount.columns = ['group_1','actcount']
actcounttrain = pd.DataFrame(train.merge(actcount, on = 'group_1', how='left' )['actcount'].reset_index(drop=True))

In [30]:
actcounttest = test.groupby('group_1')['activity_category'].size().reset_index()
actcounttest.columns = ['group_1','actcount']
actcounttest = pd.DataFrame(test.merge(actcounttest, on='group_1', how='left')['actcount'].reset_index(drop=True))

In [102]:
stdscaler = StandardScaler()
actcounttrain = pd.DataFrame(stdscaler.fit_transform(actcounttrain['actcount'].astype('float').reshape(-1,1)))
stdscaler = StandardScaler()
actcounttest = pd.DataFrame(stdscaler.fit_transform(actcounttest['actcount'].astype('float').reshape(-1,1)))

Count of Activity per Person

In [104]:
actcountppl = train.groupby('people_id')['activity_category'].count().reset_index()
actcountppl.columns = ['people_id','actcount']
actcountppl = pd.DataFrame(train.merge(actcountppl, on = 'people_id', how='left' )['actcount'].reset_index(drop=True))

In [105]:
actcountppltest = test.groupby('people_id')['activity_category'].count().reset_index()
actcountppltest.columns = ['people_id','actcount']
actcountppltest = pd.DataFrame(test.merge(actcountppltest, on = 'people_id', how='left' )['actcount'].reset_index(drop=True))

In [106]:
stdscaler = StandardScaler()
actcountppl = pd.DataFrame(stdscaler.fit_transform(actcountppl['actcount'].astype('float').reshape(-1,1)))
stdscaler = StandardScaler()
actcountppltest = pd.DataFrame(stdscaler.fit_transform(actcountppltest['actcount'].astype('float').reshape(-1,1)))

In [142]:
traincountppl = actppl.iloc[:2197291]
testcountppl = actppl.iloc[2197291:]

In [144]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain,actcountppl),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest,actcountppltest),format='csr')

In [145]:
score(logreg,Xtrain,y)

0.96547954607541508

In [33]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest),format='csr')

In [34]:
score(logreg,Xtrain,y)

0.97752081067684871

# Feature Engineering 2 - Standardized Count of People per Group

In [72]:
ppltrain = pd.DataFrame(train.groupby(['group_1','people_id'])['activity_category'].count().reset_index().groupby(['group_1'])['people_id'].count())
ppltrain = ppltrain.reset_index()
ppltrain.columns = ['group_1','pplcount']
ppltrain = pd.DataFrame(train.merge(ppltrain, on = 'group_1', how='left' )['pplcount'].reset_index(drop=True))

In [73]:
ppltest = pd.DataFrame(test.groupby(['group_1','people_id'])['activity_category'].count().reset_index().groupby(['group_1'])['people_id'].count())
ppltest = ppltest.reset_index()
ppltest.columns = ['group_1','pplcount']
ppltest = pd.DataFrame(test.merge(ppltest, on = 'group_1', how='left' )['pplcount'].reset_index(drop=True))

In [75]:
stdscaler = StandardScaler()
ppltrain = pd.DataFrame(stdscaler.fit_transform(ppltrain['pplcount'].reshape(-1,1)))
stdscaler = StandardScaler()
ppltest = pd.DataFrame(stdscaler.fit_transform(ppltest['pplcount'].reshape(-1,1)))



In [76]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain, ppltrain),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest, ppltest),format='csr')

In [77]:
score(logreg,Xtrain,y)

0.9773567060967393

# Feature Engineering 3  - Range of Days per Group/Person

Range of Days per Group

In [81]:
def maxmin(x):
    maxdate = x['date_x'].max()
    mindate = x['date_x'].min()
    return maxdate-mindate

In [84]:
dayspergrp = pd.DataFrame(train.groupby('group_1').apply(maxmin))
dayspergrp.columns = ['dayspergrp']
dayspergrp = pd.DataFrame(dayspergrp['dayspergrp'].astype(np.str))
dayspergrp['dayspergrp'] = [d.split()[0] for d in dayspergrp['dayspergrp']]
dayspergrp = pd.DataFrame(train.merge(dayspergrp.reset_index(), on='group_1', how='left')['dayspergrp'].reset_index(drop=True))

In [89]:
dayspergrptest = pd.DataFrame(test.groupby('group_1').apply(maxmin))
dayspergrptest.columns = ['dayspergrp']
dayspergrptest = pd.DataFrame(dayspergrptest['dayspergrp'].astype(np.str))
dayspergrptest['dayspergrp'] = [d.split()[0] for d in dayspergrptest['dayspergrp']]
dayspergrptest = pd.DataFrame(test.merge(dayspergrptest.reset_index(), on='group_1', how='left')['dayspergrp'].reset_index(drop=True))

In [97]:
stdscaler = StandardScaler()
dayspergrp = pd.DataFrame(stdscaler.fit_transform(dayspergrp[0].astype('float').reshape(-1,1)))
stdscaler = StandardScaler()
dayspergrptest = pd.DataFrame(stdscaler.fit_transform(dayspergrptest['dayspergrp'].astype('float').reshape(-1,1)))

In [100]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain, dayspergrp),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest,dayspergrptest),format='csr')

In [101]:
score(logreg,Xtrain,y)

0.97367583812876313

Range of Days per Person

In [108]:
daysperppl = pd.DataFrame(train.groupby('people_id').apply(maxmin))
daysperppl.columns = ['daysperppl']
daysperppl = pd.DataFrame(daysperppl['daysperppl'].astype(np.str))
daysperppl['daysperppl'] = [d.split()[0] for d in daysperppl['daysperppl']]
daysperppl = pd.DataFrame(train.merge(daysperppl.reset_index(), on='people_id', how='left')['daysperppl'].reset_index(drop=True))

In [109]:
daysperppltest = pd.DataFrame(test.groupby('people_id').apply(maxmin))
daysperppltest.columns = ['daysperppl']
daysperppltest = pd.DataFrame(daysperppltest['daysperppl'].astype(np.str))
daysperppltest['daysperppl'] = [d.split()[0] for d in daysperppltest['daysperppl']]
daysperppltest = pd.DataFrame(test.merge(daysperppltest.reset_index(), on='people_id', how='left')['daysperppl'].reset_index(drop=True))

In [112]:
stdscaler = StandardScaler()
daysperppl = pd.DataFrame(stdscaler.fit_transform(daysperppl['daysperppl'].astype('float').reshape(-1,1)))
stdscaler = StandardScaler()
daysperppltest = pd.DataFrame(stdscaler.fit_transform(daysperppltest['daysperppl'].astype('float').reshape(-1,1)))

In [113]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain,daysperppl),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest,daysperppltest),format='csr')

In [114]:
score(logreg,Xtrain,y)

0.97779183496628974

# Feature Engineering 4 - Leave One Out Categorical Variables

In [126]:
def LeaveOneOut(data1, data2, columnName, useLOO=False):
    grpOutcomes = data1.groupby(columnName)['outcome'].mean().reset_index()
    grpCount = data1.groupby(columnName)['outcome'].count().reset_index()
    grpOutcomes['cnt'] = grpCount.outcome
    grpOutcomes = grpOutcomes[grpOutcomes.cnt > 39]
    grpOutcomes.drop('cnt', inplace=True, axis=1)
    outcomes = data2['outcome'].values
    x = pd.merge(data2[[columnName, 'outcome']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['outcome']
    if(useLOO):
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
        x = x + np.random.normal(0, .05, x.shape[0])
    return x.fillna(x.mean())




In [133]:
LOOtrain = trainwithoutcome[['char_38','group_1','char_2_y','char_13','char_36','outcome']]

In [134]:
lootrain = pd.DataFrame()
for col in LOOtrain.columns:
        if col != 'outcome':
            lootrain[col] = LeaveOneOut(LOOtrain, LOOtrain, col, True).values

In [136]:
LOOtest = test[['char_38','group_1','char_2_y','char_13','char_36']]
LOOtest['outcome'] =0
lootest = pd.DataFrame()
for col in LOOtrain.columns:
    if col != 'outcome':
        lootest[col] = LeaveOneOut(LOOtrain, LOOtest, col, False).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [138]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain,daysperppl,lootrain),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest,daysperppltest,lootest),format='csr')

In [139]:
score(logreg,Xtrain,y)



0.98988868937358221

In [140]:
logreg2 = LogisticRegression(solver='sag',max_iter = 300)

In [141]:
score(logreg2,Xtrain,y)

0.98988877731129021

In [142]:
prediction = logreg.predict_proba(Xtest)[:,1]

In [145]:
logreg = pd.DataFrame()
logreg['activity_id'] = test['activity_id']
logreg['outcome'] = prediction

In [152]:
logreg.to_csv('logreg.csv',index=False)

# Feature Engineering 5 - Average of Days Interval per Group/Person

Average of Days Interval per Person

In [119]:
def interval3(x):
    for num in range(0, len(x)):
        if num!=0:
            x['datedelta'].iloc[num] = x['date_x'].iloc[num]-x['date_x'].iloc[(num-1)]
    x['datedelta'] = x['datedelta'].astype(np.str)
    x['datedelta'] = [d.split()[0] for d in x['datedelta']]
    x['datedelta'] = x['datedelta'].astype(np.int)    
    return x

In [116]:
def average(x):
    ave = x['datedelta'].iloc[1:].mean()
    x['average'] = ave
    return x

In [172]:
train['datedelta']=0
interval = train.sort_values(['people_id','date_x'])
interval = interval.groupby('people_id').apply(interval3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [173]:
test['datedelta']=0
intervaltest = test.sort_values(['people_id','date_x'])
intervaltest = intervaltest.groupby('people_id').apply(interval3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [174]:
interval = interval.groupby('people_id').apply(average)

In [175]:
intervaltest = intervaltest.groupby('people_id').apply(average)

In [176]:
aveppltrain = pd.DataFrame(train.merge(interval[['activity_id','average']], on='activity_id',how='left')['average'].reset_index(drop=True))
aveppltest = pd.DataFrame(test.merge(intervaltest[['activity_id','average']], on='activity_id',how='left')['average'].reset_index(drop=True))

In [206]:
aveppltrain.fillna(0, inplace=True)
aveppltest.fillna(0, inplace=True)
stdscaler = StandardScaler()
aveppltrain = pd.DataFrame(stdscaler.fit_transform(aveppltrain['average'].astype('float').reshape(-1,1)))
stdscaler = StandardScaler()
aveppltest = pd.DataFrame(stdscaler.fit_transform(aveppltest['average'].astype('float').reshape(-1,1)))

In [207]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain,daysperppl,lootrain, aveppltrain),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest,daysperppltest,lootest, aveppltest),format='csr')

In [208]:
score(logreg,Xtrain,y)

0.98948179933979996

# XGB on Final Set of Features

In [209]:
Xtrain = hstack((train_sparse, train[highcorrnoncat],trainchar38,actcounttrain,daysperppl,lootrain),format='csr')
Xtest = hstack((test_sparse, test[highcorrnoncat],testchar38,actcounttest,daysperppltest,lootest),format='csr')

In [210]:
dtrain = xgb.DMatrix(Xtrain,label=y)
dtest = xgb.DMatrix(Xtest)

In [212]:
param = {'max_depth':11, 'eta':0.05, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.86
param['colsample_bytree']= 0.92
param['min_child_weight'] = 0
param['booster'] = "gbtree"
param['gamma'] = 0.005
param['colsample_bylevel'] = 0.9

In [213]:
watchlist  = [(dtrain,'train')]
num_round = 540
early_stopping_rounds=20
bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

Will train until train error hasn't decreased in 20 rounds.
[0]	train-auc:0.988429
[1]	train-auc:0.988727
[2]	train-auc:0.989084
[3]	train-auc:0.989174
[4]	train-auc:0.989228
[5]	train-auc:0.989291
[6]	train-auc:0.989370
[7]	train-auc:0.989295
[8]	train-auc:0.989389
[9]	train-auc:0.989485
[10]	train-auc:0.989378
[11]	train-auc:0.989390
[12]	train-auc:0.989428
[13]	train-auc:0.989524
[14]	train-auc:0.989473
[15]	train-auc:0.989522
[16]	train-auc:0.989620
[17]	train-auc:0.989652
[18]	train-auc:0.989731
[19]	train-auc:0.989764
[20]	train-auc:0.989846
[21]	train-auc:0.989895
[22]	train-auc:0.989903
[23]	train-auc:0.989933
[24]	train-auc:0.989960
[25]	train-auc:0.989992
[26]	train-auc:0.990082
[27]	train-auc:0.990100
[28]	train-auc:0.990125
[29]	train-auc:0.990151
[30]	train-auc:0.990189
[31]	train-auc:0.990191
[32]	train-auc:0.990219
[33]	train-auc:0.990248
[34]	train-auc:0.990243
[35]	train-auc:0.990272
[36]	train-auc:0.990321
[37]	train-auc:0.990312
[38]	train-auc:0.990354
[39]	train-auc

In [215]:
ypred = bst.predict(dtest)

In [220]:
myxgb = pd.DataFrame({'activity_id': test['activity_id'], 'outcome': ypred})

In [224]:
myxgb.to_csv('myxgb.csv',index=False)

# Random Forest of Final Set of Features

In [226]:
from sklearn.ensemble import RandomForestClassifier

In [227]:
rf = RandomForestClassifier(n_estimators = 500, criterion= 'gini', max_depth = 11, random_state = 500)

In [228]:
rfmodel = rf.fit(Xtrain,y)

In [229]:
rfpred = rf.predict_proba(Xtest)

In [232]:
rfpred = rfpred[:,1]

In [233]:
myrf = pd.DataFrame({'activity_id': test['activity_id'], 'outcome': rfpred})

In [235]:
myrf.to_csv('myrf.csv',index=False)