In [1]:
import pandas as pd
# Let pandas display up to 500 columns so wide DataFrames are not truncated
pd.set_option('display.max_columns', 500)
# Star import: every public name exported by pylab lands in the notebook namespace
from pylab import *
from datetime import date

In [45]:
# Read the people, training-activity and test-activity CSV files into DataFrames
people, act_train, act_test = (
    pd.read_csv(csv_name)
    for csv_name in ('people.csv', 'act_train.csv', 'act_test.csv')
)

In [46]:
# Preprocess the people data

# people_id looks like "ppl_<number>": keep only the part after the underscore, as an integer
people['people_id'] = pd.to_numeric(
    people['people_id'].map(lambda raw_id: raw_id.split('_')[1])
).astype(int)

# After people_id, the next 11 columns hold strings; every column after those is handled as Boolean
peopleCols = list(people.columns)
booleanCols = peopleCols[12:]
stringCols = peopleCols[1:12]
del stringCols[3]  # Leave the "date" column out of the string conversion (it is preprocessed later)

In [47]:
# String columns hold values such as "type 10" or "group 45". Nulls are first replaced with the
# placeholder "type 0"; then only the token after the space is kept and stored as an integer.
for col in stringCols:
    people[col] = pd.to_numeric(
        people[col].fillna('type 0').map(lambda label: label.split(' ')[1])
    ).astype(int)

In [48]:
# Cast every column listed in booleanCols to integers (True/False become 1/0), all in one step
people[booleanCols] = people[booleanCols].apply(pd.to_numeric).astype(int)

In [49]:
# Preprocessing the act_train data

# Keep the activity ids and the labels aside (the labels are needed to train the random forest
# later), then remove both columns from the feature frame
train_ids = act_train['activity_id']
train_labels = act_train['outcome']
act_train = act_train.drop(['activity_id', 'outcome'], axis=1)

# people_id: strip the "ppl_" prefix and keep the remaining number as an integer
act_train['people_id'] = pd.to_numeric(
    act_train['people_id'].map(lambda raw_id: raw_id.split('_')[1])
).astype(int)

# Activity ids look like "act1_49312375". Two features are engineered from them:
#   act_id_type - the last character of the part before the underscore, as an integer
#   act_id_num  - the number after the underscore, as an integer
train_act_id_type = train_ids.map(lambda act_id: int(act_id.split("_")[0][-1])).rename("act_id_type")
train_act_id_num = train_ids.map(lambda act_id: int(float(act_id.split("_")[1]))).rename("act_id_num")

# Append the two engineered columns to the right of the existing ones
act_train = pd.concat([act_train, train_act_id_type, train_act_id_num], axis=1)

# Column names for the string-to-integer loop: skip position 1 (the "date" column, which is
# preprocessed later) and the last two positions (the new "act_id_type" and "act_id_num" columns)
actTrainCols = [
    name for position, name in enumerate(act_train.columns[:-2]) if position != 1
]

In [54]:
# Nearly all act_train columns store strings such as "type 40". Nulls are replaced with the
# placeholder "type 0"; then the non-numeric part is discarded and the rest is cast to an integer.
# The slice skips the first entry of actTrainCols.
for col in actTrainCols[1:]:
    act_train[col] = pd.to_numeric(
        act_train[col].fillna('type 0').map(lambda label: label.split(' ')[1])
    ).astype(int)

In [57]:
# Preprocessing the act_test dataset

# Keep the activity ids aside for the submission file, then drop the column; the features
# "act_id_type" and "act_id_num" derived from it are added back below
test_ids = act_test['activity_id']
act_test = act_test.drop(['activity_id'], axis=1)

# people_id: strip the "ppl_" prefix and keep the remaining number as an integer
act_test['people_id'] = pd.to_numeric(
    act_test['people_id'].map(lambda raw_id: raw_id.split('_')[1])
).astype(int)

# Engineer the same two features from the activity ids as for the training data:
#   act_id_type - the last character of the part before the underscore, as an integer
#   act_id_num  - the number after the underscore, as an integer
test_act_id_type = test_ids.map(lambda act_id: int(act_id.split("_")[0][-1])).rename("act_id_type")
test_act_id_num = test_ids.map(lambda act_id: int(float(act_id.split("_")[1]))).rename("act_id_num")

# Append the two engineered columns to the right of the existing ones
act_test = pd.concat([act_test, test_act_id_type, test_act_id_num], axis=1)

# Column names for the string-to-integer loop: skip position 1 (the "date" column, which is
# preprocessed later) and the last two positions (the new "act_id_type" and "act_id_num" columns)
actTestCols = [
    name for position, name in enumerate(act_test.columns[:-2]) if position != 1
]

In [60]:
# Nearly all act_test columns store strings such as "type 40". Nulls are replaced with the
# placeholder "type 0"; then the non-numeric part is discarded and the rest is cast to an integer.
# The slice skips the first entry of actTestCols.
for col in actTestCols[1:]:
    act_test[col] = pd.to_numeric(
        act_test[col].fillna('type 0').map(lambda label: label.split(' ')[1])
    ).astype(int)

In [66]:
# Attach each person's attributes to their activities. A left outer merge keeps every activity
# row, even if its people_id has no matching row in the people dataset.

# Training dataset
features = act_train.merge(people, how='left', on='people_id')

# Testing dataset
test = act_test.merge(people, how='left', on='people_id')

# The labels and activity ids were separated from the activity frames before this merge and are
# paired with the merged rows again later, so the merge must not add rows (which would happen if
# a people_id appeared more than once in people). Fail loudly instead of misaligning them.
assert len(features) == len(act_train), "merge changed the number of training rows"
assert len(test) == len(act_test), "merge changed the number of test rows"

In [67]:
# Compute elapsed time feature for training data: the number of whole days between the activity
# date (date_x, from act_train) and the person's date (date_y, from people). The day count
# replaces date_x in place and date_y is dropped.
# pd.to_datetime is used so this cell does not need the `datetime` module name, which the imports
# cell never binds explicitly (only `date` is imported from it).
train_activity_dates = pd.to_datetime(features['date_x'], format='%Y-%m-%d')
train_people_dates = pd.to_datetime(features['date_y'], format='%Y-%m-%d')
features['date_x'] = (train_activity_dates - train_people_dates).dt.days
features = features.drop(['date_y'], axis=1)

# Visualize training data after full preprocessing
features.head()

Unnamed: 0,people_id,date_x,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,char_9_x,char_10_x,act_id_type,act_id_num,char_1_y,group_1,char_2_y,char_3_y,char_4_y,char_5_y,char_6_y,char_7_y,char_8_y,char_9_y,char_10_y,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,100,788,4,0,0,0,0,0,0,0,0,0,76,2,1734928,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36
1,100,455,2,0,0,0,0,0,0,0,0,0,1,2,2434093,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36
2,100,455,2,0,0,0,0,0,0,0,0,0,1,2,3404049,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36
3,100,766,2,0,0,0,0,0,0,0,0,0,1,2,3651215,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36
4,100,788,2,0,0,0,0,0,0,0,0,0,1,2,4109017,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36


In [68]:
# Compute elapsed time feature for test data: the number of whole days between the activity
# date (date_x, from act_test) and the person's date (date_y, from people). The day count
# replaces date_x in place and date_y is dropped.
# pd.to_datetime is used so this cell does not need the `datetime` module name, which the imports
# cell never binds explicitly (only `date` is imported from it).
test_activity_dates = pd.to_datetime(test['date_x'], format='%Y-%m-%d')
test_people_dates = pd.to_datetime(test['date_y'], format='%Y-%m-%d')
test['date_x'] = (test_activity_dates - test_people_dates).dt.days
test = test.drop(['date_y'], axis=1)

# Visualize test data after full preprocessing
test.head()

Unnamed: 0,people_id,date_x,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,char_9_x,char_10_x,act_id_type,act_id_num,char_1_y,group_1,char_2_y,char_3_y,char_4_y,char_5_y,char_6_y,char_7_y,char_8_y,char_9_y,char_10_y,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,100004,0,1,5,10,5,1,6,1,1,7,4,0,1,249281,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76
1,100004,0,5,0,0,0,0,0,0,0,0,0,682,2,230855,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76
2,10001,0,1,12,1,5,4,6,1,1,13,10,0,1,240724,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90
3,10001,44,1,20,10,5,4,6,1,1,5,5,0,1,83552,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90
4,10001,1,5,0,0,0,0,0,0,0,0,0,3015,2,1043301,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90


In [70]:
# Hold out part of the training data for validation.
# train_test_split and GridSearchCV are imported from sklearn.model_selection: the older
# sklearn.cross_validation and sklearn.grid_search modules were removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

num_test = 0.2  # Fraction of the training rows held out for validation (20%)
X_train, X_test, y_train, y_test = train_test_split(features, train_labels, test_size=num_test, random_state=37)

# Load random forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Train random forest
# NOTE(review): no random_state is passed to the forest, so the fitted model (and any score
# computed from it) can differ between runs even though the split itself is seeded.
clf = RandomForestClassifier(n_estimators=40, n_jobs=-1)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
# Score the model on the held-out part of the training set: take the predicted probability in
# column 1 for each row and report the area under the ROC curve
proba = clf.predict_proba(X_test)
preds = proba[:, 1]
score = roc_auc_score(y_true=y_test, y_score=preds)
print("Area under ROC {0}".format(score))

Area under ROC 0.9995160635349157


In [75]:
# Apply random forest to test dataset, keeping the column-1 probability for each row
test_proba = clf.predict_proba(test)
test_preds = test_proba[:, 1]

# Pair each saved activity_id with its predicted outcome and write the CSV in the format for
# Kaggle submission (no index column)
output = pd.DataFrame(dict(activity_id=test_ids, outcome=test_preds))
output.to_csv('redhat_3features_predict_proba.csv', index=False)