In [1]:
gpu=0
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=$gpu

%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
fast_path = '/home/kevin/fastai'
sys.path.append(fast_path)

from fastai.learner import *
from fastai.structured import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [124]:
data_path = '/home/kevin/fastai/kaggle2/DonorsChoose/data/'
results_path = '/home/kevin/fastai/kaggle2/DonorsChoose/results/'

In [3]:
def join_df(left, right, left_on, right_on=None):
    if right_on is None: right_on=left_on
    return left.merge(right, how='left', left_on=left_on, right_on=right_on, suffixes=("", "_y"))

### Load data

In [4]:
dtypes = {'project_is_approved': np.bool,
          'teacher_number_of_previously_posted_projects': np.uint8
         }

In [5]:
train = pd.read_csv(data_path + 'train.csv', dtype=dtypes, 
                    parse_dates=['project_submitted_datetime'])
resources = pd.read_csv(data_path + 'resources.csv', dtype=dtypes)
test = pd.read_csv(data_path + 'test.csv', dtype=dtypes, 
                   parse_dates=['project_submitted_datetime'])
sample_submission = pd.read_csv(data_path+ 'sample_submission.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
resources['description'].fillna(' ', inplace=True)

## Merge training data

In [8]:
val_idx = train.sample(frac=0.1).index.values.tolist()
train['role'] = 'trn'
train.loc[val_idx, 'role'] = 'val'
test['role'] = 'tst'
test['project_is_approved'] = False

In [9]:
all_df = train.append(test)

In [11]:
all_df = join_df(all_df, resources, 'id')

In [12]:
all_df['teacher_prefix'].fillna('Teacher', inplace=True)

## Prep training data

In [14]:
#no text columns
non_text_fields = ['teacher_id', 'teacher_prefix', 'school_state', 'project_submitted_datetime',
                  'project_grade_category', 'project_subject_categories', 'project_subject_subcategories',
                  'teacher_number_of_previously_posted_projects', 'quantity', 'project_is_approved',
                   'price', 'role', 'id']
all_df_nt = all_df[non_text_fields].copy()

In [15]:
all_df_nt.rename(columns={'project_submitted_datetime': 'submitted_'}, inplace=True)

In [16]:
add_datepart(all_df_nt, 'submitted_')

In [17]:
cols = list(all_df_nt.columns)

In [18]:
cat_vars = ['teacher_id',
 'teacher_prefix',
 'school_state',
 'project_grade_category',
 'project_subject_categories',
 'project_subject_subcategories',
 'submitted_Year',
 'submitted_Month',
 'submitted_Week',
 'submitted_Day',
 'submitted_Dayofweek',
 'submitted_Dayofyear',
 'submitted_Is_month_end',
 'submitted_Is_month_start',
 'submitted_Is_quarter_end',
 'submitted_Is_quarter_start',
 'submitted_Is_year_end',
 'submitted_Is_year_start']

In [19]:
for v in cat_vars: 
    all_df_nt[v] = all_df_nt[v].astype('category').cat.as_ordered()
df, _, nas, mapper = proc_df(all_df_nt, ignore_flds=['role', 'id', 'project_is_approved'], do_scale=True)

## Split train, valid, test

In [None]:
trn_df = df[df['role']=='trn'].copy()
trn_id = trn_df['id'].values
trn_y = trn_df['project_is_approved'].values
trn_df.drop(['role', 'id','project_is_approved'], axis=1, inplace=True)

In [21]:
val_df = df[df['role']=='val'].copy()
val_id = val_df['id'].values
val_y = val_df['project_is_approved'].values
val_df.drop(['role', 'id','project_is_approved'], axis=1, inplace=True)

In [22]:
tst_df = df[df['role']=='tst'].copy()
tst_id = tst_df['id'].values
tst_df.drop(['role', 'id','project_is_approved'], axis=1, inplace=True)

In [138]:
#checks
print(len(trn_df)+len(val_df)+len(tst_df)==len(df))
print(len(trn_y)==len(trn_df))
print(len(set(list(tst_id))) == len(test))

True
True
True


## Random Forest

In [34]:
x = np.array(trn_df, dtype=np.float32) #speeds up rf in scikit-learn

In [90]:
m_rf = RandomForestClassifier(n_estimators = 10, n_jobs = 8, min_samples_leaf=30)

In [91]:
%time m_rf.fit(x, trn_y)

CPU times: user 20.3 s, sys: 6.77 s, total: 27.1 s
Wall time: 3.62 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [92]:
m_rf.score(trn_df, trn_y)

0.8721325595225888

In [93]:

m_rf.score(val_df, val_y)

0.7789809514872853

In [87]:
rf_feat_importance(m_rf, val_df)

Unnamed: 0,cols,imp
0,teacher_id,0.136592
21,submitted_Elapsed,0.130389
6,teacher_number_of_previously_posted_projects,0.098927
2,school_state,0.093329
5,project_subject_subcategories,0.083228
14,submitted_Dayofyear,0.075757
12,submitted_Day,0.071771
4,project_subject_categories,0.069052
8,price,0.052813
13,submitted_Dayofweek,0.046455


In [88]:
#validation
ev = pd.DataFrame()
ev['y'] = val_y
ev['id'] = val_id
ev['preds'] = m_rf.predict_proba(val_df)[:,-1]
gev = ev.groupby(['id', 'y'])['preds'].mean()
gev = pd.DataFrame(gev).reset_index()
metrics.roc_auc_score(gev['y'].values, gev['preds'].values)

0.5941990286346086

In [103]:
#train
ev = pd.DataFrame()
ev['y'] = trn_y
ev['id'] = trn_id
ev['preds'] = m_rf.predict_proba(trn_df)[:,-1]
gev = ev.groupby(['id', 'y'])['preds'].mean()
gev = pd.DataFrame(gev).reset_index()
metrics.roc_auc_score(gev['y'].values, gev['preds'].values)

0.8963326549544774

# Submission

In [131]:
#test
tst = pd.DataFrame()
tst['id'] = tst_id
tst['project_is_approved'] = m_rf.predict_proba(tst_df)[:,-1]
res = tst.groupby(['id'])['project_is_approved'].mean()
res = pd.DataFrame(res).reset_index()
set(res['id'].values) == set(sample_submission['id'].values) 

True

In [133]:
sample_submission = pd.read_csv(data_path+ 'sample_submission.csv')
sample_submission.drop('project_is_approved', axis=1, inplace=True)

In [134]:
sample_submission = join_df(sample_submission, res, 'id')

In [136]:
fname = 'simple_rf.csv.gz'
sample_submission.to_csv(results_path+ fname, index=False, compression='gzip')
# competition= 'favorita-grocery-sales-forecasting'
res = results_path + fname

In [137]:
!kaggle competitions submit -c 'donorschoose-application-screening' -f $res -m 'simple rf val score .77'

Successfully submitted to DonorsChoose.org Application Screening