### Experiments with RF classifier

It started as one, but turned into many experiments.

The last clf with 100 trees and min_leaf looks like a good candidate.

In [1]:
#### Imports/setup

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors, 
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MaxAbsScaler, Imputer
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Import CountVectorizer and HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

from sklearn.feature_selection import chi2, SelectKBest

# metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report, log_loss

# unflattener
import python.flat_to_labels as ftl

#### Set up a train-test split making sure we have all labels in both splits
from python.multilabel import multilabel_train_test_split

from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

#### Load the data

In [2]:
# Get data
the_data = pd.read_csv('data/TrainingData.csv', index_col=0)

# take a look
the_data.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [3]:
### bind variable LABELS - these are actually the targets and we're going to one-hot encode them...
LABELS = ['Function',  'Use',  'Sharing',  'Reporting',  'Student_Type',  'Position_Type', 
          'Object_Type',  'Pre_K',  'Operating_Status']

### This turns out to be key.  Submission requires the dummy versions of these vars to be in this order.
LABELS.sort()

# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
the_data[LABELS] = the_data[LABELS].apply(categorize_label, axis=0)

# Print the converted dtypes
print(the_data[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


#### Save the unique labels for each output (category)

In [4]:
# build a dictionary
the_labels = {col : the_data[col].unique().tolist() for col in the_data[LABELS].columns}
# take a look at one entry
the_labels['Use']

['Instruction',
 'NO_LABEL',
 'O&M',
 'Pupil Services & Enrichment',
 'ISPD',
 'Leadership',
 'Business Services',
 'Untracked Budget Set-Aside']

#### Change fraction to suit.
Note: small fractions will have a hard time ensuring labels in both splits.

In [5]:
# downsize it or not
# df = the_data.sample(frac=0.1)
df = the_data

#### Get targets as set of one-hot encoded columns

In [6]:
# name these columns
NUMERIC_COLUMNS = ['FTE', 'Total']

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

#### Setting up a train-test split  for modeling

#### =========== Begin ModRF; use random forest on features from best logistic regression model  ======================

Some things to note about the default CountVectorizer:
1) All strings are downcased
2) The default setting selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).  This means single letter or digit tokens are ignored.
3) If the vectorizer is used to transform another input (e.g. test), any tokens not in the original corpus are ignored.

In [7]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text columns in each row of df to single string """
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(df.columns.tolist())
    text_data = df.drop(to_drop, axis=1)  
    # Replace nans with blanks
    text_data.fillna('', inplace=True)    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

#### For RF, I think it will be fine to use the labels as them come (instead of binarizing).  The probabilities will come out that way.  They may be in a list with one element per target.

Slight change of plans: multilabel_train_test_split only works with binary indicator matrices.  So a quick workaround is to dummy the labels, then do the split.  Then use the indices of the y_train/y_test to get the original ys that we want to use.

For mmll we need the label probabilities as array of shape (num_samples, 104).  Can get this format by calling np.hstack on proba output.

Another ramification: mmll also wants ys in binarized format.  So we need those to do the comparison.

In [8]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the features in the data
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Use all 0s instead of noise: get_numeric_data
get_numeric_data_hack = FunctionTransformer(lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=np.float), validate=False)

In [9]:
### now get the original ys
y_train = df.loc[y_train.index, LABELS]; y_train.shape

(320222, 9)

In [10]:
y_test = df.loc[y_test.index, LABELS]; y_test.shape

(80055, 9)

In [55]:
# Insert random forest clf instead of onevsrest
ModRF = Pipeline([
    ('union', FeatureUnion(transformer_list = [
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data_hack),
            ('imputer', Imputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer(ngram_range=(1,2)))
            # no dimension reduction 
        ]))
    ])),
    ('rf', RandomForestClassifier(n_jobs=-1, n_estimators=100, min_samples_leaf=2, oob_score=True, verbose=5))])

#### Fit  took 6 min with n_jobs=-1 and all default parameters oob_score 0.9644,  log loss 0.1757, agg F1 0.977
#### Fit  took 12 min with n_jobs=-1, 20 trees, oob_score 0.97562,  log loss 0.1401, agg F1 0.977
#### Fit  took 7 min with n_jobs=-1, 50 trees, min_samples_leaf=2 oob_score 0.959,  log loss 0.205, agg F1 0.943
#### Fit  took 20+ min with n_jobs=-1, 100 trees, min_samples_leaf=2 oob_score 0.94,  log loss 0.205, agg F1 0.960; also train/test are very close...

In [56]:
start = timer()
# Fit to the training data
ModRF.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

building tree 1 of 100
building tree 2 of 100building tree 3 of 100building tree 4 of 100


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.3min


building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  8.0min


building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 11.9min finished


fit time: 800.47 seconds


In [57]:
rf = ModRF.steps[-1][1]

In [120]:
len(rf.estimators_)

100

In [None]:
# could get the top 10 features from rf and multiply them to get 100 new features, interactions of the 10 most important.
# if I could figure out how to do it.

In [58]:
np.sort(rf.feature_importances_)[-20:]

array([0.00515112, 0.00539933, 0.00556919, 0.00562661, 0.00566168,
       0.00599308, 0.00606554, 0.00618705, 0.00622022, 0.0062467 ,
       0.00643697, 0.00652136, 0.00654766, 0.00655152, 0.00704131,
       0.00724354, 0.00933722, 0.00974482, 0.010126  , 0.01892051])

In [121]:
rf.oob_score_

0.9359163330439507

In [60]:
# get probas
start = timer()
ModRF_train_probas = ModRF.predict_proba(X_train)
ModRF_test_probas = ModRF.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))

[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   33.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   51.7s finished
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    8.1s


Predict.proba time: 84.49 seconds


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   12.9s finished


#### For log loss we need the probabilities, not the predicted labels

In [122]:
print('log loss on training set: {:0.4f}'.format(
    multi_multi_log_loss(np.hstack(ModRF_train_probas),
                         dummy_labels.loc[y_train.index, :].values, 
                         BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(
    multi_multi_log_loss(np.hstack(ModRF_test_probas),
                         dummy_labels.loc[y_test.index, :].values, 
                         BOX_PLOTS_COLUMN_INDICES)))

log loss on training set: 0.1964
log loss on test set: 0.1956


#### For standard metrics we need the yhats and the ys.

In [62]:
def report_f1(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = f1_score(true[:, target], pred[:, target], average='weighted')
        print('F1 score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average F1 score for all targets : {:.3f}'.format(np.mean(the_scores)))

def report_accuracy(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = accuracy_score(true[:, target], pred[:, target])
        print('Accuracy score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average accuracy score for all targets : {:.3f}'.format(np.mean(the_scores)))
    


In [126]:
y_train.shape, ftl.flat_to_labels(np.hstack(ModRF_train_probas)).shape

((320222,), (320222, 9))

In [64]:
report_f1(y_test.values, ftl.flat_to_labels(np.hstack(ModRF_test_probas)))

report_accuracy(y_test.values, ftl.flat_to_labels(np.hstack(ModRF_test_probas)))

F1 score for target Function: 0.930
F1 score for target Object_Type: 0.970
F1 score for target Operating_Status: 0.979
F1 score for target Position_Type: 0.954
F1 score for target Pre_K: 0.981
F1 score for target Reporting: 0.969
F1 score for target Sharing: 0.954
F1 score for target Student_Type: 0.957


  'precision', 'predicted', average, warn_for)


F1 score for target Use: 0.945
Average F1 score for all targets : 0.960
Accuracy score for target Function: 0.932
Accuracy score for target Object_Type: 0.971
Accuracy score for target Operating_Status: 0.980
Accuracy score for target Position_Type: 0.955
Accuracy score for target Pre_K: 0.982
Accuracy score for target Reporting: 0.969
Accuracy score for target Sharing: 0.955
Accuracy score for target Student_Type: 0.959
Accuracy score for target Use: 0.947
Average accuracy score for all targets : 0.961


In [102]:
BOX_PLOTS_COLUMN_INDICES

[range(0, 37),
 range(37, 48),
 range(48, 51),
 range(51, 76),
 range(76, 79),
 range(79, 82),
 range(82, 87),
 range(87, 96),
 range(96, 104)]