### What can we do with a random forest classifier if we start with the best features, judged by best performance with logistic regression?

#### Okay, since overfitting seems to be the problem, let's not dive so deep.  How far will we need to come up to get oob/train to at least be close to test?

In [1]:
#### Imports/setup

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MaxAbsScaler, Imputer
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Import CountVectorizer and HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

from sklearn.feature_selection import chi2, SelectKBest

# metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report

# unflattener
import python.flat_to_labels as ftl

#### Set up a train-test split making sure we have all labels in both splits
from python.multilabel import multilabel_train_test_split

from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

#### Load the data

In [2]:
# Get data
the_data = pd.read_csv('data/TrainingData.csv', index_col=0)

# take a look
the_data.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [3]:
# LABELS names the nine target columns; they get one-hot encoded further down.
# Sorting alphabetically turns out to be key: the submission requires the dummy
# versions of these variables to be in this order.
LABELS = sorted([
    'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
    'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
])


def categorize_label(x):
    """Return x cast to the pandas 'category' dtype."""
    return x.astype('category')


# Cast every target column to a categorical dtype, one column at a time
the_data[LABELS] = the_data[LABELS].apply(categorize_label, axis=0)

# Show the converted dtypes
print(the_data[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


#### Save the unique labels for each output (category)

In [4]:
# Map each target column to the list of distinct values it takes
the_labels = {target: the_data[target].unique().tolist() for target in LABELS}
# Peek at one entry
the_labels['Use']

['Instruction',
 'NO_LABEL',
 'O&M',
 'Pupil Services & Enrichment',
 'ISPD',
 'Leadership',
 'Business Services',
 'Untracked Budget Set-Aside']

In [5]:
[(col, len(the_labels[col])) for col in the_data[LABELS].columns]

[('Function', 37),
 ('Object_Type', 11),
 ('Operating_Status', 3),
 ('Position_Type', 25),
 ('Pre_K', 3),
 ('Reporting', 3),
 ('Sharing', 5),
 ('Student_Type', 9),
 ('Use', 8)]

#### Change fraction to suit.
Note: small fractions will have a hard time ensuring labels in both splits.

In [6]:
# downsize it or not
# df = the_data.sample(frac=0.5, random_state=99)
df = the_data

#### Get targets as set of one-hot encoded columns

In [7]:
# name these columns
NUMERIC_COLUMNS = ['FTE', 'Total']

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

#### Setting up a train-test split  for modeling

#### =========== Begin ModRF; use random forest on features from best logistic regression model  ======================

Some things to note about the default CountVectorizer:
1) All strings are lowercased
2) The default setting selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).  This means single letter or digit tokens are ignored.
3) If the vectorizer is used to transform another input (e.g. test), any tokens not in the original corpus are ignored.

In [8]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """Join all text columns in each row of df into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns, and possibly numeric/label columns.
    to_drop : list of str
        Names of non-text columns to leave out; names absent from df are
        ignored.  Defaults to NUMERIC_COLUMNS + LABELS as bound when this
        function is defined.

    Returns
    -------
    pandas.Series
        One space-separated string per row, indexed like df.
    """
    # Only drop the requested columns that are actually present in df
    unwanted = set(to_drop)
    present = [col for col in df.columns if col in unwanted]
    # drop() and fillna() each return a new frame, so the caller's df is untouched
    text_data = df.drop(present, axis=1).fillna('')
    # Cast to str so a stray non-string cell (e.g. a number inside a mixed-type
    # column) cannot make str.join raise a TypeError
    text_data = text_data.astype(str)
    # Join all text items in a row with a space in between
    return text_data.apply(lambda row: " ".join(row), axis=1)

#### For RF, I think it will be fine to use the labels as they come (instead of binarizing).  The probabilities will come out that way.  They may be in a list with one element per target.

Slight change of plans: multilabel_train_test_split only works with binary indicator matrices.  So a quick workaround is to dummy the labels, then do the split.  Then use the indices of the y_train/y_test to get the original ys that we want to use.

For mmll we need the label probabilities as array of shape (num_samples, 104).  Can get this format by calling np.hstack on proba output.

Another ramification: mmll also wants ys in binarized format.  So we need those to do the comparison.

In [9]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels; the split helper below is fed this
# binary indicator matrix rather than the raw label columns
dummy_labels = pd.get_dummies(df[LABELS])

# Get the features in the data: every column that is not a target
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets (0.2 is the size argument passed to the
# project helper; the seed is fixed so the split is repeatable)
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Use all 0s instead of noise: get_numeric_data
# The builtin float replaces the np.float alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype (float64) is the same.
get_numeric_data_hack = FunctionTransformer(lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=float), validate=False)

In [10]:
y_train_bin = y_train; y_test_bin = y_test

In [11]:
y_train_bin.shape

(320222, 104)

In [12]:
np.sort(y_train_bin.iloc[:,0:37].sum(axis=0))

array([   21,    37,    75,    95,   233,   269,   273,   401,   481,
         615,  1061,  1134,  1453,  1551,  1753,  1824,  2115,  2299,
        2324,  2642,  3546,  3589,  3979,  4727,  6056,  6687,  8593,
       10387, 11303, 11466, 15331, 15588, 15795, 15877, 47722, 49803,
       69117], dtype=int64)

In [13]:
### now get the original ys
y_train = df.loc[y_train_bin.index, LABELS]; y_train.shape

(320222, 9)

In [14]:
y_train['Function'].value_counts()

Teacher Compensation                               69117
Substitute Compensation                            49803
NO_LABEL                                           47722
Aides Compensation                                 15877
Instructional Materials & Supplies                 15795
Facilities & Maintenance                           15588
Professional Development                           15331
Student Transportation                             11466
Food Services                                      11303
School Administration                              10387
Enrichment                                          8593
Extended Time & Tutoring                            6687
Curriculum Development                              6056
Physical Health & Services                          4727
Social & Emotional                                  3979
Library & Media                                     3589
Special Population Program Management & Support     3546
Data Processing & Information S

In [15]:
y_test = df.loc[y_test_bin.index, LABELS]; y_test.shape

(80055, 9)

In [16]:
(y_test.index == y_test_bin.index).all()

True

In [17]:
y_test.shape, y_test_bin.shape

((80055, 9), (80055, 104))

In [18]:
(y_train.index == y_train_bin.index).all()

True

In [19]:
y_train.shape, y_train_bin.shape

((320222, 9), (320222, 104))

In [20]:
features = Pipeline([('selector', get_text_data), 
                     ('vectorizer', CountVectorizer(ngram_range=(1,2)))]).fit_transform(X_train)

In [21]:
type(features)

scipy.sparse.csr.csr_matrix

In [22]:
features.shape

(320222, 31010)

In [116]:
# Random forest classifier as the final step, in place of one-vs-rest
ModRF = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        # Numeric branch: the selector emits zeros for the numeric columns,
        # so the imputer has nothing to fill in
        ('numeric_features', Pipeline(steps=[
            ('selector', get_numeric_data_hack),
            ('imputer', Imputer()),
        ])),
        # Text branch: unigram and bigram counts, no dimension reduction
        ('text_features', Pipeline(steps=[
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ])),
    ])),
    ('rf', RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=16,
        oob_score=True,
        random_state=42,
        n_jobs=3,
        verbose=1,
    )),
])

#### Fit  took 5 min without n_jobs=-1 and all default parameters
* 100 trees with minleaf 10 took ~300 sec
* 200 trees with minleaf 10 took 500 sec
* 200 trees with minleaf 5 took 800 sec
* 200 trees with minleaf 2 took 1626 sec 96% oob
* 200 trees with minleaf 1 took 7.6k sec 96% oob

min leaf 1 doesn't buy much for all that extra time.
('rf', RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42, max_features=100,
                                  min_samples_leaf = 4, n_jobs=-1, verbose=2))])
650sec; oob 92

('rf', RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42, max_features=100,
                                  min_samples_leaf = 1, n_jobs=-1, verbose=2))])
1200 sec; oob 92.5



In [117]:
start = timer()
# Fit to the training data
ModRF.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   55.8s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:  2.1min finished


fit time: 206.58 seconds


In [118]:
the_clf = ModRF.steps[-1][1]

In [119]:
# The '%' format type multiplies by 100 and rounds while formatting, which
# avoids float artefacts (e.g. 55.809999999999995) from round(x, 4) * 100
print('Out-of-bag evaluation score: {:.2%}'.format(the_clf.oob_score_))

Out-of-bag evaluation score: 90.89%


In [120]:
# get probas
start = timer()
ModRF_train_probas = ModRF.predict_proba(X_train)
ModRF_test_probas = ModRF.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   26.4s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:   58.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    6.3s


Predict.proba time: 91.53 seconds


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:   14.4s finished


#### For log loss we need the probabilities, not the predicted labels

In [121]:
np.hstack(ModRF_train_probas).shape, dummy_labels.loc[y_train.index, :].values.shape

((320222, 104), (320222, 104))

In [122]:
# Training set: stack the per-target probability arrays side by side and score
# them against the matching rows of the one-hot labels
train_log_loss = multi_multi_log_loss(
    np.hstack(ModRF_train_probas),
    dummy_labels.loc[y_train.index, :].values,
    BOX_PLOTS_COLUMN_INDICES,
)
print('log loss on training set: {:0.4f}'.format(train_log_loss))

# Test set: same computation on the held-back rows
test_log_loss = multi_multi_log_loss(
    np.hstack(ModRF_test_probas),
    dummy_labels.loc[y_test.index, :].values,
    BOX_PLOTS_COLUMN_INDICES,
)
print('log loss on test set: {:0.4f}'.format(test_log_loss))

log loss on training set: 0.3845
log loss on test set: 0.3877


#### For standard metrics we need the yhats and the ys.

In [50]:
def _report_scores(true, pred, scorer, per_target_msg, average_msg):
    """Print scorer(true, pred) for each column in LABELS order, then the mean."""
    scores = []
    for idx, name in enumerate(LABELS):
        value = scorer(true[:, idx], pred[:, idx])
        print(per_target_msg.format(name, value))
        scores.append(value)
    print(average_msg.format(np.mean(scores)))


def report_f1(true, pred):
    """Print the weighted F1 score of each target column and their average.

    true and pred are 2-D arrays with one column per entry of LABELS.
    """
    _report_scores(true, pred,
                   lambda t, p: f1_score(t, p, average='weighted'),
                   'F1 score for target {}: {:.3f}',
                   'Average F1 score for all targets : {:.3f}')


def report_accuracy(true, pred):
    """Print the accuracy score of each target column and their average.

    true and pred are 2-D arrays with one column per entry of LABELS.
    """
    _report_scores(true, pred,
                   accuracy_score,
                   'Accuracy score for target {}: {:.3f}',
                   'Average accuracy score for all targets : {:.3f}')

In [63]:
# ftl wants ndarray, not pd.Dataframe
the_ys = ftl.flat_to_labels(np.hstack(ModRF_test_probas))

In [52]:
y_test.shape

(80055, 9)

In [53]:
ftl.flat_to_labels(np.hstack(ModRF_test_probas)).shape

(80055, 9)

In [64]:
report_f1(y_test.values, ftl.flat_to_labels(np.hstack(ModRF_test_probas)))

F1 score for target Function: 0.947
F1 score for target Object_Type: 0.978
F1 score for target Operating_Status: 0.984
F1 score for target Position_Type: 0.969
F1 score for target Pre_K: 0.987
F1 score for target Reporting: 0.977
F1 score for target Sharing: 0.967
F1 score for target Student_Type: 0.970
F1 score for target Use: 0.962
Average F1 score for all targets : 0.971


In [65]:
report_accuracy(y_test.values, ftl.flat_to_labels(np.hstack(ModRF_test_probas)))

Accuracy score for target Function: 0.948
Accuracy score for target Object_Type: 0.978
Accuracy score for target Operating_Status: 0.984
Accuracy score for target Position_Type: 0.970
Accuracy score for target Pre_K: 0.987
Accuracy score for target Reporting: 0.977
Accuracy score for target Sharing: 0.968
Accuracy score for target Student_Type: 0.970
Accuracy score for target Use: 0.963
Average accuracy score for all targets : 0.972


#### =================================================================================================

In [36]:
# Load the holdout data: holdout
### Over here the file is TestData.csv
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings (which the text-joining step cannot handle).
holdout = pd.read_csv('data/TestData.csv', index_col=0, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [37]:
start = timer()
# Generate predictions: predictions
ModRF_predictions = ModRF.predict_proba(holdout)
end = timer()
print('predict time: {} seconds'.format(end - start))

predict time: 4.806062119107651 seconds


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    2.3s finished


####  Okay, that's got it.

In [123]:
# Column names for the output file: the one-hot label names, with '__'
# between the target name and the class name
submission_columns = pd.get_dummies(df[LABELS], prefix_sep='__').columns

# One row per holdout record, one probability column per (target, class) pair
pred_ModRF = pd.DataFrame(data=np.hstack(ModRF_predictions),
                          index=holdout.index,
                          columns=submission_columns)

pred_ModRF.to_csv('pred_RF_16_100.csv')

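#### Score: 2.0893 - that's not great.  Caveat: the holdout predictions written above appear to come from the In [37] run (its log shows 30 trees with n_jobs=4), not from the 100-tree model fit in In [117], so this number may not reflect the current model.  Re-run the prediction cell after fitting before trusting it.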

#### =============================== End of mod_1_1_2 ============================================

***