### See if changing from CountVectorizer to HashingVectorizer is helpful/harmful.

Essentially no difference in using HashingVectorizer.  HashingVectorizer *may* allow n_jobs=-1 in OneVsRest.  

Have not rerun this file after the update.

#### Imports/setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

In [2]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

import python.flat_to_labels as ftl

In [3]:
# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer, StandardScaler
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion
# for nans in the numeric data
from sklearn.preprocessing import Imputer

In [4]:
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

#### Load the data

In [5]:
# Get data
df = pd.read_csv('data/TrainingData.csv', index_col=0)

In [6]:
# take a look
df.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [7]:
### bind variable LABELS - these are actually the targets and we're going to one-hot encode them...
LABELS = ['Function',  'Use',  'Sharing',  'Reporting',  'Student_Type',  'Position_Type', 
          'Object_Type',  'Pre_K',  'Operating_Status']

### This turns out to be key.  Submission requires the dummy versions of these vars to be in this order.
LABELS.sort()

# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

# Print the converted dtypes
print(df[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


##### Let's save the unique labels for each output (category)

In [8]:
# build a dictionary
the_labels = {col : df[col].unique().tolist() for col in df[LABELS].columns}

In [9]:
the_labels['Use']

['Teacher Compensation',
 'NO_LABEL',
 'Substitute Compensation',
 'Facilities & Maintenance',
 'Instructional Materials & Supplies',
 'Food Services',
 'Security & Safety',
 'Utilities',
 'Student Transportation',
 'Parent & Community Relations',
 'Extended Time & Tutoring',
 'Enrichment',
 'Special Population Program Management & Support',
 'School Supervision',
 'Data Processing & Information Services',
 'Aides Compensation',
 'Physical Health & Services',
 'Career & Academic Counseling',
 'Library & Media',
 'Professional Development',
 'School Administration',
 'Other Non-Compensation',
 'Social & Emotional',
 'Finance, Budget, Purchasing & Distribution',
 'Human Resources',
 'Curriculum Development',
 'Legal',
 'Other Compensation',
 'Student Assignment',
 'Governance',
 'Development & Fundraising',
 'Research & Accountability',
 'Recruitment',
 'Insurance',
 'Untracked Budget Set-Aside',
 'Communications',
 'Facilities Planning']

#### Skip the first models; go directly to mod1_1

In [10]:
from python.multilabel import multilabel_train_test_split

In [11]:
NUMERIC_COLUMNS = ['FTE', 'Total']

#### Show metrics for each target and average for all targets.

In [12]:
def report_f1(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = f1_score(true[:, target], pred[:, target], average='weighted')
        print('F1 score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average F1 score for all targets : {:.3f}'.format(np.mean(the_scores)))

def report_accuracy(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = accuracy_score(true[:, target], pred[:, target])
        print('Accuracy score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average accuracy score for all targets : {:.3f}'.format(np.mean(the_scores)))


In [13]:
from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

---

### Add text processing to the model

#### Combining text columns for tokenization

The tutorial recommends combining all the text columns into a single string.  The function __*combine_text_columns()*__ drops all targets and non-text data, fills nans with empty string and joins all text columns within a row.

In [14]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text columns in each row of df to single string """
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(df.columns.tolist())
    text_data = df.drop(to_drop, axis=1)  
    # Replace nans with blanks
    text_data.fillna('', inplace=True)    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [15]:
# test it
combine_text_columns(df.sample(n=10))

356044    Salaries Or Wages For Support Personnel  Opera...
275617    EMPLOYEE BENEFITS BOND EXPENDITURES BUILDING F...
286119    SALARIES OF PART TIME EMPLOYEE CUSTODIAL-SCHOO...
188529    EMPLOYEE BENEFITS  ITEMGD PRESCHOOL PROGRAM Ea...
277806    CONTRA BENEFITS  GENERAL FUND Teacher, Long Te...
387649    Regular *  Special Instruction    Certificated...
276708    SUPPLIES  PRE-KINDERGARTEN ARRA       INSTRUCT...
266531       Teacher-Middle School       MS SOCIAL SCIEN...
420785     GENERAL EDUCATION LOCAL TEACHER,SCIENCE,CHEMI...
170672    RETIREMENT CONTRIB.  GENERAL FUND Custodian - ...
dtype: object

#### ============= Beginning of Mod1_2; just the text features, use Hashing Vectorizer ===========================

#### Funny thing, but when I simplify the pipeline (remove feature union and selection/preprocessing for numeric data), OneVsRest fails with n_jobs=-1.  Runs without it, but slow (~2x).

In [22]:
# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]

In [23]:
TEXT_COLS = sorted(list(set(NON_LABELS) - set(['FTE', 'Total']))); TEXT_COLS

['Facility_or_Department',
 'Function_Description',
 'Fund_Description',
 'Job_Title_Description',
 'Location_Description',
 'Object_Description',
 'Position_Extra',
 'Program_Description',
 'SubFund_Description',
 'Sub_Object_Description',
 'Text_1',
 'Text_2',
 'Text_3',
 'Text_4']

#### Set up X/y train/test.

In [24]:
# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[TEXT_COLS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Preprocess the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

#### Build the pipeline, but ignore numerical features

* get rid of FeatureUnion
* only select/process the text data

##### Substituting HashingVectorizer for CountVectorizer allows n_jobs=-1.  Very nearly as good.  The tutorial has some suggestions for how to make HV a reasonable drop-in replacement for CV.

In [25]:
### This time only send in the text data
mod1_2 = Pipeline([('selector', get_text_data),
                   ('vectorizer', HashingVectorizer(non_negative=True, norm=None, binary=False)),
                   ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
                  ])
 
start = timer()
# Fit to the training data
mod1_2.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))



fit time: 575.18 seconds


In [26]:
### For log loss we need the probabilities, not the predicted labels
start = timer()
mod1_2_yhat_train_probas = mod1_2.predict_proba(X_train)
mod1_2_yhat_test_probas = mod1_2.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))



Predict.proba time: 16.81 seconds


In [27]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(mod1_2_yhat_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(mod1_2_yhat_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

log loss on training set: 0.0874
log loss on test set: 0.0940


In [28]:
report_f1(ftl.flat_to_labels(y_test.values), ftl.flat_to_labels(mod1_2_yhat_test_probas))

F1 score for target Function: 0.955
F1 score for target Object_Type: 0.984
F1 score for target Operating_Status: 0.984
F1 score for target Position_Type: 0.982
F1 score for target Pre_K: 0.990
F1 score for target Reporting: 0.973
F1 score for target Sharing: 0.962
F1 score for target Student_Type: 0.973
F1 score for target Use: 0.961
Average F1 score for all targets : 0.974


In [29]:
report_accuracy(ftl.flat_to_labels(y_test.values), ftl.flat_to_labels(mod1_2_yhat_test_probas))

Accuracy score for target Function: 0.955
Accuracy score for target Object_Type: 0.984
Accuracy score for target Operating_Status: 0.985
Accuracy score for target Position_Type: 0.983
Accuracy score for target Pre_K: 0.990
Accuracy score for target Reporting: 0.973
Accuracy score for target Sharing: 0.962
Accuracy score for target Student_Type: 0.973
Accuracy score for target Use: 0.961
Average accuracy score for all targets : 0.974


### This is the same prediction quality as CountVectorizer version and  faster (n_jobs=-1).    

Can go even faster (~5 min) by using default parameters to HashVec.  F1 score is a only a smidge lower (0.014).  See below.

### Interesting to see if this scores higher or lower than CV version.  It should be more robust to unseen tokens.

In [30]:
# # Load the holdout data: holdout
# ### Over here the file is TestData.csv
# holdout = pd.read_csv('data/TestData.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
# start = timer()
# # Generate predictions: predictions
# mod1_2_predictions = mod1_2.predict_proba(holdout)
# end = timer()
# print('predict time: {} seconds'.format(end - start))



predict time: 2.0510716203698394 seconds


In [32]:
# pred_mod1_2 = pd.DataFrame(columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns, 
#                              index=holdout.index,
#                              data=mod1_2_predictions)

# pred_mod1_2.to_csv('pred_mod1_2.csv')

### 0.6829 on holdout set at Drivendata; essentially equivalent to previous

#### ======================== End of Mod1_2 ===================================

#### ======================== Begin Mod1_3; use 1 character tokens ===================================

In [34]:
# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

### Only send in the text data
mod1_3 = Pipeline([('selector', get_text_data),
                   ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                    non_negative=True, norm=None, binary=False)),
                   ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
                  ])
start = timer()
# Fit to the training data
mod1_3.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))



fit time: 556.18 seconds


In [35]:
### For log loss we need the probabilities, not the predicted labels
start = timer()
mod1_3_yhat_train_probas = mod1_3.predict_proba(X_train)
mod1_3_yhat_test_probas = mod1_3.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))



Predict.proba time: 16.23 seconds


In [36]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(mod1_3_yhat_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(mod1_3_yhat_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

log loss on training set: 0.0963
log loss on test set: 0.1032


In [37]:
report_f1(ftl.flat_to_labels(y_test.values), ftl.flat_to_labels(mod1_3_yhat_test_probas))

F1 score for target Function: 0.949
F1 score for target Object_Type: 0.983
F1 score for target Operating_Status: 0.984
F1 score for target Position_Type: 0.975
F1 score for target Pre_K: 0.990
F1 score for target Reporting: 0.972
F1 score for target Sharing: 0.959
F1 score for target Student_Type: 0.974
F1 score for target Use: 0.955
Average F1 score for all targets : 0.971


In [38]:
report_accuracy(ftl.flat_to_labels(y_test.values), ftl.flat_to_labels(mod1_3_yhat_test_probas))

Accuracy score for target Function: 0.949
Accuracy score for target Object_Type: 0.983
Accuracy score for target Operating_Status: 0.984
Accuracy score for target Position_Type: 0.975
Accuracy score for target Pre_K: 0.990
Accuracy score for target Reporting: 0.972
Accuracy score for target Sharing: 0.959
Accuracy score for target Student_Type: 0.974
Accuracy score for target Use: 0.955
Average accuracy score for all targets : 0.971


### Changing to use single character tokens doesn't seem to help.  Quality slightly lower.

### Not worth submitting this one.

In [None]:
# # Load the holdout data: holdout
# ### Over here the file is TestData.csv
# holdout = pd.read_csv('data/TestData.csv', index_col=0)

In [None]:
# start = timer()
# # Generate predictions: predictions
# mod1_3_predictions = mod1_3.predict_proba(holdout)
# end = timer()
# print('predict time: {} seconds'.format(end - start))

In [None]:
# pred_mod1_3 = pd.DataFrame(columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns, 
#                              index=holdout.index,
#                              data=mod1_3_predictions)

# pred_mod1_3.to_csv('pred_mod1_3.csv')



***