##### 18jul Only part 4

Mod4, but without numerical features.

#### scores 0.7774, better than with numerical features (0.8893). Worse than mod3_1.

#### Imports/setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

In [2]:
from sklearn.metrics import f1_score, accuracy_score

import python.flat_to_labels as ftl

In [3]:
# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion
# for nans in the numeric data
from sklearn.preprocessing import Imputer, StandardScaler, MaxAbsScaler
# feature selection
from sklearn.feature_selection import chi2, SelectKBest

In [4]:
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#### Load the data

In [5]:
# Get data
df = pd.read_csv('data/TrainingData.csv', index_col=0)

In [6]:
# take a look
df.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [7]:
### bind variable LABELS - these are actually the targets and we're going to one-hot encode them...
LABELS = ['Function',  'Use',  'Sharing',  'Reporting',  'Student_Type',  'Position_Type', 
          'Object_Type',  'Pre_K',  'Operating_Status']

### This turns out to be key.  Submission requires the dummy versions of these vars to be in this order.
LABELS.sort()

# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

# Print the converted dtypes
print(df[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


##### Let's save the unique labels for each output (category)

In [8]:
# build a dictionary
the_labels = {col : df[col].unique().tolist() for col in df[LABELS].columns}

In [9]:
the_labels['Use']

['Instruction',
 'NO_LABEL',
 'O&M',
 'Pupil Services & Enrichment',
 'ISPD',
 'Leadership',
 'Business Services',
 'Untracked Budget Set-Aside']

#### Setting up for train-test split.

In [10]:
from python.multilabel import multilabel_train_test_split

In [11]:
NUMERIC_COLUMNS = ['FTE', 'Total']

#### metric support

In [12]:
import python.flat_to_labels as ftl

In [13]:
def report_f1(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = f1_score(true[:, target], pred[:, target], average='weighted')
        print('F1 score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average F score for all targets : {:.3f}'.format(np.mean(the_scores)))

def report_accuracy(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = accuracy_score(true[:, target], pred[:, target])
        print('Accuracy score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average accuracy score for all targets : {:.3f}'.format(np.mean(the_scores)))


### Add text processing to the model

#### Combining text columns for tokenization

The tutorial recommends combining all the text columns into a single string.

In [14]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text columns in each row of df to single string """
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(df.columns.tolist())
    text_data = df.drop(to_drop, axis=1)  
    # Replace nans with blanks
    text_data.fillna('', inplace=True)    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [15]:
# test it
combine_text_columns(df.sample(n=10))

287799    Personal Services - Teachers     Regular Instr...
418416    Extra Duty Pay/Overtime For Support Personnel ...
406348    TRAVEL - EMPLOYEES  MISCELLANEOUS     ADMIN. S...
287503    Supplemental Textbooks  Special Instruction   ...
160777    PURCHASED PROFESSIONAL AND TECH SVCS-OTHER FEE...
62737     CONTRA BENEFITS  GENERAL FUND Custodian Asst I...
50037     OTHER PERSONAL SERVICES          SUB TEACHER A...
260469    NONCAPITALIZED  REMOD/RENOV          School  F...
355412     AFTERSCHOOL PROGRAMS SUMMER ED, ARTS & SPORTS...
300343    EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...
dtype: object

#### ================================ Begin mod4 =========================================

#### get some necessary pieces

In [16]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Use all 0s instead of noise: get_numeric_data
get_numeric_data_hack = FunctionTransformer(lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=np.float), validate=False)

### Mod4: Add feature interactions 

The tutorial provides a utility transformer called SparseInteractions. This add features to the model based on the combination of each feature pair.

In [17]:
from python.sparse_interactions import SparseInteractions

# # Instantiate pipeline: pl
# pl = Pipeline([
#         ('union', FeatureUnion(
#             transformer_list = [
#                 ('numeric_features', Pipeline([
#                     ('selector', get_numeric_data),
#                     ('imputer', Imputer())
#                 ])),
#                 ('text_features', Pipeline([
#                     ('selector', get_text_data),
#                     ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
#                                                    ngram_range=(1, 2))),  
#                     ('dim_red', SelectKBest(chi2, chi_k))
#                 ]))
#              ]
#         )),
#         # Now add the interaction features to the selected feature set
#         ('int', SparseInteractions(degree=2)),
#         ('scale', MaxAbsScaler()),
#         ('clf', OneVsRestClassifier(LogisticRegression()))
#     ])

### Use hashing vectorizer instead of CountVectorizer

For the final model they swap vectorizer claiming computational advantages.

In [18]:
# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# the final model (but with numeric data zeroed out and defaults for HashingVectorizer)
chi_k = 300

mod4 = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data_hack),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
    ])

### The final model in the tutorial.  Fit and see how it does.

In [19]:
start = timer()
# Fit to the training data
mod4.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))



fit time: 3218.56 seconds


In [20]:
# Compute and print accuracy
accuracy = mod4.score(X_test, y_test)
print("\nAccuracy on budget dataset: {:0.4f}".format(accuracy) )




Accuracy on budget dataset: 0.8081


In [21]:
print('3277 seconds is {:.4f} minutes.'.format(3277/60))

3277 seconds is 54.6167 minutes.


##### That took a long time.  Accuracy is better.  Now submit.

In [22]:
# Compute and print accuracy
# Print the accuracy scores
start = timer()
mod4_yhat_train = mod4.predict(X_train)
mod4_yhat_test = mod4.predict(X_test)
end = timer()
print('predict time: {} seconds'.format(end - start))



predict time: 283.9253192681508 seconds


In [23]:
print('train accuracy: {:.4f}'.format(accuracy_score(y_train, mod4_yhat_train,)))
print('test accuracy: {:.4f}'.format(accuracy_score(y_train, mod4_yhat_train, )))

train accuracy: 0.8170
test accuracy: 0.8170


In [24]:
start = timer()
mod4_train_probas = mod4.predict_proba(X_train)
mod4_test_probas = mod4.predict_proba(X_test)
end = timer()
print('predict time: {} seconds'.format(end - start))



predict time: 296.60975500530867 seconds


In [25]:
##### save results
np.save('fmm_out/mod4_f_train_probas', mod4_train_probas)
np.save('fmm_out/mod4_test_probas_flat', mod4_test_probas)
np.save('fmm_out/mod4_y_train',       y_train.values)
np.save('fmm_out/mod4_y_test',       y_test.values)

In [27]:
from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES
BPCI = BOX_PLOTS_COLUMN_INDICES

In [28]:
multi_multi_log_loss(mod4_train_probas, y_train.values, BOX_PLOTS_COLUMN_INDICES)

0.13749964917770463

In [29]:
multi_multi_log_loss(mod4_test_probas, y_test.values, BOX_PLOTS_COLUMN_INDICES)

0.14887562239743565

#### Predict because I'm not sure this model was ever scored.

In [30]:
# Load the holdout data: holdout
### Over here the file is TestData.csv
holdout = pd.read_csv('data/TestData.csv', index_col=0)

start = timer()
# Generate predictions: predictions
mod4_text_predictions = mod4.predict_proba(holdout)
end = timer()
print('predict time: {} seconds'.format(end - start))

pred_mod4_text = pd.DataFrame(columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns, 
                              index=holdout.index, data=mod4_text_predictions)

pred_mod4_text.to_csv('pred_mod4_text.csv')

  interactivity=interactivity, compiler=compiler, result=result)


predict time: 31.194608205606528 seconds


### This model scores 0.7774, better than with numerical features (0.8893).  Worse than mod3_1.