#### Using the best-scoring model: add back the numerical feature `Total` (with preprocessing); do not scale before classification

#### mod3_1 with regularization, C=0.03333; include the preprocessed `Total` feature

#### score is 0.6237

```
LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
                   class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', 
                   verbose=0, warm_start=False, n_jobs=1)
```

In [1]:
#### Imports/setup

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MaxAbsScaler
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion
# for nans in the numeric data
from sklearn.preprocessing import Imputer

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

# metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report

# unflattener
import python.flat_to_labels as ftl

#### Set up a train-test split making sure we have all labels in both splits
from python.multilabel import multilabel_train_test_split

from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

#### Load the data

In [2]:
# Get data
# Read the training CSV; index_col=0 uses the file's first column as the row index
the_data = pd.read_csv('data/TrainingData.csv', index_col=0)

# take a look
# (last expression in the cell, so the first rows are displayed)
the_data.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


In [3]:
# (rows, columns) of the full training frame
the_data.shape

(400277, 25)

In [4]:
# List the column names: the target columns plus the text and numeric feature columns
the_data.columns

Index(['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
       'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
       'Object_Description', 'Text_2', 'SubFund_Description',
       'Job_Title_Description', 'Text_3', 'Text_4', 'Sub_Object_Description',
       'Location_Description', 'FTE', 'Function_Description',
       'Facility_or_Department', 'Position_Extra', 'Total',
       'Program_Description', 'Fund_Description', 'Text_1'],
      dtype='object')

####  Encode the targets as categorical variables

In [7]:
# Show the dtype of every column.
# NOTE(review): the recorded output lists the target columns as `category`, but that
# conversion only happens in the LABELS cell further down — presumably this cell was
# executed after it; on a fresh top-to-bottom run the output here may differ. Confirm.
the_data.dtypes

Function                  category
Use                       category
Sharing                   category
Reporting                 category
Student_Type              category
Position_Type             category
Object_Type               category
Pre_K                     category
Operating_Status          category
Object_Description          object
Text_2                      object
SubFund_Description         object
Job_Title_Description       object
Text_3                      object
Text_4                      object
Sub_Object_Description      object
Location_Description        object
FTE                        float64
Function_Description        object
Facility_or_Department      object
Position_Extra              object
Total                      float64
Program_Description         object
Fund_Description            object
Text_1                      object
dtype: object

In [6]:
### LABELS names the target columns; they get one-hot encoded later on.
### Sorting turns out to be key: the submission requires the dummy versions of
### these vars to be in this (sorted) order.
LABELS = sorted(['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
                 'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status'])


# Helper that casts a column to the pandas 'category' dtype: categorize_label
def categorize_label(x):
    return x.astype('category')


# Convert every column of the_data[LABELS] to a categorical type (column by column)
the_data[LABELS] = the_data[LABELS].apply(categorize_label)

# Print the converted dtypes
print(the_data[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


In [7]:
# Preview only the target columns, in the sorted LABELS order
the_data[LABELS].head()

Unnamed: 0,Function,Object_Type,Operating_Status,Position_Type,Pre_K,Reporting,Sharing,Student_Type,Use
134338,Teacher Compensation,NO_LABEL,PreK-12 Operating,Teacher,NO_LABEL,School,School Reported,NO_LABEL,Instruction
206341,NO_LABEL,NO_LABEL,Non-Operating,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL
326408,Teacher Compensation,Base Salary/Compensation,PreK-12 Operating,Teacher,Non PreK,School,School Reported,Unspecified,Instruction
364634,Substitute Compensation,Benefits,PreK-12 Operating,Substitute,NO_LABEL,School,School Reported,Unspecified,Instruction
47683,Substitute Compensation,Substitute Compensation,PreK-12 Operating,Teacher,NO_LABEL,School,School Reported,Unspecified,Instruction


#### Save the unique labels for each output (category)

In [3]:
# Map each target column to the list of distinct values it takes
the_labels = {label: the_data[label].unique().tolist() for label in LABELS}
# Peek at a single entry
the_labels['Use']

NameError: name 'LABELS' is not defined

#### Change fraction to suit.
Note: small fractions will have a hard time ensuring labels in both splits.

In [5]:
# downsize it or not
# frac=1.0 keeps every row; random_state fixes the sampling so reruns give the same frame
df = the_data.sample(frac=1.0, random_state=777) # this seed gets a split with enough labels in both sets
# Alternative for quick experiments: keep only 10% of the rows
# df = the_data.sample(frac=0.1, random_state=777)

In [6]:
# Preview the sampled frame
df.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
223673,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,Purchased Services,,Community Services,,,,Purchased Services,,,Non-Public School Services *,,,124.25,,Title I - Disadvantaged Children/Targeted Assi...,ARRA-NONPUBLIC BLDG ALLOC
49034,Extended Time & Tutoring,Instruction,School Reported,School,Unspecified,Sec/Clerk/Other Admin,Other Compensation/Stipend,NO_LABEL,PreK-12 Operating,ADDITIONAL/EXTRA DUTY PAY/STIP,,DISTRICT SPECIAL REVENUE FUNDS,Secretary II,Regular,,,,,LIGHTS ON AFTER SCHOOL,,PROFESSIONAL-INSTRUCTIONAL,328.39024,AFTER SCHOOL PROGRAMS,,COCURRICULAR ED/ACTIVITIES
268044,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Regular *,TEACHER,Regular Instruction,"TCHR, K-8 PRIMARY (GR",,,Certificated Employees Salaries And Wages,EN OTH CERTIFICATED PERSON,1.0,Elementary,,,109255.516638,,General,SCHOOL BASED MANAGEMENT
204167,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,3RD GRADE,61256.28,PRIMARY GRADES PROGRAM,General Fund,
190219,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,"Operating, Not PreK-12",BONUSES,,,"MANAGER III, FOOD SERVICE",,,,CHARTER,,FOOD SERVICE,,,1075.02,,CONVERSION CHARTER SCHOOLS,


#### Get targets as set of one-hot encoded columns

In [7]:
# name these columns
# (the two float64 feature columns; everything else that is not a label is text)
NUMERIC_COLUMNS = ['FTE', 'Total']

# Get labels and convert to dummy variables: label_dummies
# NOTE(review): the same expression is computed again as `dummy_labels` in the
# train/test split cell below, and that is the name the split uses.
label_dummies = pd.get_dummies(df[LABELS])

#### Setting up a train-test split  for modeling

In [8]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """Collapse the text columns of every row of df into one space-separated string.

    Columns named in to_drop are removed first (names that are not in df are
    ignored), missing values become empty strings, and the remaining cells of
    each row are joined with a single space between them.
    """
    # Only drop the non-text columns that this particular frame actually has
    present = set(df.columns.tolist()).intersection(to_drop)
    # NaN -> '' so that every remaining cell is a string before joining
    text_only = df.drop(present, axis=1).fillna('')
    # One joined string per row
    return text_only.apply(" ".join, axis=1)

In [9]:
# Import FunctionTransformer
# (duplicate of the import in the setup cell)
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the features in the data
# (every column that is not a target: the text columns plus FTE and Total)
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
# NOTE(review): 0.2 is presumably the test-set size (fraction) — confirm against
# python.multilabel.multilabel_train_test_split
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
# Preprocess the text data: get_text_data
# validate=False so the DataFrame is handed to combine_text_columns unchanged
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Numeric features: get_numeric_data_hack
# An earlier variant (kept commented out below) replaced both numeric columns with zeros.
# This version puts back Total but uses its absolute value and imputes the mean for missing values

# get_numeric_data_hack = FunctionTransformer(lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=np.float), validate=False)

def fix_up_numbers(cols):
    """Build the numeric feature frame (FTE and Total) that is fed to the classifier.

    Total is made non-negative, its missing values are replaced by the mean of
    the observed values, and the result is clipped to [1e-5, 1e5]. FTE is
    replaced by zeros, so it carries no information. The input frame is not
    modified.

    Parameters
    ----------
    cols : pandas.DataFrame
        Frame containing at least the NUMERIC_COLUMNS ('FTE' and 'Total').

    Returns
    -------
    pandas.DataFrame
        A copy of cols[NUMERIC_COLUMNS] holding the cleaned-up values.
    """
    rval = cols[NUMERIC_COLUMNS].copy()
    total = cols['Total'].abs()
    # NOTE(review): the mean is taken from whatever frame is passed in, so the
    # holdout set is imputed with its own mean rather than the training mean.
    fill_value = total.mean()
    if np.isnan(fill_value):
        # No observed value at all: fall back to 0, which the clip lifts to 1e-5
        fill_value = 0.0
    # Mean-fill with pandas instead of sklearn's Imputer (removed in scikit-learn
    # 0.22); this also keeps the column 1-D instead of assigning an (n, 1) array.
    rval['Total'] = total.fillna(fill_value).clip(1e-5, 1e5)
    # Builtin float: the np.float alias was removed in NumPy 1.24
    rval['FTE'] = np.zeros(cols['FTE'].shape, dtype=float)
    return rval

# Wrap fix_up_numbers as a pipeline step; validate=False passes the DataFrame through as-is
get_numeric_data_hack = FunctionTransformer(fix_up_numbers, validate=False)

#####  sb no scaler and C=0.03333 (third of a tenth)

In [10]:
#### Build the pipeline
# Text branch: merge the text columns into one string per row, then count 1- and 2-grams
text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
])

# Glue the preprocessed numbers and the text counts together
feature_union = FeatureUnion(transformer_list=[
    ('numeric_features', get_numeric_data_hack),
    ('text_features', text_branch),
])

# One regularized logistic regression per one-hot label column, fitted in parallel
mod_reg_033 = Pipeline([
    ('union', feature_union),
    # no scaler ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression(C=0.03333), n_jobs=-1)),
])

In [11]:
# Wall-clock timing around the fit
start = timer()
# Fit to the training data
mod_reg_033.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

# Hand-recorded timing, presumably from an earlier run (the recorded output differs):
# 384 sec all procs blazing

fit time: 543.29 seconds


In [12]:
# get probas
# Predict on both splits; the single timing below covers the two calls together
start = timer()
mod_reg_033_train_probas = mod_reg_033.predict_proba(X_train)
mod_reg_033_test_probas = mod_reg_033.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))

Predict.proba time: 39.08 seconds


In [13]:
# Training split: compute multi_multi_log_loss, then report it
train_loss = multi_multi_log_loss(mod_reg_033_train_probas, y_train.values, BOX_PLOTS_COLUMN_INDICES)
print('log loss on training set: {:0.4f}'.format(train_loss))

# Test split: same metric on the held-back rows
test_loss = multi_multi_log_loss(mod_reg_033_test_probas, y_test.values, BOX_PLOTS_COLUMN_INDICES)
print('log loss on test set: {:0.4f}'.format(test_loss))

# Hand-recorded figures labelled "without scaling":
# log loss on training set: 0.0743
# log loss on test set: 0.0808
# NOTE(review): these differ from this cell's recorded output (0.3007 / 0.3026) —
# presumably they were carried over from another run; confirm or remove.

log loss on training set: 0.3007
log loss on test set: 0.3026


In [14]:
def report_f1(true, pred):
    """Print the weighted F1 score of every target, then the mean over all targets.

    true and pred are indexed as [:, i], where i is the position of a target
    in LABELS. Nothing is returned.
    """
    per_target = []
    for idx, name in enumerate(LABELS):
        score = f1_score(true[:, idx], pred[:, idx], average='weighted')
        print('F1 score for target {}: {:.3f}'.format(name, score))
        per_target.append(score)
    print('Average F1 score for all targets : {:.3f}'.format(np.mean(per_target)))

def report_accuracy(true, pred):
    """Print the accuracy of every target, then the mean over all targets.

    true and pred are indexed as [:, i], where i is the position of a target
    in LABELS. Nothing is returned.
    """
    per_target = []
    for idx, name in enumerate(LABELS):
        score = accuracy_score(true[:, idx], pred[:, idx])
        print('Accuracy score for target {}: {:.3f}'.format(name, score))
        per_target.append(score)
    print('Average accuracy score for all targets : {:.3f}'.format(np.mean(per_target)))

In [15]:
# ftl wants ndarray, not pd.Dataframe
# NOTE(review): presumably collapses the one-hot test targets into one column per
# entry of LABELS (the report functions index the result that way) — confirm in
# python.flat_to_labels
the_ys = ftl.flat_to_labels(y_test.values)

In [16]:
# Per-target F1 on the test split; the predicted probabilities go through the same
# flat_to_labels conversion as the true labels
report_f1(the_ys, ftl.flat_to_labels(mod_reg_033_test_probas))

# Per-target accuracy on the same predictions
report_accuracy(the_ys, ftl.flat_to_labels(mod_reg_033_test_probas))

F1 score for target Function: 0.899
F1 score for target Object_Type: 0.939
F1 score for target Operating_Status: 0.956
F1 score for target Position_Type: 0.911
F1 score for target Pre_K: 0.971
F1 score for target Reporting: 0.907
F1 score for target Sharing: 0.897
F1 score for target Student_Type: 0.944
F1 score for target Use: 0.903
Average F1 score for all targets : 0.925
Accuracy score for target Function: 0.902
Accuracy score for target Object_Type: 0.940
Accuracy score for target Operating_Status: 0.958
Accuracy score for target Position_Type: 0.913
Accuracy score for target Pre_K: 0.972
Accuracy score for target Reporting: 0.910
Accuracy score for target Sharing: 0.901
Accuracy score for target Student_Type: 0.945
Accuracy score for target Use: 0.907
Average accuracy score for all targets : 0.928


#### =========================== predict on holdout set ==================================

In [17]:
# Load the holdout data: holdout
### Over here the file is TestData.csv
# index_col=0 uses the file's first column as the row index, as for the training data
holdout = pd.read_csv('data/TestData.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
# Compare shapes. The recorded output shows 16 holdout columns against 25 training
# columns — presumably the 9 target columns are absent from the holdout file.
holdout.shape, the_data.shape

((50064, 16), (400277, 25))

In [19]:
# Wall-clock timing around the holdout prediction
start = timer()
# Generate predictions: predictions
mod_reg_033_predictions = mod_reg_033.predict_proba(holdout)
end = timer()
# Unlike the earlier timing prints, this one does not round the elapsed time
print('predict time: {} seconds'.format(end - start))

predict time: 3.0139019210215565 seconds


In [23]:
# Column headers: the one-hot label names, built with the '__' separator
submission_columns = pd.get_dummies(df[LABELS], prefix_sep='__').columns

# One row per holdout record, one probability column per one-hot label
pred_total_reg_033 = pd.DataFrame(data=mod_reg_033_predictions,
                                  index=holdout.index,
                                  columns=submission_columns)

# Write the predictions out as a CSV file
pred_total_reg_033.to_csv('mod_total_reg_033.csv')

#### score is 0.6237

####  =================== end mod_reg_033  ========================

In [21]:
# Read back a saved predictions file.
# NOTE(review): this is 'mod_best_total.csv', not the 'mod_total_reg_033.csv' written
# above — presumably an earlier submission loaded for comparison; confirm.
thing = pd.read_csv('mod_best_total.csv', index_col=0)

In [22]:
# (rows, columns) of the file read back; the recorded row count matches the holdout set
thing.shape

(50064, 104)

***