### Using 1_1_2, fit on the whole dataset, then predict on holdout and submit.

#### This works for any fraction of the data <= 90%.  Somewhere between 90% and 100% there's a problem: the fit appears to run (the machine is heavily loaded, then idle) but the call never returns.  The same thing happens in plain ipython (i.e. without the notebook).

#### Okay.  Here's a workaround: leave out the n_jobs=-1.  Trying it.  

#### Yes, works fine, but takes close to half an hour.  This may be better than the best so far.

In [1]:
#### Imports/setup

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer, StandardScaler
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion
# for nans in the numeric data
from sklearn.preprocessing import Imputer

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report

# unflattener
import python.flat_to_labels as ftl

#### Set up a train-test split making sure we have all labels in both splits
from python.multilabel import multilabel_train_test_split

from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

#### Load the data

In [2]:
# Load the training data; the first CSV column is used as the row index.
the_data = pd.read_csv('data/TrainingData.csv', index_col=0)

# Show the first rows as the notebook cell output (sanity check of the load).
the_data.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [3]:
### LABELS names the target columns; they are one-hot encoded further down.
### Alphabetical order matters: the submission requires the dummy versions
### of these variables to appear in this order.
LABELS = sorted([
    'Function',
    'Use',
    'Sharing',
    'Reporting',
    'Student_Type',
    'Position_Type',
    'Object_Type',
    'Pre_K',
    'Operating_Status',
])


def categorize_label(x):
    """Return ``x`` converted to the pandas ``category`` dtype."""
    return x.astype('category')


# Re-type every target column (column by column) as categorical, in place.
the_data[LABELS] = the_data[LABELS].apply(categorize_label, axis=0)

# Confirm the conversion by printing the resulting dtypes.
print(the_data[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


#### Save the unique labels for each output (category)

In [4]:
# Map each target column name to the list of distinct values it takes.
the_labels = {
    target: the_data[target].unique().tolist()
    for target in LABELS
}
# Inspect a single entry as the cell output.
the_labels['Use']

['Instruction',
 'NO_LABEL',
 'O&M',
 'Pupil Services & Enrichment',
 'ISPD',
 'Leadership',
 'Business Services',
 'Untracked Budget Set-Aside']

#### Change fraction to suit.
Note: small fractions will have a hard time ensuring labels in both splits.

In [5]:
# Choose the working frame `df` used by the later cells.
# To experiment on a random subset, swap in the sampled version instead:
# df = the_data.sample(frac=0.90)
# As written, `df` is the full dataset (the same object as `the_data`, not a copy).
df = the_data

#### Get targets as set of one-hot encoded columns

In [6]:
# The numeric feature columns; all other non-label columns are treated as
# text by combine_text_columns() (which drops these by default).
NUMERIC_COLUMNS = ['FTE', 'Total']

# One-hot encode the target columns: label_dummies.
# NOTE(review): the model fit later in this file uses `dummy_labels`, which is
# computed separately from the same expression; this copy looks redundant.
label_dummies = pd.get_dummies(df[LABELS])

#### Setting up a train-test split  for modeling

#### ======================== Begin Mod1_1_1; add bigrams ===================================

Some things to note about the default CountVectorizer:
1) All strings are downcased
2) The default setting selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).  This means single letter or digit tokens are ignored.
3) If the vectorizer is used to transform another input (e.g. test), any tokens not in the original corpus are ignored.

#### One way to work around the bug exposed with CountVectorizer/OneVsRest/Logistic is to replace all the numeric values with 0.  The classifiers should ignore them (and the fit might then work with n_jobs=-1).

Yes, this works well and uses all processors yielding the same results as the slower, 1-processor version above.  Fits in 464 sec instead of 827 sec.

In [7]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """Collapse the text columns of each row of ``df`` into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. The frame itself is not modified.
    to_drop : iterable of str, optional
        Names of non-text columns to exclude. Names that are not present in
        ``df`` are ignored, so a frame lacking some of them is accepted.
        Defaults to the numeric columns plus the label columns.

    Returns
    -------
    pandas.Series
        One value per row: the remaining cells joined with single spaces,
        with missing values contributing an empty string.
    """
    # Drop only the excluded columns that actually exist in this frame;
    # asking drop() for an absent column would raise.
    excluded = set(to_drop)
    present = [col for col in df.columns if col in excluded]
    text_data = df.drop(present, axis=1)

    # Blank out NaNs first (so they do not become the literal "nan"), then
    # force every cell to str: " ".join() raises TypeError on any non-string
    # value, e.g. a numeric column that was not listed in to_drop.
    text_data = text_data.fillna('').astype(str)

    # Join all items in a row with a space in between.
    return text_data.apply(lambda row: " ".join(row), axis=1)

#### Don't need X_train, etc.  We're fitting on the whole dataset.

In [8]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# One-hot encode the target columns; these are the targets the model is fit on.
dummy_labels = pd.get_dummies(df[LABELS])

# Every column that is not a target is a feature.
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets (20% test, fixed seed).
# NOTE(review): the model in this file is fit on the whole of df[NON_LABELS],
# so these splits appear to be unused here -- confirm before removing.
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2,
                                                               seed=123)

# Text selector: collapses each row's text columns into one string.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Numeric selector "hack": emit all zeros, shaped like the numeric columns,
# instead of the real values. Builtin `float` is used for the dtype because
# the `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24; it
# was only ever an alias for `float`, so the resulting array is identical.
get_numeric_data_hack = FunctionTransformer(
    lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=float),
    validate=False)

#### This is the same as mod3_1, the best scoring model.  There are two differences:
1) this is fit against the whole dataset, not just the training split
2) this one doesn't use n_jobs=-1 in the OneVsRest (because it doesn't seem to work anymore; just hangs after doing the fitting).

This one took 2k sec; other one was 800 sec.  

In [9]:
#### Build the pipeline
# Numeric branch: zeroed numeric columns followed by an imputer.
_numeric_branch = Pipeline([
    ('selector', get_numeric_data_hack),
    ('imputer', Imputer()),
])

# Text branch: combined text per row, vectorized as unigrams and bigrams.
_text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
])

# Glue both branches side by side, then classify one-vs-rest
# (no n_jobs, so the fit runs with the classifier's default parallelism).
_feature_union = FeatureUnion(
    transformer_list = [
        ('numeric_features', _numeric_branch),
        ('text_features', _text_branch),
    ])
mod_1_1_2 = Pipeline([
    ('union', _feature_union),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

# Fit on the full feature frame and report the wall-clock time.
start = timer()
mod_1_1_2.fit(df[NON_LABELS], dummy_labels)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

fit time: 1994.57 seconds


#### When it's done, get holdout, predict probabilities and write out.

#### For log loss we need the probabilities, not the predicted labels

In [11]:
# Load the holdout features; the first CSV column becomes the index, which is
# reused as the index of the submission frame.
# NOTE(review): the truncated warning printed after this cell is presumably
# pandas' mixed-dtype warning -- confirm, and consider explicit dtypes.
holdout = pd.read_csv('data/TestData.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Time the class-probability predictions for the holdout rows.
start = timer()
pred_1_1_2 = mod_1_1_2.predict_proba(holdout)
end = timer()
print('predict time: {:0.2f} seconds'.format(end - start))

# Submission headers: the dummy-encoded label names with a '__' separator.
_submission_columns = pd.get_dummies(df[LABELS], prefix_sep='__').columns

# One row per holdout record, one probability column per label value.
prediction_df = pd.DataFrame(data=pred_1_1_2,
                             index=holdout.index,
                             columns=_submission_columns)

# Write the submission file.
prediction_df.to_csv('sub_1_1_2.csv')

predict time: 2.58 seconds


#### Did not beat prior best.  This scored 0.6676, just a tiny bit worse. 

#### =============================== End of mod_1_1_2 ============================================

***