### Enhance best scoring model by adding my feature interaction scheme.

#### Final comments on n_jobs=-1

There was reason to think that this bug was related to windows but, it works fine sometimes.
There was reason to think it had to do with CountVectorizer, but HashingVectorizer versions also hang.

Final determination is that this is an intermittent bug.  Haven't been able to find a case where I can reliably induce/remove the problem.

In the end, the only reasonable course is to not use this flag understanding that everything will take 2-3 times as long.

#### Add my feature interaction scheme

In [113]:
#### Imports/setup

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MaxAbsScaler
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion
# for nans in the numeric data
from sklearn.preprocessing import Imputer

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

# metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report

# unflattener
import python.flat_to_labels as ftl

#### Set up a train-test split making sure we have all labels in both splits
from python.multilabel import multilabel_train_test_split

from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

#### My transformers

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

class IdentityTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, input_array, y=None):
        return self
    
    def transform(self, input_array, y=None):
        return input_array*1


In [18]:
from itertools import combinations
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class JustInteractions(BaseEstimator, TransformerMixin):
    ''' Takes a selection of features and returns products of all combinations (and nothing else). '''
    def __init__(self, degree=2, feature_name_separator="_"):
        self.degree = degree
        self.feature_name_separator = feature_name_separator

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not sparse.isspmatrix_csc(X):
            X = sparse.csc_matrix(X)

        if hasattr(X, "columns"):
            self.orig_col_names = X.columns
        else:
            self.orig_col_names = np.array([str(i) for i in range(X.shape[1])])

        spi = self._create_sparse_interactions(X)
        return spi

    def get_feature_names(self):
        return self.feature_names

    def _create_sparse_interactions(self, X):
        out_mat = []
        self.feature_names = self.orig_col_names.tolist()
        ### degree is always 2 so don't worry about it now.  happens once.
        for sub_degree in range(2, self.degree + 1):
            for col_ixs in combinations(range(X.shape[1]), sub_degree):
                # add name for new column
                name = self.feature_name_separator.join(self.orig_col_names[list(col_ixs)])
                self.feature_names.append(name)

                # get column multiplications value
                out = X[:, col_ixs[0]]
                for j in col_ixs[1:]:
                    out = out.multiply(X[:, j])
            
                out_mat.append(out)
        # theory: out_mat is a list of columns; I need to shstack it      
        # print(out_mat)
        # return sparse.hstack([X] + out_mat)
        # return sparse.csc_matrix(np.array(out_mat))
        return sparse.hstack(out_mat)

#### Load the data

In [2]:
# Get data
the_data = pd.read_csv('data/TrainingData.csv', index_col=0)

# take a look
the_data.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [3]:
### bind variable LABELS - these are actually the targets and we're going to one-hot encode them...
LABELS = ['Function',  'Use',  'Sharing',  'Reporting',  'Student_Type',  'Position_Type', 
          'Object_Type',  'Pre_K',  'Operating_Status']

### This turns out to be key.  Submission requires the dummy versions of these vars to be in this order.
LABELS.sort()

# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
the_data[LABELS] = the_data[LABELS].apply(categorize_label, axis=0)

# Print the converted dtypes
print(the_data[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


#### Save the unique labels for each output (category)

In [4]:
# build a dictionary
the_labels = {col : the_data[col].unique().tolist() for col in the_data[LABELS].columns}
# take a look at one entry
the_labels['Use']

['Instruction',
 'NO_LABEL',
 'O&M',
 'Pupil Services & Enrichment',
 'ISPD',
 'Leadership',
 'Business Services',
 'Untracked Budget Set-Aside']

#### Change fraction to suit.
Note: small fractions will have a hard time ensuring labels in both splits.

In [5]:
# downsize it or not
# df = the_data.sample(frac=0.90)
df = the_data

#### Get targets as set of one-hot encoded columns

In [6]:
# name these columns
NUMERIC_COLUMNS = ['FTE', 'Total']

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

#### Setting up a train-test split  for modeling

#### ======================== Begin best previous ===================================

Some things to note about the default CountVectorizer and HashingVectorizer:
1. All strings are downcased
2. The default setting selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).  This means single letter or digit tokens are ignored.
3. If the vectorizer is used to transform another input (e.g. test), any tokens not in the original corpus are ignored.

In [7]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text columns in each row of df to single string """
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(df.columns.tolist())
    text_data = df.drop(to_drop, axis=1)  
    # Replace nans with blanks
    text_data.fillna('', inplace=True)    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [8]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the features in the data
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Use all 0s instead of noise: get_numeric_data
get_numeric_data_hack = FunctionTransformer(lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=np.float), validate=False)

#### Switching from this: CountVectorizer(ngram_range=(1,2)) to this: HashingVectorizer(ngram_range=(1,2), non_negative=True). Reduces quality.   Without this flag, results track count vectorizer.

In [9]:
#### Build the pipeline
mod_1_1_2 = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', get_numeric_data_hack),
                ('text_features', Pipeline([('selector', get_text_data),
                                            ('vectorizer', HashingVectorizer(ngram_range=(1,2), non_negative=True))]))
             ])),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

start = timer()
# Fit to the training data
mod_1_1_2.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

# ~500sec



fit time: 916.86 seconds


In [10]:
# get probas
start = timer()
mod_1_1_2_train_probas = mod_1_1_2.predict_proba(X_train)
mod_1_1_2_test_probas = mod_1_1_2.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))

# Predict.proba time: 24.21 seconds




Predict.proba time: 24.87 seconds


In [11]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(mod_1_1_2_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(mod_1_1_2_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

# log loss on training set: 0.0467
# log loss on training set: 0.0573

log loss on training set: 0.1083
log loss on training set: 0.1113


In [12]:
def report_f1(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = f1_score(true[:, target], pred[:, target], average='weighted')
        print('F1 score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average F1 score for all targets : {:.3f}'.format(np.mean(the_scores)))

def report_accuracy(true, pred):
    the_scores = []
    for target in range(len(LABELS)):
        the_score = accuracy_score(true[:, target], pred[:, target])
        print('Accuracy score for target {}: {:.3f}'.format(LABELS[target], the_score))
        the_scores.append(the_score)
    print('Average accuracy score for all targets : {:.3f}'.format(np.mean(the_scores)))

In [13]:
# # ftl wants ndarray, not pd.Dataframe
the_ys = ftl.flat_to_labels(y_test.values)

In [14]:
report_f1(the_ys, ftl.flat_to_labels(mod_1_1_2_test_probas))

report_accuracy(the_ys, ftl.flat_to_labels(mod_1_1_2_test_probas))

F1 score for target Function: 0.945
F1 score for target Object_Type: 0.980
F1 score for target Operating_Status: 0.982
F1 score for target Position_Type: 0.976
F1 score for target Pre_K: 0.988
F1 score for target Reporting: 0.976
F1 score for target Sharing: 0.965
F1 score for target Student_Type: 0.971
F1 score for target Use: 0.959
Average F1 score for all targets : 0.971
Accuracy score for target Function: 0.947
Accuracy score for target Object_Type: 0.981
Accuracy score for target Operating_Status: 0.983
Accuracy score for target Position_Type: 0.977
Accuracy score for target Pre_K: 0.988
Accuracy score for target Reporting: 0.976
Accuracy score for target Sharing: 0.965
Accuracy score for target Student_Type: 0.971
Accuracy score for target Use: 0.960
Average accuracy score for all targets : 0.972


#### =============================== End of mod_1_1_2 ============================================

#### ======================= Begin clf_besties (my feature interactions) ========================================

In [21]:
clf_besties = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # the word vecs
        ('get_interact',             Pipeline([('get_best', SelectKBest(chi2, 100)),
                                               ('interact', JustInteractions())]))])),        # 4.5k new features
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

In [22]:
start = timer()
# Fit to the training data
clf_besties.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

### ran extra slow because machine was optimizing battery not performance.

fit time: 4804.77 seconds


In [23]:
# get probas
start = timer()
clf_besties_train_probas = clf_besties.predict_proba(X_train)
clf_besties_test_probas = clf_besties.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))

Predict.proba time: 37.75 seconds


In [24]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(clf_besties_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(clf_besties_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

# log loss on training set: 0.0467
# log loss on training set: 0.0573

log loss on training set: 0.0448
log loss on test set: 0.0570


#### Slight improvement over previous best local log loss

#### ===========================================================================================

#### ================== Begin clf_besties_s (my feature interactions, with scaling) =======================================

#### Add scaling

In [44]:
clf_besties_s = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # the word vecs
        ('get_interact',             Pipeline([('get_best', SelectKBest(chi2, 100)),
                                               ('interact', JustInteractions())]))])),        # 4.5k new features
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

In [45]:
start = timer()
# Fit to the training data
clf_besties_s.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

fit time: 1656.17 seconds


In [46]:
# get probas
start = timer()
clf_besties_s_train_probas = clf_besties_s.predict_proba(X_train)
clf_besties_s_test_probas = clf_besties_s.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))


Predict.proba time: 32.33 seconds


In [47]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(clf_besties_s_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(clf_besties_s_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

# log loss on training set: 0.0467
# log loss on training set: 0.0573

# scaling speeds things up, but maybe hurts prediction.

log loss on training set: 0.0482
log loss on test set: 0.0587


#### Scaling doesn't help

#### =====================================================================================================

#### ======================= Begin twoC_besties (same, but 200 features)==========================================

#### Add another 100 features

In [48]:
twoC_besties = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # the word vecs
        ('get_interact',             Pipeline([('get_best', SelectKBest(chi2, 200)),
                                               ('interact', JustInteractions())]))])),        # 20k new features
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

In [49]:
start = timer()
# Fit to the training data
twoC_besties.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

fit time: 6530.86 seconds


In [62]:
# get probas
start = timer()
twoC_besties_train_probas = twoC_besties.predict_proba(X_train)
twoC_besties_test_probas = twoC_besties.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))


Predict.proba time: 69.10 seconds


In [63]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(twoC_besties_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(twoC_besties_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

# log loss on training set: 0.0467
# log loss on training set: 0.0573

log loss on training set: 0.0427
log loss on test set: 0.0565


In [65]:
yhat2C = ftl.flat_to_labels(twoC_besties_test_probas)

In [64]:
report_f1(the_ys, ftl.flat_to_labels(twoC_besties_test_probas))
report_accuracy(the_ys, ftl.flat_to_labels(twoC_besties_test_probas))

F1 score for target Function: 0.965
F1 score for target Object_Type: 0.987
F1 score for target Operating_Status: 0.987
F1 score for target Position_Type: 0.987
F1 score for target Pre_K: 0.991
F1 score for target Reporting: 0.984
F1 score for target Sharing: 0.980
F1 score for target Student_Type: 0.981
F1 score for target Use: 0.976
Average F1 score for all targets : 0.982
Accuracy score for target Function: 0.965
Accuracy score for target Object_Type: 0.987
Accuracy score for target Operating_Status: 0.987
Accuracy score for target Position_Type: 0.987
Accuracy score for target Pre_K: 0.991
Accuracy score for target Reporting: 0.984
Accuracy score for target Sharing: 0.980
Accuracy score for target Student_Type: 0.981
Accuracy score for target Use: 0.976
Average accuracy score for all targets : 0.982


### Best so far.

In [134]:
holdout.dtypes

Object_Description         object
Program_Description        object
SubFund_Description        object
Job_Title_Description      object
Facility_or_Department     object
Sub_Object_Description     object
Location_Description       object
FTE                       float64
Function_Description       object
Position_Extra             object
Text_4                     object
Total                     float64
Text_2                     object
Text_3                     object
Fund_Description           object
Text_1                     object
dtype: object

In [133]:
holdout.head()

Unnamed: 0,Object_Description,Program_Description,SubFund_Description,Job_Title_Description,Facility_or_Department,Sub_Object_Description,Location_Description,FTE,Function_Description,Position_Extra,Text_4,Total,Text_2,Text_3,Fund_Description,Text_1
180042,Student Meals/Room/Other,Basic Educational Services,,,,Line Item that is paid with Campus' money,School,,Instruction,,,3999.91,,,General Fund,
28872,Extra Duty/Signing Bonus Pay,Undistributed,,CHEERLEADER DIR,,General,School,,Cocurricular & Extra Curricular Activities,,,3447.320213,,,General Fund,
186915,Professional Salaries,Bilingual Education,,T-EL 1ST BIL,,General,School,1.0,Instruction,,,52738.780869,,,General Fund,
412396,Professional Salaries,Bilingual Education,,T-EL 2ND BIL,,General,School,1.0,Instruction,,,69729.263191,,,General Fund,
427740,Salaries for Support Personnel,Undistributed,,CLERK III- SCH,,General,School,1.0,School Leadership,,,29492.834215,,,General Fund,


In [132]:
# Load the holdout data: holdout
### Over here the file is TestData.csv
holdout = pd.read_csv('data/TestData.csv', index_col=0)

start = timer()
# Generate predictions: predictions
twoC_besties_predictions = twoC_besties.predict_proba(holdout)
end = timer()
print('predict time: {} seconds'.format(end - start))

pred_twoC_besties = pd.DataFrame(columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns, 
                             index=holdout.index,
                             data=twoC_besties_predictions)

pred_twoC_besties.to_csv('pred_twoC_besties.csv')

  interactivity=interactivity, compiler=compiler, result=result)


predict time: 13.065332858008333 seconds


#### ================================== Score is 0.7373 ===========================================================

#### =============================== Begin three_C_besties ============================================

In [35]:
threeC_besties = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # the word vecs
        ('get_interact',             Pipeline([('get_best', SelectKBest(chi2, 300)),
                                               ('interact', JustInteractions())]))])),        # 45k new features
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

In [36]:
start = timer()
# Fit to the training data
threeC_besties.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

fit time: 12157.35 seconds


In [37]:
# get probas
start = timer()
threeC_besties_train_probas = threeC_besties.predict_proba(X_train)
threeC_besties_test_probas = threeC_besties.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))


Predict.proba time: 112.65 seconds


In [38]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(threeC_besties_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(threeC_besties_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

# log loss on training set: 0.0467
# log loss on training set: 0.0573

log loss on training set: 0.0412
log loss on test set: 0.0566


In [39]:
# # ftl wants ndarray, not pd.Dataframe
the_ys = ftl.flat_to_labels(y_test.values)

In [41]:
report_f1(the_ys, ftl.flat_to_labels(threeC_besties_test_probas))

report_accuracy(the_ys, ftl.flat_to_labels(threeC_besties_test_probas))

# F1 score for target Function: 0.965
# F1 score for target Object_Type: 0.987
# F1 score for target Operating_Status: 0.987
# F1 score for target Position_Type: 0.987
# F1 score for target Pre_K: 0.991
# F1 score for target Reporting: 0.984
# F1 score for target Sharing: 0.979
# F1 score for target Student_Type: 0.981
# F1 score for target Use: 0.975
# Average F1 score for all targets : 0.982
# Accuracy score for target Function: 0.965
# Accuracy score for target Object_Type: 0.987
# Accuracy score for target Operating_Status: 0.987
# Accuracy score for target Position_Type: 0.987
# Accuracy score for target Pre_K: 0.991
# Accuracy score for target Reporting: 0.984
# Accuracy score for target Sharing: 0.979
# Accuracy score for target Student_Type: 0.981
# Accuracy score for target Use: 0.975
# Average accuracy score for all targets : 0.982

F1 score for target Function: 0.966
F1 score for target Object_Type: 0.987
F1 score for target Operating_Status: 0.987
F1 score for target Position_Type: 0.987
F1 score for target Pre_K: 0.991
F1 score for target Reporting: 0.984
F1 score for target Sharing: 0.980
F1 score for target Student_Type: 0.981
F1 score for target Use: 0.976
Average F1 score for all targets : 0.982
Accuracy score for target Function: 0.966
Accuracy score for target Object_Type: 0.987
Accuracy score for target Operating_Status: 0.987
Accuracy score for target Position_Type: 0.987
Accuracy score for target Pre_K: 0.991
Accuracy score for target Reporting: 0.984
Accuracy score for target Sharing: 0.980
Accuracy score for target Student_Type: 0.981
Accuracy score for target Use: 0.976
Average accuracy score for all targets : 0.982


#### Not better.

***

#### Use RF to get 200 best features before making the interactions...

In [128]:
rfTwoC_besties = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # the word vecs
        ('get_interact',             Pipeline([('get_best', SelectFromModel(RandomForestClassifier(n_jobs=-1), 
                                                                            threshold = 0.00097845200)),
                                               ('interact', JustInteractions())]))])),        # 20k new features
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

#### We need to know what the threshold is to get only 200 features

In [117]:
# make a classifier
rfcp = Pipeline([('selector', get_text_data),
                ('vectorizer', CountVectorizer(ngram_range=(1,2))),
                ('rf', RandomForestClassifier(n_jobs=-1))])

In [None]:
# fit it

In [118]:
rfcp.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function combine_text_columns at 0x000001C2CEC15EA0>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('vectorizer', CountVectorizer(analyzer='word', binary=Fals..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [121]:
# get the feature importances
f_i = rfcp.steps[-1][1].feature_importances_

In [125]:
# sort the importances 
fis = np.sort(f_i)

In [126]:
# look at importances of the 200th most important feature and neighbors.
fis[-210: -200]

array([0.00093152, 0.00094927, 0.00095214, 0.00095267, 0.00095563,
       0.00096494, 0.00097227, 0.00097718, 0.00097829, 0.00097845])

#### importance of the 200th most important is  0.00097845, so that's our threshold

In [129]:
start = timer()
# Fit to the training data
rfTwoC_besties.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

fit time: 12908.84 seconds


In [130]:
# get probas
start = timer()
rfTwoC_besties_train_probas = rfTwoC_besties.predict_proba(X_train)
rfTwoC_besties_test_probas = rfTwoC_besties.predict_proba(X_test)
end = timer()
print('Predict.proba time: {:0.2f} seconds'.format(end - start))


Predict.proba time: 94.50 seconds


In [131]:
print('log loss on training set: {:0.4f}'.format(multi_multi_log_loss(rfTwoC_besties_train_probas, 
                                                                      y_train.values, BOX_PLOTS_COLUMN_INDICES)))
print('log loss on test set: {:0.4f}'.format(multi_multi_log_loss(rfTwoC_besties_test_probas, 
                                                                      y_test.values, BOX_PLOTS_COLUMN_INDICES)))

# log loss on training set: 0.0467
# log loss on training set: 0.0573

log loss on training set: 0.0405
log loss on test set: 0.0565


#### Better on training set but not on test set.  

I wonder: could I be overfit?  Best on LB is 0.35, so maybe I should dial up the bias until I'm there and try.

In [65]:
yhat2C = ftl.flat_to_labels(rfTwoC_besties_test_probas)

In [64]:
report_f1(the_ys, ftl.flat_to_labels(rfTwoC_besties_test_probas))

report_accuracy(the_ys, ftl.flat_to_labels(rfTwoC_besties_test_probas))

# F1 score for target Function: 0.965
# F1 score for target Object_Type: 0.987
# F1 score for target Operating_Status: 0.987
# F1 score for target Position_Type: 0.987
# F1 score for target Pre_K: 0.991
# F1 score for target Reporting: 0.984
# F1 score for target Sharing: 0.979
# F1 score for target Student_Type: 0.981
# F1 score for target Use: 0.975
# Average F1 score for all targets : 0.982
# Accuracy score for target Function: 0.965
# Accuracy score for target Object_Type: 0.987
# Accuracy score for target Operating_Status: 0.987
# Accuracy score for target Position_Type: 0.987
# Accuracy score for target Pre_K: 0.991
# Accuracy score for target Reporting: 0.984
# Accuracy score for target Sharing: 0.979
# Accuracy score for target Student_Type: 0.981
# Accuracy score for target Use: 0.975
# Average accuracy score for all targets : 0.982

F1 score for target Function: 0.965
F1 score for target Object_Type: 0.987
F1 score for target Operating_Status: 0.987
F1 score for target Position_Type: 0.987
F1 score for target Pre_K: 0.991
F1 score for target Reporting: 0.984
F1 score for target Sharing: 0.980
F1 score for target Student_Type: 0.981
F1 score for target Use: 0.976
Average F1 score for all targets : 0.982
Accuracy score for target Function: 0.965
Accuracy score for target Object_Type: 0.987
Accuracy score for target Operating_Status: 0.987
Accuracy score for target Position_Type: 0.987
Accuracy score for target Pre_K: 0.991
Accuracy score for target Reporting: 0.984
Accuracy score for target Sharing: 0.980
Accuracy score for target Student_Type: 0.981
Accuracy score for target Use: 0.976
Average accuracy score for all targets : 0.982


### Best local log loss so far.

#### ======================================================================================================

#### Looking at how to get operating status...

#### 9632 in test ys with this flag

In [66]:
(df.loc[y_test.index, :]['Operating_Status'] == 'Non-Operating').sum()

9632

In [105]:
dftest = df.loc[y_test.index, :]

In [70]:
type(yhat2C), yhat2C.shape

(numpy.ndarray, (80055, 9))

In [71]:
LABELS

['Function',
 'Object_Type',
 'Operating_Status',
 'Position_Type',
 'Pre_K',
 'Reporting',
 'Sharing',
 'Student_Type',
 'Use']

In [142]:
df.loc[y_test.index, :]['Operating_Status'].values[:5].astype(np.object)

array(['Non-Operating', 'PreK-12 Operating', 'PreK-12 Operating',
       'PreK-12 Operating', 'Non-Operating'], dtype=object)

In [141]:
yh2c['Operating_Status'].values[:5]

array(['Non-Operating', 'PreK-12 Operating', 'PreK-12 Operating',
       'PreK-12 Operating', 'Non-Operating'], dtype=object)

#### 79k/80k agree

In [146]:
(df.loc[y_test.index, :]['Operating_Status'].values.astype(np.object) == yh2c['Operating_Status'].values).sum()

79021

In [72]:
# how many in yhat?
(yhat2C[:, 2] == 'Non-Operating').sum()

9511

In [73]:
9511/9632

0.987437707641196

In [98]:
yh2c = pd.DataFrame(data=yhat2C, columns=LABELS)

In [99]:
yhat2C.shape

(80055, 9)

In [100]:
yh2c.shape

(80055, 9)

In [101]:
yh2c.head()

Unnamed: 0,Function,Object_Type,Operating_Status,Position_Type,Pre_K,Reporting,Sharing,Student_Type,Use
0,NO_LABEL,NO_LABEL,Non-Operating,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL
1,Substitute Compensation,Benefits,PreK-12 Operating,Substitute,NO_LABEL,School,School Reported,Unspecified,Instruction
2,Teacher Compensation,Benefits,PreK-12 Operating,Teacher,NO_LABEL,School,School Reported,Unspecified,Instruction
3,Instructional Materials & Supplies,Supplies/Materials,PreK-12 Operating,NO_LABEL,NO_LABEL,School,School Reported,NO_LABEL,Instruction
4,NO_LABEL,NO_LABEL,Non-Operating,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL


In [84]:
LABELS = ['Function',
 'Object_Type',
 'Operating_Status',
 'Position_Type',
 'Pre_K',
 'Reporting',
 'Sharing',
 'Student_Type',
 'Use']

In [88]:
no_op_status = (LABELS.copy())

In [89]:
no_op_status.remove('Operating_Status')

In [86]:
LABELS

['Function',
 'Object_Type',
 'Operating_Status',
 'Position_Type',
 'Pre_K',
 'Reporting',
 'Sharing',
 'Student_Type',
 'Use']

In [90]:
no_op_status

['Function',
 'Object_Type',
 'Position_Type',
 'Pre_K',
 'Reporting',
 'Sharing',
 'Student_Type',
 'Use']

In [106]:
((dftest[dftest['Operating_Status'] == 'Non-Operating'])[no_op_status] == 'NO_LABEL').sum()

Function         9632
Object_Type      9632
Position_Type    9632
Pre_K            9632
Reporting        9632
Sharing          9632
Student_Type     9632
Use              9632
dtype: int64

In [107]:
9462/9632

0.9823504983388704

In [103]:
((yh2c[yh2c['Operating_Status'] == 'Non-Operating'])[no_op_status] == 'NO_LABEL').sum()

Function         9469
Object_Type      9462
Position_Type    9490
Pre_K            9480
Reporting        9480
Sharing          9482
Student_Type     9476
Use              9479
dtype: int64

#### Looks like there's not a lot to be gained here...  Unless there's a feature that is exactly correlated with Operating_Status.

In [110]:
df[df['Operating_Status'] == 'Non-Operating'].head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
304569,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,EQUIPMENT *,,Support Services - Administration,,,,Equipment *,,,State and Federal Projects Coordination Services*,,,-5509.32,,School to Work,ITEMAI-CITY-YTH EMP & OPP
64760,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,Regular *,,Special Instruction,,,,Certificated Employees Salaries And Wages,,,Disadvantaged Youth *,,,-122544.07,,Title I - Disadvantaged Children/Targeted Assi...,TITLE I
43727,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,Non-Certificated Travel Reimbursement,,Support Services - Instructional Staff,,,,Travel Mileage/Meeting Expense *,,,Instruction And Curriculum Development Services *,,,-446.11,,Special Trust,RESP SEXUAL BEHAVIOR
5614,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,Purchased Services,,Support Services--Pupils,,,,Purchased Services,,,Direction Of Support Services - Pupils *,,,550.31,,Special Trust,HOME TEAM MARKETING


In [111]:
df.iloc[304569, :]

Function                  Parent & Community Relations
Use                        Pupil Services & Enrichment
Sharing                        Leadership & Management
Reporting                                   Non-School
Student_Type                               Unspecified
Position_Type                                    Other
Object_Type                                   Benefits
Pre_K                                         NO_LABEL
Operating_Status                     PreK-12 Operating
Object_Description                 RETIREMENT CONTRIB.
Text_2                                             NaN
SubFund_Description                       GENERAL FUND
Job_Title_Description                Americorp Hourly 
Text_3                                             NaN
Text_4                                             NaN
Sub_Object_Description                             NaN
Location_Description      DISTRICT COMMUNITY RELATIONS
FTE                                                NaN
Function_D

#### =====================================================================================================