### Develop a transformer that will compute interactions of best features and combine with all the input features (not just the k best).

#### Just pull in a little bit of data and vectorize it, so I can see what it is.

In [2]:
#### Imports/setup

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', 60)

from timeit import default_timer as timer

# for the pipeline
from sklearn.pipeline import Pipeline
# for the selectors
from sklearn.preprocessing import FunctionTransformer, StandardScaler
# for gluing preprocessed text and numbers together
from sklearn.pipeline import FeatureUnion
# for nans in the numeric data
from sklearn.preprocessing import Imputer

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

# metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report

# unflattener
import python.flat_to_labels as ftl

#### Set up a train-test split making sure we have all labels in both splits
from python.multilabel import multilabel_train_test_split

from python.dd_mmll import multi_multi_log_loss, BOX_PLOTS_COLUMN_INDICES

In [26]:
from sklearn.feature_selection import SelectKBest, chi2

#### My transformers

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin

class IdentityTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, input_array, y=None):
        return self
    
    def transform(self, input_array, y=None):
        return input_array*1

In [45]:
from itertools import combinations
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class JustInteractions(BaseEstimator, TransformerMixin):
    def __init__(self, degree=2, feature_name_separator="_"):
        self.degree = degree
        self.feature_name_separator = feature_name_separator

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not sparse.isspmatrix_csc(X):
            X = sparse.csc_matrix(X)

        if hasattr(X, "columns"):
            self.orig_col_names = X.columns
        else:
            self.orig_col_names = np.array([str(i) for i in range(X.shape[1])])

        spi = self._create_sparse_interactions(X)
        return spi

    def get_feature_names(self):
        return self.feature_names

    def _create_sparse_interactions(self, X):
        out_mat = []
        self.feature_names = self.orig_col_names.tolist()
        ### degree is always 2 so don't worry about it now.  happens once.
        for sub_degree in range(2, self.degree + 1):
            for col_ixs in combinations(range(X.shape[1]), sub_degree):
                # add name for new column
                name = self.feature_name_separator.join(self.orig_col_names[list(col_ixs)])
                self.feature_names.append(name)

                # get column multiplications value
                out = X[:, col_ixs[0]]
                for j in col_ixs[1:]:
                    out = out.multiply(X[:, j])
            
                out_mat.append(out)
        # theory: out_mat is a list of columns; I need to shstack it      
        # print(out_mat)
        # return sparse.hstack([X] + out_mat)
        # return sparse.csc_matrix(np.array(out_mat))
        return sparse.hstack(out_mat)

#### Load the data

In [3]:
# Get data
the_data = pd.read_csv('data/TrainingData.csv', index_col=0)

# take a look
the_data.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


####  Encode the targets as categorical variables

In [5]:
### bind variable LABELS - these are actually the targets and we're going to one-hot encode them...
LABELS = ['Function',  'Use',  'Sharing',  'Reporting',  'Student_Type',  'Position_Type', 
          'Object_Type',  'Pre_K',  'Operating_Status']

### This turns out to be key.  Submission requires the dummy versions of these vars to be in this order.
LABELS.sort()

# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
the_data[LABELS] = the_data[LABELS].apply(categorize_label, axis=0)

# Print the converted dtypes
print(the_data[LABELS].dtypes)

Function            category
Object_Type         category
Operating_Status    category
Position_Type       category
Pre_K               category
Reporting           category
Sharing             category
Student_Type        category
Use                 category
dtype: object


#### Save the unique labels for each output (category)

In [6]:
# build a dictionary
the_labels = {col : the_data[col].unique().tolist() for col in the_data[LABELS].columns}
# take a look at one entry
the_labels['Use']

['Instruction',
 'NO_LABEL',
 'O&M',
 'Pupil Services & Enrichment',
 'ISPD',
 'Leadership',
 'Business Services',
 'Untracked Budget Set-Aside']

#### Change fraction to suit.
Note: small fractions will have a hard time ensuring labels in both splits.

In [7]:
# downsize it or not
df = the_data.sample(frac=0.01)
# df = the_data

#### Get targets as set of one-hot encoded columns

In [8]:
# name these columns
NUMERIC_COLUMNS = ['FTE', 'Total']

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

#### Don't set up a train-test split 

In [9]:
# define combine_text_columns()
def combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text columns in each row of df to single string """
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(df.columns.tolist())
    text_data = df.drop(to_drop, axis=1)  
    # Replace nans with blanks
    text_data.fillna('', inplace=True)    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [12]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the features in the data
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
# X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
#                                                                dummy_labels,
#                                                                0.2, 
#                                                                seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Use all 0s instead of noise: get_numeric_data
get_numeric_data_hack = FunctionTransformer(lambda x: np.zeros(x[NUMERIC_COLUMNS].shape, dtype=np.float), validate=False)

#### Actually, let's use CountVect to do this because it'll be maybe easier to look at.

Have to keep in mind here that input to SparseInteractions is output from SelectKBest not a vectorizer.

In [21]:
# make a pipeline to grab and vectorize
v_pipe = Pipeline([('get',      get_text_data),
                   ('wordvecs', CountVectorizer(ngram_range=(1,2)))])

In [22]:
the_vecs = v_pipe.fit_transform(df)

In [23]:
type(the_vecs)

scipy.sparse.csr.csr_matrix

In [24]:
### it's that wide because of the bigrams.
the_vecs 

<4003x8327 sparse matrix of type '<class 'numpy.int64'>'
	with 132779 stored elements in Compressed Sparse Row format>

In [17]:
### 33 tokens per row
132778/4003

33.16962278291282

#### So what comes out of vectorizer is CSR format.  Let's look at what comes out of SelectKBest...

In [27]:
sel_pipe = Pipeline([('get',      get_text_data),
                     ('wordvecs', CountVectorizer(ngram_range=(1,2))),
                     ('sel100', SelectKBest(chi2, 24))])

In [30]:
my_k_select=SelectKBest(chi2, 24)

In [35]:
da_ys =  pd.get_dummies(df[LABELS])

In [34]:
my_k_select.fit_transform(the_vecs, pd.get_dummies(df[LABELS]))

<4003x24 sparse matrix of type '<class 'numpy.int64'>'
	with 255 stored elements in Compressed Sparse Row format>

In [29]:
sel_pipe.steps

[('get', FunctionTransformer(accept_sparse=False,
            func=<function combine_text_columns at 0x000001E49EB75E18>,
            inv_kw_args=None, inverse_func=None, kw_args=None,
            pass_y='deprecated', validate=False)),
 ('wordvecs',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 2), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('sel100',
  SelectKBest(k=24, score_func=<function chi2 at 0x000001E49D24A510>))]

In [36]:
the_24_best = sel_pipe.fit_transform(df, da_ys)

In [37]:
the_24_best

<4003x24 sparse matrix of type '<class 'numpy.int64'>'
	with 255 stored elements in Compressed Sparse Row format>

In [62]:
len(list(combinations(range(10), 2)))

45

#### Now build my pipeline

In [49]:
blah =  Pipeline([('get_best', SelectKBest(chi2, 10)),
                  ('interact', JustInteractions())])

In [59]:
add_besties = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # should be the word vecs
        ('get_interact',             Pipeline([('get_best', SelectKBest(chi2, 10)),
                                               ('interact', JustInteractions())]))]))        # should be combos of 10 (45)
    ])

In [60]:
my_features = add_besties.fit_transform(df, da_ys)

In [61]:
my_features

<4003x8372 sparse matrix of type '<class 'numpy.int64'>'
	with 132797 stored elements in Compressed Sparse Row format>

#### That might have worked.  How many features do I get from CV?  8327.  How many 2 combos from 10?  45.

In [63]:
# 4003x8327 
8327 + 45

8372

### Yes!  It's at least the right shape...

In [None]:
# #### Build the pipeline
# mod_1_1_2 = Pipeline([
#         ('union', FeatureUnion(
#             transformer_list = [
#                 ('numeric_features', get_numeric_data_hack),
#                 ('text_features', Pipeline([('selector', get_text_data),
#                                             ('vectorizer', HashingVectorizer(ngram_range=(1,2)))]))
#              ])),
#         ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
#     ])

In [10]:
# start = timer()
# # Fit to the training data
# mod_1_1_2.fit(X_train, y_train)
# end = timer()
# print('fit time: {:0.2f} seconds'.format(end - start))

# # ~500sec

#### Okay, I have my JustInteractions transformer ready.

In [64]:
clf_besties = Pipeline([
    ('text_features', Pipeline([('selector', get_text_data),
                                ('vectorizer', CountVectorizer(ngram_range=(1,2)))])),
    ('union', FeatureUnion(transformer_list = [
        ('passthrough',              IdentityTransformer()),                                 # should be the word vecs
        ('get_interact',             Pipeline([('get_best', SelectKBest(chi2, 100)),
                                               ('interact', JustInteractions())]))])),        # should be combos of 10 (45)
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
    ])

In [65]:
start = timer()
# Fit to the training data
clf_besties.fit(X_train, y_train)
end = timer()
print('fit time: {:0.2f} seconds'.format(end - start))

# ~500sec

fit time: 13.03 seconds


In [67]:
clf_besties.score(df, da_ys)

0.9485385960529603

In [68]:
yhats = clf_besties.predict(df)

In [72]:
type(yhats), type(da_ys)

(numpy.ndarray, pandas.core.frame.DataFrame)

In [73]:
multi_multi_log_loss(yhats, da_ys.values, BOX_PLOTS_COLUMN_INDICES)

0.3876768939954642