In [2]:
# from __future__ import division
from __future__ import print_function
%matplotlib inline

# ignore deprecation warnings in sklearn
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys

# Make the project's ../src package importable from this notebook.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data.multilabel import multilabel_sample_dataframe, multilabel_train_test_split
from features.SparseInteractions import SparseInteractions
from models.metrics import multi_multi_log_loss

# Load Data
First, we'll load the entire training data set available from DrivenData. In order to make this notebook run, you will need to:

- Sign up for an account on DrivenData
- Join the Box-plots for education competition
- Download the competition data to the data folder in this repository. Files should be named TrainingData.csv and TestData.csv.
- Enjoy!

In [8]:
# Both competition CSVs are expected in ../data relative to this notebook.
path_to_training_data, path_to_test_data = (
    os.path.join(os.pardir, 'data', csv_name)
    for csv_name in ('TrainingData.csv', 'TestData.csv')
)

'../data/TrainingData.csv'

In [23]:
# Load the full training set, using the first CSV column as the row index.
df = pd.read_csv(path_to_training_data, index_col=0)

In [16]:
# Summarise the frame: dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400277 entries, 134338 to 415831
Data columns (total 25 columns):
Function                  400277 non-null object
Use                       400277 non-null object
Sharing                   400277 non-null object
Reporting                 400277 non-null object
Student_Type              400277 non-null object
Position_Type             400277 non-null object
Object_Type               400277 non-null object
Pre_K                     400277 non-null object
Operating_Status          400277 non-null object
Object_Description        375493 non-null object
Text_2                    88217 non-null object
SubFund_Description       306855 non-null object
Job_Title_Description     292743 non-null object
Text_3                    109152 non-null object
Text_4                    53746 non-null object
Sub_Object_Description    91603 non-null object
Location_Description      162054 non-null object
FTE                       126071 non-null float64
Func

In [15]:
# NOTE(review): `dt` is not defined by any other cell shown in this notebook, so
# this cell fails on a fresh kernel. The recorded output (16) equals the number
# of non-label columns, so `dt` is presumably the test set — TODO confirm.
dt = pd.read_csv(path_to_test_data, index_col=0)
len(dt.columns)

16

In [103]:
# (rows, columns) of the full training frame.
df.shape

(400277, 25)

For computational efficiency, we will sample down from 400,277 rows to 40,000 rows (`SAMPLE_SIZE`) so that our analysis is quick and easy to run. We will also create dummy variables for our labels and split our sample dataset into a training set and a test set.

In [19]:
# Target columns to predict; every other column of df is treated as a feature.
LABELS = [
    'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
    'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
]
NON_LABELS = [col for col in df.columns if col not in LABELS]

# Number of rows to keep from the full training frame.
SAMPLE_SIZE = 40000

# Down-sample df using the one-hot encoded labels as the stratification input.
# NOTE(review): min_count presumably guarantees a minimum number of examples
# per label value — confirm against src/data/multilabel.py.
sampling = multilabel_sample_dataframe(
    df,
    pd.get_dummies(df[LABELS]),
    size=SAMPLE_SIZE,
    min_count=25,
    seed=43,
)

# One-hot encode the labels of the sampled rows only.
dummy_labels = pd.get_dummies(sampling[LABELS])

# Split the sample into train and test parts (0.2 is passed as the split size).
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    sampling[NON_LABELS],
    dummy_labels,
    0.2,
    min_count=3,
    seed=43,
)

# Create preprocessing tools
We need tools to preprocess our text and numeric data. We'll create those tools here. The combine_text_columns function will take a DataFrame of text columns and return a single series where all of the text in the columns has been joined together.

We'll then create FunctionTransformer objects that select our text and numeric data from the dataframe.

Finally, we create a custom scoring method that uses the multi_multi_log_loss function that is the evaluation metric for the competition.

In [62]:
# Columns holding numeric (non-text) features.
NUMERIC_COLUMNS = ['FTE', 'Total']

def combine_text_columns(data_frame, to_drop = NUMERIC_COLUMNS + LABELS):
    """ Takes the dataset as read in, drops the non-feature, non-text columns and
        then combines all of the text columns into a single vector that has all of
        the text for a row.

        Names in ``to_drop`` that are not columns of ``data_frame`` are ignored,
        missing values become empty strings, and the remaining cells of each
        row are joined with a single space. ``data_frame`` itself is not modified.

        :param data_frame: The data as read in with read_csv (no preprocessing necessary)
        :param to_drop (optional): Removes the numeric and label columns by default.
        :return: A Series indexed like ``data_frame`` with one combined string per row.
    """
    # drop only those non-text columns that are actually in the df
    unwanted = set(to_drop)
    present = [col for col in data_frame.columns if col in unwanted]
    text_data = data_frame.drop(present, axis=1)

    # replace NaN with blanks, then cast to str so that a stray non-string cell
    # (e.g. a number left in an otherwise textual column) cannot make join() raise
    text_data = text_data.fillna("").astype(str)

    # joins all of the text items in a row (axis=1) with a space in between
    return text_data.apply(" ".join, axis=1)

In [None]:
# Combine the text columns of the full training frame into one string per row
# (to_drop is the same list the function already uses as its default).
combine_text_columns(df, to_drop=NUMERIC_COLUMNS + LABELS)

In [89]:
from sklearn.preprocessing import FunctionTransformer


def select_numeric_columns(data_frame):
    """Return only the NUMERIC_COLUMNS of ``data_frame``."""
    return data_frame[NUMERIC_COLUMNS]


# validate=False skips scikit-learn's input check so each function receives
# the DataFrame as-is.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
# A named function (instead of a lambda) keeps this transformer, and any
# pipeline containing it, picklable.
get_numeric_data = FunctionTransformer(select_numeric_columns, validate=False)

In [91]:
# Only the last expression of a cell is echoed, so display both results
# explicitly to check the text and the numeric selector on a few rows.
display(get_text_data.fit_transform(sampling.head(5)))
display(get_numeric_data.fit_transform(sampling.head(5)))

38     OTHER PURCHASED SERVICES  SCHOOL-WIDE SCHOOL P...
70     Extra Duty Pay/Overtime For Support Personnel ...
198    Supplemental *  Operation and Maintenance of P...
209    REPAIR AND MAINTENANCE SERVICES  PUPIL TRANSPO...
614     GENERAL EDUCATION LOCAL EDUCATIONAL AIDE,70 H...
dtype: object

In [88]:
# make_scorer is part of the public sklearn.metrics API; the
# sklearn.metrics.scorer module path is not available in newer releases.
from sklearn.metrics import make_scorer

# multi_multi_log_loss is a log loss (lower is better), so ask scikit-learn to
# sign-flip it; otherwise model selection would favour the highest loss.
log_loss_scorer = make_scorer(multi_multi_log_loss, greater_is_better=False)

# Backward-compatible alias for the original (misspelled) name.
log_loss_scorere = log_loss_scorer

# TRAIN MODEL PIPELINE
Now we'll train the final pipeline from the course that takes text and numeric data, does the necessary preprocessing, and trains the classifier.

In [93]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MaxAbsScaler

# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer is its replacement (both default to mean imputation).
# The name `Imputer` is kept so the rest of the notebook is unchanged.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Tokens are runs of ASCII letters/digits that are followed by whitespace; because
# of the lookahead, a token at the very end of a string is not matched.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

In [97]:
%%time

# set a reasonable number of features before adding interactions
chi_k = 300

#create the pipeline object
pl = Pipeline([
        ("union", FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('Selector', get_numeric_data),
                    ('imputer', Imputer())
                ])), 
                ('text_features', Pipeline([
                    ('selector',get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                    non_negative=True, norm=None, binary=False,
                                                    ngram_range=(1,2))),
                    ('dim_red',SelectKBest(chi2, chi_k))
                ]))
            ]))

        ])
               
               



TypeError: __init__() got an unexpected keyword argument 'non_negative'