In [57]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor
pd.options.mode.chained_assignment = None  # default='warn'


# Convenience helper for addressing the per-grade columns (A6, A7, ..., A12 and so on).
def column_list(letter, start, end):
    """Return the column names ``<letter><n>`` for every n in ``range(start, end)``.

    ``end`` is exclusive, so ``column_list("A", 6, 9)`` gives ``["A6", "A7", "A8"]``.
    """
    names = []
    for number in range(start, end):
        names.append("{}{:d}".format(letter, number))
    return names


def remove_outliers(column, target, first, second):
    """Drop, in place, the rows of ``column`` whose ``target`` value is an outlier.

    A row is kept when its ``target`` value lies inside the closed interval
    ``[target.quantile(first), target.quantile(second)]``; every other row
    (including rows whose target is missing) is removed.

    Parameters
    ----------
    column : pandas.DataFrame or pandas.Series
        Object to prune. It is modified in place and nothing is returned.
    target : pandas.Series
        Values used to decide which rows are outliers. Its index labels
        must also be labels of ``column``.
    first, second : float
        Lower and upper quantile (each between 0 and 1) of the kept range.
    """
    lower_bound = target.quantile(first)
    upper_bound = target.quantile(second)
    non_outliers = target.between(lower_bound, upper_bound)

    # Drop by the real index labels instead of counting 0..len-1, so frames
    # that were filtered or re-indexed earlier are handled correctly.
    outlier_labels = non_outliers[~non_outliers].index
    column.drop(outlier_labels, inplace=True)


# convert strings to int type even if it's a float
# TODO: replace placeholders by the median or mean instead of a constant?
def convert_stat(x, new_value=0):
    """Convert one raw absence/tardy cell to an integer count.

    Parameters
    ----------
    x : int, float, str or None
        Raw cell value. Strings may hold an integer (``"7"``), a decimal
        (``"3.0"``) or the placeholder ``"TRANSFER"``.
    new_value : object, default 0
        Value returned for cells without a usable count: the ``"TRANSFER"``
        placeholder and missing values (``None`` / NaN).

    Returns
    -------
    int or type of ``new_value``
        Integers are returned as-is, decimals are truncated toward zero.

    Raises
    ------
    ValueError
        If a string is neither numeric nor the ``"TRANSFER"`` placeholder.
    """
    if isinstance(x, int):
        return x

    # NumPy integers are not ``int`` instances; normalise them to plain ints.
    if isinstance(x, np.integer):
        return int(x)

    if isinstance(x, str):
        text = x.strip()
        if text == "TRANSFER":
            return new_value
        # int("3.0") would raise, so decimal strings go through float first.
        return int(float(text)) if '.' in text else int(text)

    # Empty CSV cells arrive as NaN/None; int(float(nan)) would raise, so
    # treat them like the placeholder.
    if pd.isna(x):
        return new_value

    return int(float(x))


class SumTransformer(BaseEstimator):
    """Pipeline step that converts the raw absence columns A6-A8 to integers.

    Each cell of those columns is passed through ``convert_stat``, which
    replaces the ``"TRANSFER"`` placeholder with ``new_value``.

    Parameters
    ----------
    new_value : object, default 0
        Replacement for placeholder cells.
    bins : int, default 1
        Stored for a bin-width experiment; ``transform`` does not use it at
        the moment (the binning line is disabled).
    """

    # set new_value to None if Pipeline contains SimpleImputer
    # this is for absences and tardies since some students are
    # transfer students. The placeholder in the CSV is the string "TRANSFER"
    def __init__(self, new_value=0, bins=1):
        self.new_value = new_value
        self.bins = bins

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the absence columns converted to ints."""
        # Work on a copy so the caller's frame is never modified.
        df = df.copy()

        # change to for i in ["A", "T"] to include tardies if need be
        for i in ["A"]:

            # corrects the values in the data frame that will be used in the training models
            for j in column_list(i, 6, 9):
                # Pass new_value by keyword: the second positional parameter
                # of Series.apply belongs to pandas, not to convert_stat.
                df[j] = df[j].apply(convert_stat, new_value=self.new_value)
                # Disabled binning experiment:
                # df[j] = np.array(np.floor(np.array(df[j]) / float(self.bins)))

        return df


With the pre-processing helpers defined, I set up the data. Some clean-up is necessary
*before* the actual pre-processing pipeline runs: some numerical columns contain strings, so they
must be converted first. Outliers are also identified and removed before the split (or pipeline).

In [65]:
# Raw student records; the model inputs are the absence counts A6-A8.
student_data = pd.read_csv("../data/High School East Student Data - Sheet1.csv")
features = ["A6", "A7", "A8"]

# Pipeline doesn't allow transformations on the target label,
# so the high-school absence columns (A9-A12) are cleaned and summed
# here, outside of the Pipeline, to build the target for each student.
hs_absence_columns = column_list("A", 9, 13)
student_data["AbsencesSum_HS"] = 0
for column_name in hs_absence_columns:
    student_data[column_name] = student_data[column_name].apply(convert_stat)

student_data["AbsencesSum_HS"] = student_data[hs_absence_columns].sum(axis=1)

# With the total high-school absences available, rows above the
# 95th percentile of that total are dropped from the dataset (in place).
remove_outliers(student_data, student_data["AbsencesSum_HS"], first=0, second=0.95)

# One-hot encode the categorical columns and pass every other column through.
pre_process = ColumnTransformer(
    transformers=[('categories', OneHotEncoder(), ["Gender", "IEP/Specialized"])],
    remainder='passthrough',
)

def run_test(model, data, target, bins=1, model_name="model"):
    """Cross-validate ``model`` behind the number-fixing step.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline.
    data : pandas.DataFrame
        Frame holding the global ``features`` columns and the ``target`` column.
    target : str
        Name of the column to predict.
    bins : int, default 1
        Forwarded to ``SumTransformer``.
    model_name : str, default "model"
        Name given to the model step inside the pipeline.

    Returns
    -------
    numpy.ndarray
        Mean absolute error of each of the 5 cross-validation folds.
    """
    steps = [
        ('number_fix', SumTransformer(bins=bins)),
        # ('pre_process', pre_process),  # one-hot step is currently disabled
        (model_name, model),
    ]
    pipeline = Pipeline(steps=steps)

    # The scorer reports negated errors, so flip the sign back.
    negated_errors = cross_val_score(
        pipeline,
        data[features],
        data[target],
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    return -1 * negated_errors

### Sorta like unit testing but in jupyter
# Fold errors recorded on an earlier run; any drift means that the
# pre-processing changed behaviour.
prior_run = np.array([23.43076923, 16.23076923, 15.76923077, 17.93846154, 19.58333333])
test = run_test(DecisionTreeRegressor(random_state=1), student_data, "AbsencesSum_HS")

matches_prior_run = np.allclose(test, prior_run, atol=0.001)
if not matches_prior_run:
    raise Exception("Unintended modification to pre-processing!!! Fix it!!")

[23.43076923 16.23076923 15.76923077 17.93846154 19.58333333]


# 2. Feature Engineering Tests
## 2.1 Binning the Data
https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b

In [60]:
import copy as cp

# Work on an independent copy so the binned target never touches student_data.
data_copy = cp.deepcopy(student_data)

# comparing fixed width binning and dynamic width binning
# Fixed width: every bin spans the same number of absences.
fixed_bin_width = 5
hs_absences = data_copy["AbsencesSum_HS"].to_numpy()
data_copy["HS_AB_FIXED"] = np.floor(hs_absences / float(fixed_bin_width))

run_test(DecisionTreeRegressor(random_state=1), data_copy, "HS_AB_FIXED", bins=fixed_bin_width)

array([4.05128205, 3.87179487, 3.15384615, 3.71794872, 5.08333333])

## 2.2 Decision Tree Regressor

The first model I will test is the Decision Tree Regressor.

In [61]:
# Decision-tree estimator used by the pipeline cell further down; random_state
# is fixed at 1.
tree = DecisionTreeRegressor(random_state=1)

# NOTE(review): the triple-quoted string below is disabled experiment code, not
# a docstring. As the last expression of the cell it is still evaluated and
# echoed as the cell output. It cannot simply be re-enabled as written:
#   * `model_pipeline` is only created in a later cell;
#   * `run_test(tree, "Decision Tree")` does not match the signature
#     run_test(model, data, target, ...), and run_test returns fold scores,
#     not an estimator that cross_val_score could fit.
"""
model_pipeline.fit(student_data[features], student_data["AbsencesSum_HS"])

importances = model_pipeline.named_steps['Decision Tree'].feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(7):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

scores = -1 * cross_val_score(run_test(tree, "Decision Tree"),
                              student_data[features],
                              student_data["AbsencesSum_HS"],
                              cv=5,
                              scoring='neg_mean_absolute_error')

#print(np.mean(scores))
"""

'\nmodel_pipeline.fit(student_data[features], student_data["AbsencesSum_HS"])\n\nimportances = model_pipeline.named_steps[\'Decision Tree\'].feature_importances_\nindices = np.argsort(importances)[::-1]\n\n# Print the feature ranking\nprint("Feature ranking:")\n\nfor f in range(7):\n    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))\n\nscores = -1 * cross_val_score(run_test(tree, "Decision Tree"),\n                              student_data[features],\n                              student_data["AbsencesSum_HS"],\n                              cv=5,\n                              scoring=\'neg_mean_absolute_error\')\n\n#print(np.mean(scores))\n'

Testing the optimal number of features.

In [62]:
# Full pipeline: fix the absence columns, one-hot encode the categorical
# columns, then fit the decision tree.
# NOTE(review): pre_process selects "Gender" and "IEP/Specialized", so the frame
# passed to this pipeline must contain those columns; `features` alone
# (A6-A8) does not.
pipeline_steps = [
    ('number_fix', SumTransformer()),
    ('pre_process', pre_process),
    ('Decision Tree', tree),
]
model_pipeline = Pipeline(steps=pipeline_steps)


print("hell")
%matplotlib inline

# TODO: try RFECV (recursive feature elimination with cross-validation) to determine the optimal number of features?

hell
