In [28]:
%load_ext autoreload
import pipeline as p
import loops as l
import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [29]:
# Load the raw data through the project's pipeline module.
# NOTE(review): the saved output of this cell is
#   AttributeError: module 'pipeline' has no attribute 'load_data'
# and its execution count (29) is higher than the cells below (13-25), so the
# results further down were produced from a `df` created by an earlier run.
# Confirm the loader's current name in pipeline.py, then Restart & Run All.
df = p.load_data()

AttributeError: module 'pipeline' has no attribute 'load_data'

# Create binary outcome label

In [13]:
# Binary target: 1 where a row's eviction rate is at or above the 90th
# percentile of the whole frame, 0 otherwise.
eviction_rate = df['eviction-rate']
top_decile_cutoff = eviction_rate.quantile(.9)
df.loc[:, 'top_10_eviction_rate'] = np.where(eviction_rate >= top_decile_cutoff, 1, 0)

# Generate train and test sets

In [14]:
# Delegate the split to the pipeline helper, then unpack its four results.
# NOTE(review): the split strategy (random vs. temporal, seed, test size) lives
# in pipeline.get_train_test_splits and is not visible here — confirm it there.
train_test_splits = p.get_train_test_splits(df)
x_train, y_train, x_test, y_test = train_test_splits

# Clean data

In [15]:
# Apply the pipeline's null-filling helper to the feature columns of both splits.
# The return value is ignored, so the helper presumably edits each frame in
# place — confirm in pipeline.py.
# The loop variable is deliberately NOT named `df`: reusing that name would
# rebind the notebook-level `df` to x_test, so re-running the labelling or
# splitting cells afterwards would silently operate on the wrong frame.
for split_df in (x_train, x_test):
    p.fill_continuous_null(split_df, p.FEATURES)

# Create features

In [16]:
# Discretize every feature column of the training split, one column per call.
# 50 is passed straight through to p.discretize (presumably the bin count —
# confirm in pipeline.py).
for feature in p.FEATURES:
    x_train = p.discretize(x_train, feature, 50)

In [17]:
# Discretize every feature column of the test split with the same arguments
# used for the training split.
# NOTE(review): the two splits are discretized independently; if p.discretize
# derives bin edges from the frame it receives, train and test bins may not
# line up — verify in pipeline.py.
for feature in p.FEATURES:
    x_test = p.discretize(x_test, feature, 50)

In [18]:
# Tag each split with a marker column so the rows can be told apart again
# after the two frames are stacked.
for split_frame, split_tag in ((x_train, 'train'), (x_test, 'score')):
    split_frame['label'] = split_tag

# One-hot encode the stacked frame in a single pass so both splits receive
# the same set of dummy columns.
concat_df = pd.concat([x_train, x_test])
features_df = pd.get_dummies(concat_df, columns=p.FEATURES)

In [19]:
# Recover the two splits from the encoded frame using the marker column,
# dropping the marker so it is not used as a model feature.
is_train_row = features_df['label'] == 'train'
is_score_row = features_df['label'] == 'score'

x_train = features_df[is_train_row].drop('label', axis=1)
x_test = features_df[is_score_row].drop('label', axis=1)

# Define models and parameters

In [20]:
# Ask the loops helper for its classifiers and matching parameter grid.
# 'test' selects which grid to return (presumably a small one for quick
# runs — confirm in loops.py).
grid_size = 'test'
clfs, grid = l.define_clfs_params(grid_size)

In [21]:
# Short codes of the model families to evaluate; the list is handed to
# l.clf_loop together with `clfs` and `grid`.
models_to_run = [
    'RF',
    'DT',
    'GB',
    'LR',
    'KNN',
]

# Loop for all models

In [22]:
# Run every requested model over its parameter grid and collect the results.
# The recorded run printed each model code in turn; the result is displayed
# below as a table with one row per fitted configuration.
output = l.clf_loop(
    models_to_run,
    clfs,
    grid,
    x_train,
    x_test,
    y_train,
    y_test,
)

RF
DT
GB
LR
KNN


# Sort according to best model

In [23]:
# Rank the fitted configurations from highest to lowest AUC-ROC and display them.
output.sort_values(by='auc-roc', ascending=False)

Unnamed: 0,model_type,clf,parameters,baseline,auc-roc,f1_at_5,f1_at_10,a_at_5,a_at_10,p_at_1,...,p_at_20,p_at_30,p_at_50,r_at_1,r_at_2,r_at_5,r_at_10,r_at_20,r_at_30,r_at_50
22,LR,"LogisticRegression(C=0.1, class_weight=None, d...","{'C': 0.01, 'penalty': 'l2'}",top_10_eviction_rate 0.082285 dtype: float64,0.955736,0.571429,0.691643,0.943396,0.94392,1.0,...,0.377953,0.262238,0.162474,0.121019,0.235669,0.458599,0.764331,0.917197,0.955414,0.987261
24,LR,"LogisticRegression(C=0.1, class_weight=None, d...","{'C': 0.1, 'penalty': 'l2'}",top_10_eviction_rate 0.082285 dtype: float64,0.951716,0.595238,0.70317,0.946541,0.946017,1.0,...,0.377953,0.262238,0.160377,0.121019,0.235669,0.477707,0.77707,0.917197,0.955414,0.974522
5,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'n_estimators': 10, 'max_depth': 5, 'min_samp...",top_10_eviction_rate 0.082285 dtype: float64,0.943188,0.436508,0.582133,0.925577,0.924004,0.894737,...,0.370079,0.267483,0.163522,0.10828,0.165605,0.350318,0.643312,0.898089,0.974522,0.993631
18,GB,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.1, 'n_estimators': 100, 'm...",top_10_eviction_rate 0.082285 dtype: float64,0.923958,0.507937,0.605187,0.93501,0.928197,0.894737,...,0.372703,0.248252,0.16457,0.10828,0.203822,0.407643,0.66879,0.904459,0.904459,1.0
4,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'n_estimators': 5, 'max_depth': 5, 'min_sampl...",top_10_eviction_rate 0.082285 dtype: float64,0.913825,0.34127,0.380403,0.912998,0.887317,0.684211,...,0.354331,0.236014,0.160377,0.082803,0.171975,0.273885,0.420382,0.859873,0.859873,0.974522
19,GB,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.1, 'n_estimators': 50, 'ma...",top_10_eviction_rate 0.082285 dtype: float64,0.899117,0.420635,0.466859,0.92348,0.90304,0.789474,...,0.351706,0.234266,0.16457,0.095541,0.152866,0.33758,0.515924,0.853503,0.853503,1.0
12,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'entropy', 'max_depth': 5, 'min_...",top_10_eviction_rate 0.082285 dtype: float64,0.895083,0.396825,0.466859,0.920335,0.90304,0.894737,...,0.346457,0.230769,0.16457,0.10828,0.184713,0.318471,0.515924,0.840764,0.840764,1.0
7,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'gini', 'max_depth': 5, 'min_sam...",top_10_eviction_rate 0.082285 dtype: float64,0.895083,0.396825,0.466859,0.920335,0.90304,0.894737,...,0.346457,0.230769,0.16457,0.10828,0.184713,0.318471,0.515924,0.840764,0.840764,1.0
13,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'entropy', 'max_depth': 5, 'min_...",top_10_eviction_rate 0.082285 dtype: float64,0.894757,0.396825,0.466859,0.920335,0.90304,0.842105,...,0.346457,0.230769,0.16457,0.101911,0.184713,0.318471,0.515924,0.840764,0.840764,1.0
14,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'entropy', 'max_depth': 5, 'min_...",top_10_eviction_rate 0.082285 dtype: float64,0.894733,0.396825,0.466859,0.920335,0.90304,0.842105,...,0.346457,0.230769,0.16457,0.101911,0.178344,0.318471,0.515924,0.840764,0.840764,1.0


In [25]:
# Show every metric for the configuration with the highest AUC-ROC.
# The row label is looked up instead of hard-coded: in the recorded run it
# was 22, but that label changes whenever the grid, the data split or the
# model list changes.
best_row_label = output['auc-roc'].idxmax()
output.loc[best_row_label, :]

model_type                                                   LR
clf           LogisticRegression(C=0.1, class_weight=None, d...
parameters                         {'C': 0.01, 'penalty': 'l2'}
baseline        top_10_eviction_rate    0.082285
dtype: float64
auc-roc                                                0.955736
f1_at_5                                                0.571429
f1_at_10                                               0.691643
a_at_5                                                 0.943396
a_at_10                                                 0.94392
p_at_1                                                        1
p_at_2                                                 0.973684
p_at_5                                                 0.757895
p_at_10                                                0.631579
p_at_20                                                0.377953
p_at_30                                                0.262238
p_at_50                                 