In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [2]:
def eval_metrics(y_true: list, y_pred: list) -> dict:
    """
    Function to calculate the classification metrics
    
        Args:
            y_test: list, test subset
            y_predicitons: list prediciton on x_test
    
        Returs:

            dict with the keys:
                confusion_matrix - confusion matrix
                accuracy = (TP+TN)/(TP+TN+FP+FN)
                precision = TP/(TP+FP)
                recall = TP/(TP+FN)
                f1_score = 2/(1/recall+1/precision)
                specifisity = TN/(TN+FP)
                npv = TN/(TN+FN)
    
    """
    
    cm = confusion_matrix(y_true, y_pred)
    
    # total numb of true and false labels
    labels_real = cm.sum(axis = 1)
    labels_pred = cm.sum(axis = 0)
    n_correct_labels = cm.diagonal().sum()
    accuracy = n_correct_labels/len(y_true)
    recall = cm[1,1]/labels_real[1]
    precision = cm[1,1]/labels_pred[1]    
    nvp = cm[0,0]/labels_pred[0]
    specificity = cm[0,0]/labels_real[0]
        
    return {"confusion_matrix": cm,
            "accuracy": accuracy,
            "recall": recall,
            "precision": precision,
            "f1_score": 2/(1/recall+1/precision),
            "specificity": specificity,
            "npv":nvp
           } 

def model_eval(x_train, y_train, 
              x_test, y_test, 
              model):
    """
    Function to evaluate model
    """
    y_train_predict = model.predict(x_train)
    y_test_predict = model.predict(x_test)
    
    # metrics
    metrics_train = eval_metrics(y_train, y_train_predict)
    metrics_test = eval_metrics(y_test, y_test_predict)
    
    return metrics_train, metrics_test

def save_object(obj, filename):
    """
    Function to save/pickle python object

        Args:
            filename: str path to pickle file
    """

    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)


def read_object(filename: str):
    """
    Function to read/un-pickle python object

        Args:
            filename: str path to pickle file
    """

    with open(filename, 'rb') as input_stream:
        obj = pickle.load(input_stream)
    return obj

def point_test(test_point: dict, model) -> bool:
    
    """
    Function to run a prediciton on a data point
    """
    try:
        is_warm = model.predict(pd.DataFrame(test_point)).squeeze()
        return bool(is_warm)
    
    except Exception as e:
        raise e
        return

In [3]:
# read the data to train the model
dat = pd.read_csv('data/warm_cold_colors.csv')

dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 4 columns):
r          136 non-null int64
g          136 non-null int64
b          136 non-null int64
is_warm    136 non-null int64
dtypes: int64(4)
memory usage: 4.3 KB


In [4]:
dat.head()

Unnamed: 0,r,g,b,is_warm
0,254,37,0,1
1,221,39,123,1
2,254,89,0,1
3,254,153,0,1
4,255,221,0,1


In [5]:
# reshuffle data
seed = 2019

dat = dat.sample(len(dat), random_state=seed).reset_index(drop=True)

# Modelling

In [6]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(dat.drop(columns='is_warm'), dat['is_warm'], 
                                                    test_size=0.2, random_state=seed)

In [7]:
# test point -> should be cool, 0 class

test_point = {'r': [8], 'g': [103], 'b': [203]}

## knn

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [9]:
params = {    
    "weights": ["distance", "uniform"], 
    "algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    "n_neighbors": [i for i in range(3,11,1)]
}

In [10]:
grid = GridSearchCV(KNeighborsClassifier(), params, scoring='f1', iid=False, cv=10)

In [11]:
%%time
grid.fit(x_train, y_train)

CPU times: user 4.15 s, sys: 42 ms, total: 4.19 s
Wall time: 4.56 s


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'weights': ['distance', 'uniform'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [12]:
cv_score = grid.cv_results_

In [13]:
cv_score['mean_train_score']



array([0.99153367, 0.90600484, 0.99153367, 0.88277085, 0.99153367,
       0.87868535, 0.99153367, 0.86454471, 0.99153367, 0.87905628,
       0.99153367, 0.85599998, 0.99153367, 0.85502744, 0.99153367,
       0.85426755, 0.99153367, 0.90600484, 0.99153367, 0.88277085,
       0.99153367, 0.87868535, 0.99153367, 0.86454471, 0.99153367,
       0.87905628, 0.99153367, 0.85599998, 0.99153367, 0.85502744,
       0.99153367, 0.85426755, 0.99153367, 0.90600484, 0.99153367,
       0.88277085, 0.99153367, 0.87868535, 0.99153367, 0.86454471,
       0.99153367, 0.87905628, 0.99153367, 0.85599998, 0.99153367,
       0.85502744, 0.99153367, 0.85426755, 0.99153367, 0.90640486,
       0.99153367, 0.88277085, 0.99153367, 0.87868535, 0.99153367,
       0.86454471, 0.99153367, 0.87905628, 0.99153367, 0.85599998,
       0.99153367, 0.85502744, 0.99153367, 0.85426755])

In [14]:
knn_model = grid.best_estimator_

In [15]:
metrics_train, metrics_test = model_eval(x_train, y_train, x_test, y_test, knn_model)

In [16]:
metrics_train['f1_score']

0.8823529411764706

In [17]:
metrics_test['f1_score']

0.7692307692307692

In [18]:
save_object(knn_model, 'model/model_v0.knn')

### Point prediction

In [19]:
point_test(test_point, knn_model)

False

## xgboost

In [20]:
from xgboost import XGBClassifier as xgb_class 

In [21]:
# init the model and train it

params = {
    "objective": 'binary:logistic',
    "learning_rate": 0.5, 
    "n_estimators": 100, 
    "max_depth": 3,
    "n_jobs": 4,
    "silent": False, 
    "subsample": 0.8,
    "random_state": seed
}

model_xgb = xgb_class(**params)

model_xgb.fit(x_train, y_train, verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='binary:logistic',
       random_state=2019, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=False, subsample=0.8)

### Evaluation

In [22]:
metrics_train, metrics_test = model_eval(x_train, y_train, x_test, y_test, model_xgb)

In [23]:
metrics_train['f1_score']

0.9904761904761905

In [24]:
metrics_test['f1_score']

0.9333333333333333

### Point test

In [25]:
point_test(test_point, model_xgb)

False

### Saving the model

In [26]:
save_object(model_xgb, 'model/model_v1.xgb')