In [2]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBClassifier as xgb_class 
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [3]:
def eval_metrics(y_true: list, y_pred: list) -> dict:
    """
    Function to calculate the binary classification metrics

        Args:
            y_true: list, ground-truth labels
            y_pred: list, predicted labels (same length as y_true)

        Returns:

            dict with the keys:
                confusion_matrix - confusion matrix
                                   (rows: true labels, columns: predicted labels)
                accuracy = (TP+TN)/(TP+TN+FP+FN)
                recall = TP/(TP+FN)
                precision = TP/(TP+FP)
                f1_score = 2/(1/recall+1/precision)
                specificity = TN/(TN+FP)
                npv = TN/(TN+FN)

        Notes:
            A ratio whose denominator is zero (e.g. precision when the
            positive class is never predicted) is reported as 0.0, and
            f1_score is 0.0 whenever recall or precision is 0.

    """

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 instead of leaking NaN/inf
        # (and numpy RuntimeWarnings) into the result.
        return numerator / denominator if denominator else 0.0

    cm = confusion_matrix(y_true, y_pred)

    # When both inputs contain one single class, the matrix collapses to 1x1
    # and the positional lookups below would raise IndexError. For 0/1 labels
    # the missing row/column is known to be all zeros, so pad it back to 2x2.
    if cm.shape == (1, 1):
        only_label = np.ravel(y_true)[0]
        if only_label in (0, 1):
            padded = np.zeros((2, 2), dtype=cm.dtype)
            padded[int(only_label), int(only_label)] = cm[0, 0]
            cm = padded

    # row sums: number of samples per true label
    # column sums: number of samples per predicted label
    labels_real = cm.sum(axis=1)
    labels_pred = cm.sum(axis=0)
    n_correct_labels = cm.diagonal().sum()

    accuracy = _ratio(n_correct_labels, len(y_true))
    recall = _ratio(cm[1, 1], labels_real[1])
    precision = _ratio(cm[1, 1], labels_pred[1])
    npv = _ratio(cm[0, 0], labels_pred[0])
    specificity = _ratio(cm[0, 0], labels_real[0])

    # The harmonic mean is only defined when both terms are positive.
    if recall > 0 and precision > 0:
        f1 = 2/(1/recall+1/precision)
    else:
        f1 = 0.0

    return {"confusion_matrix": cm,
            "accuracy": accuracy,
            "recall": recall,
            "precision": precision,
            "f1_score": f1,
            "specificity": specificity,
            "npv": npv
           }

def save_object(obj, filename):
    """
    Pickle a python object into a file

        Args:
            obj: python object to serialise
            filename: str path to pickle file (opened in 'wb' mode)
    """

    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)


def read_object(filename: str):
    """
    Load a python object back from a pickle file

        Args:
            filename: str path to pickle file (opened in 'rb' mode)

        Returns:
            the un-pickled python object

        Note:
            un-pickling can execute arbitrary code, so only pass files
            that come from a trusted source
    """

    with open(filename, 'rb') as source:
        return pickle.load(source)

In [4]:
# read the data to train the model
# (path is relative to the notebook's working directory)
dat = pd.read_csv('data/warm_cold_colors.csv')

# summary of column names, dtypes and non-null counts
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 4 columns):
r          136 non-null int64
g          136 non-null int64
b          136 non-null int64
is_warm    136 non-null int64
dtypes: int64(4)
memory usage: 4.3 KB


In [5]:
# preview the first rows to sanity-check the parsed columns
dat.head()

Unnamed: 0,r,g,b,is_warm
0,254,37,0,1
1,221,39,123,1
2,254,89,0,1
3,254,153,0,1
4,255,221,0,1


In [6]:
# Shuffle all rows with a fixed seed and renumber the index.
# Note: `dat` is overwritten, so re-running this cell shuffles the
# already-shuffled frame again.
seed = 2019

n_rows = dat.shape[0]
dat = dat.sample(n=n_rows, random_state=seed).reset_index(drop=True)

## Modelling

In [7]:
# Hold out 20 % of the rows for testing; the rest is used for training.
features = dat.drop(columns='is_warm')
target = dat['is_warm']

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=seed
)

In [8]:
# Configure the boosted-tree classifier and fit it on the training split.

params = dict(
    objective='binary:logistic',
    learning_rate=0.5,
    n_estimators=100,
    max_depth=3,
    n_jobs=4,
    silent=False,
    subsample=0.8,
    random_state=seed,
)

model = xgb_class(**params)

# fit() is the last expression so the fitted estimator is displayed
model.fit(x_train, y_train, verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='binary:logistic',
       random_state=2019, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=False, subsample=0.8)

### Evaluation

In [9]:
# Class predictions for the training split and for the held-out test split
y_train_pred, y_test_pred = model.predict(x_train), model.predict(x_test)

In [10]:
# Score both splits with the same metric helper

metrics_train, metrics_test = (
    eval_metrics(y_train, y_train_pred),
    eval_metrics(y_test, y_test_pred),
)

In [11]:
# metrics on the held-out test split
metrics_test

{'confusion_matrix': array([[12,  1],
        [ 1, 14]]),
 'accuracy': 0.9285714285714286,
 'recall': 0.9333333333333333,
 'precision': 0.9333333333333333,
 'f1_score': 0.9333333333333333,
 'specificity': 0.9230769230769231,
 'npv': 0.9230769230769231}

In [12]:
# metrics on the training split (compare with the test split to spot over-fitting)
metrics_train

{'confusion_matrix': array([[55,  0],
        [ 1, 52]]),
 'accuracy': 0.9907407407407407,
 'recall': 0.9811320754716981,
 'precision': 1.0,
 'f1_score': 0.9904761904761905,
 'specificity': 1.0,
 'npv': 0.9821428571428571}

### Random test

In [13]:
def predictor(rgb: pd.DataFrame, model) -> pd.DataFrame:
    """
    Function to label every row of an RGB frame as "cool" or "warm"

        Args:
            rgb: pd.DataFrame with the feature columns the model expects;
                 a 'color_tone' column is added (or overwritten) in place
            model: fitted classifier exposing
                   predict(frame, validate_features=False)

        Returns:
            the same DataFrame object with the extra 'color_tone' column;
            predictions 0 / 1 are shown as "cool" / "warm", any other
            predicted value is kept unchanged
    """

    _col = 'color_tone'

    # Never feed a label column left over from a previous call back into the
    # model as if it were a feature.
    features = rgb.drop(columns=_col, errors='ignore')
    predictions = np.asarray(model.predict(features, validate_features=False)).ravel()

    # Attach the predictions by position: the Series reuses rgb's own index,
    # so frames with a non-default index are labelled correctly too.
    rgb[_col] = pd.Series(predictions, index=rgb.index).replace([0, 1], ["cool", "warm"])

    return rgb

In [14]:
# test color: one RGB triple, each channel wrapped in a single-element list
# so the dict converts directly into a one-row DataFrame
test_point = {'r': [8], 'g': [103], 'b': [203]}

In [15]:
# Classify the single test colour and report the result

test_frame = pd.DataFrame(test_point)
is_warm = model.predict(test_frame).squeeze()

print(f"The color is warm? {bool(is_warm)}")

The color is warm? False


### Saving the model

In [16]:
# Persist the fitted classifier via save_object, i.e. as a pickle file.
# Despite the .xgb extension this is not XGBoost's native format, so it has
# to be loaded back with read_object. The 'model/' directory must already
# exist because open(..., 'wb') does not create missing directories.
save_object(model, 'model/model_v1.xgb')