In [6]:
!lscpu

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                1
On-line CPU(s) list:   0
Thread(s) per core:    1
Core(s) per socket:    1
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 79
Model name:            Intel(R) Xeon(R) CPU @ 2.20GHz
Stepping:              0
CPU MHz:               2200.000
BogoMIPS:              4400.00
Hypervisor vendor:     KVM
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              56320K
NUMA node0 CPU(s):     0
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdr

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import exp as exp
from MODS.IDnumbers import IDnumbers
%matplotlib inline

# START machine learning
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import tree

In [2]:
# Load the raw HR data set.
HR = pd.read_csv("HRIN.csv")

# Drop useless variables
# TODO: find a way to automate this without looking at plots
HR = HR.drop(columns=["EmployeeCount", "Over18", "StandardHours"])

# Dealing with CATEGORICAL variables
# # # # # # # # # # # #
# Method 1:
# One-Hot encoding
# # # # # # # # # # # #

recast = pd.get_dummies(HR, sparse=True, drop_first=True)
# Correlation matrix with every all-NaN row and all-NaN column removed.
# DataFrame.dropna accepts a single axis only (a list raises TypeError on
# pandas >= 1.0), so rows and columns are dropped in two chained calls.
recast_corr = recast.corr().dropna(axis=0, how="all").dropna(axis=1, how="all")

# # # # # # # # # # # #
# Method 2:
# Label Encoding
# # # # # # # # # # # #
#
# Convert all Object dtypes to Categorical

def TO_CAT(col):
    """Label-encode a column when it holds Python objects.

    Parameters
    ----------
    col : column whose ``dtype`` is inspected (e.g. a pandas Series).

    Returns
    -------
    The integer category codes of ``col`` when its dtype compares equal to
    "object"; otherwise ``col`` itself, untouched.
    """
    # Guard clause: anything that is not object-typed is handed back as-is.
    if not (col.dtype == "object"):
        return col

    as_category = col.astype("category")
    return as_category.cat.codes
    
# Apply TO_CAT column by column to build the label-encoded frame.
encoding = HR.apply(TO_CAT, axis=0)
# Correlation matrix with every all-NaN row and all-NaN column removed.
# DataFrame.dropna accepts a single axis only (a list raises TypeError on
# pandas >= 1.0), so rows and columns are dropped in two chained calls.
encoding_corr = encoding.corr().dropna(axis=0, how="all").dropna(axis=1, how="all")

In [4]:
# Functions::
#
def SPLIT_DATA(DATA=encoding, RAND=True, Testsize=0.3):
    """Split DATA into train/test features and the "Attrition" target.

    Parameters
    ----------
    DATA : DataFrame containing an "Attrition" column. The default is the
        ``encoding`` frame as it existed when this function was defined, so
        re-run this definition after rebuilding ``encoding``.
    RAND : when truthy, the feature columns chosen by GET_RANDOM are removed
        from both the training and the test features. Pass False to keep
        every feature column.
    Testsize : forwarded to train_test_split as ``test_size``.

    Returns
    -------
    x_train, x_test, y_train, y_test
    """
    features = DATA.drop("Attrition", axis=1)
    target = DATA["Attrition"]
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=Testsize)

    if RAND:
        list_of_random_items = GET_RANDOM(DATA=x_train)
        # Build new frames rather than mutating the split outputs in place
        # from inside a throw-away list comprehension.
        x_train = x_train.drop(columns=list_of_random_items)
        x_test = x_test.drop(columns=list_of_random_items)

    return x_train, x_test, y_train, y_test

# RANDOM variable selection
def GET_RANDOM(DATA):
    """Pick a random selection of DATA's column labels.

    Parameters
    ----------
    DATA : object exposing ``.columns`` (e.g. a pandas DataFrame).

    Returns
    -------
    list
        Between 1 and (number of distinct columns - 1) distinct labels, so at
        least one column is always left out of the selection. An empty list
        is returned when DATA has fewer than two distinct columns.
    """
    # random.sample requires a sequence: passing a set is deprecated since
    # Python 3.9 and raises TypeError from 3.11. unique() keeps the
    # de-duplication the former set() conversion provided.
    group_of_items = list(DATA.columns.unique())

    if len(group_of_items) < 2:
        # np.random.randint(1, 1) would raise ValueError (empty range).
        return []

    # randint's upper bound is exclusive, hence at most len - 1 labels.
    num_to_select = int(np.random.randint(1, len(group_of_items)))
    list_of_random_items = random.sample(group_of_items, num_to_select)

    return list_of_random_items

# TODO:
# IMPROVE: only the MODEL initialisation changes between the training functions, so reduce the duplicated code
#

def TRAIN_MODEL_LOG(RAND=True, Testsize=0.3):
    """Fit a LogisticRegression on one fresh train/test split.

    Parameters
    ----------
    RAND : forwarded to SPLIT_DATA; pass False to include ALL variables.
    Testsize : forwarded to SPLIT_DATA (default 0.3).

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test, accuracy, coefficients,
        intercept, test-set predictions, fitted model)
    """
    # Obtain the split from SPLIT_DATA
    x_train, x_test, y_train, y_test = SPLIT_DATA(RAND=RAND, Testsize=Testsize)

    # Logistic regression from sklearn, fitted on the training portion
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)

    # Predictions for the held-out portion and their accuracy
    predicted = classifier.predict(x_test)
    score = accuracy_score(y_test, predicted)

    # The confusion matrix and error rates are still computed, although the
    # returned tuple below does not include them.
    _conf_matrix, _type1, _type2 = CONF_MATRIX(y_test, predicted)

    return (
        x_train,
        x_test,
        y_train,
        y_test,
        score,
        classifier.coef_,
        classifier.intercept_,
        predicted,
        classifier,
    )

## END TO:DO

def CONF_MATRIX(y_test, PREDICTIONS):
    """Build the confusion matrix and its Type I / Type II error rates.

    Parameters
    ----------
    y_test : true labels, passed first to confusion_matrix.
    PREDICTIONS : predicted labels, passed second to confusion_matrix.

    Returns
    -------
    (confusion matrix, INCORRECT_T1, INCORRECT_T2), where the two rates come
    from TITII applied to the matrix.
    """
    matrix = confusion_matrix(y_test, PREDICTIONS)
    type1_rate, type2_rate = TITII(DATA=matrix)
    return matrix, type1_rate, type2_rate

def TITII(DATA):
    """Compute the two error rates of a square confusion matrix.

    Parameters
    ----------
    DATA : 2-D array indexed as DATA[row, column].
        Presumably rows are actual classes and columns predicted classes
        (the sklearn confusion_matrix layout) - confirm against the caller.

    Returns
    -------
    (INCORRECT_T1, INCORRECT_T2)
        INCORRECT_T1 is 1 minus the share of the first row on the diagonal;
        INCORRECT_T2 is 1 minus the share of the last row on the diagonal.
    """
    # First row: fraction that sits in the top-left cell
    first_row_hit_rate = DATA[0, 0] / DATA[0, :].sum()
    type1_rate = 1 - first_row_hit_rate

    # Last row: fraction that sits in the bottom-right cell
    last_row_hit_rate = DATA[-1, -1] / DATA[-1, :].sum()
    type2_rate = 1 - last_row_hit_rate

    return type1_rate, type2_rate

# Change colour palette
def MAKE_HM(MODEL_NAME, ConfMatrix = None, y_test = None, PREDICTIONS = None, Score="Not Given"):
    """Draw a confusion-matrix heat map for one model.

    Parameters
    ----------
    MODEL_NAME : name inserted into the figure title.
    ConfMatrix : optional matrix; when given it is reshaped to 2x2 and used
        directly, otherwise it is built from y_test and PREDICTIONS.
    y_test, PREDICTIONS : true and predicted labels, used only when
        ConfMatrix is None.
    Score : value shown in the title (defaults to "Not Given").
    """
    plt.figure(figsize=(5,5))

    if ConfMatrix is not None:
        # A matrix was supplied: force it into 2x2 form and derive the rates
        ConfMatrix = ConfMatrix.reshape(2,2)
        _, type2_rate = TITII(ConfMatrix)
    else:
        # No matrix supplied: compute everything from labels and predictions
        ConfMatrix, _, type2_rate = CONF_MATRIX(y_test,PREDICTIONS)

    sns.heatmap(
        ConfMatrix,
        annot=True,
        fmt=".3f",
        linewidths=1,
        square=True,
        cmap="icefire",
        cbar=False,
    )

    # Axis labels and title
    plt.ylabel("Actual label")
    plt.xlabel("Predicted label")
    title_text = "The Score for the {} is : {}, Type II : {}".format(MODEL_NAME,Score,type2_rate)
    plt.suptitle(title_text)
    plt.show()

In [5]:
%%time
# RUN Models n-times
# TO:DO find run-time for each function call and optimize
# WriteIO operations probably take long

# Repeat the logistic-regression experiment 200 times, collecting the
# tuple returned by every TRAIN_MODEL_LOG call.
ModelResults = []
for i in range(200):
    ModelResults.append(TRAIN_MODEL_LOG())
    # Progress marker after every 25th run (0, 25, ..., 175)
    if i % 25 == 0:
        print(i)
    
    # Store on local drive in case of crashes
    #WriteIO.GetFile(Mode="a", WriteString = TRAIN_MODEL_LOG(), FileName="IDnumbers_LRM_T1")
    #WriteIO.GetFile(Mode="a", WriteString = TRAIN_MODEL_XGBoost_1(), FileName="IDnumbers_XGB_T1")
    
#ModelResults[4] = ModelResults[3].str.len()

0
25
50
75
100
125
150
175
CPU times: user 30.5 s, sys: 74.1 ms, total: 30.6 s
Wall time: 31.1 s
