In [19]:
!pip install liac-arff



In [20]:
# import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import requests
import arff


# get data from online
df_arff = requests.get("https://utexas.box.com/shared/static/wpc25cs5ep3j1ewpfs2scoiekra22p8s.arff")

# read as arff file
df_arff = arff.load(df_arff.text)
col_val = [attribute[0] for attribute in df_arff['attributes']]

# transform arff file into pandas dataframe
new_df = pd.DataFrame(df_arff['data'], columns = col_val)
# get information about the different attributes (name and type)
meta = df_arff['attributes']
meta

[('duration', 'NUMERIC'),
 ('wage-increase-first-year', 'NUMERIC'),
 ('wage-increase-second-year', 'NUMERIC'),
 ('wage-increase-third-year', 'NUMERIC'),
 ('cost-of-living-adjustment', ['none', 'tcf', 'tc']),
 ('working-hours', 'NUMERIC'),
 ('pension', ['none', 'ret_allw', 'empl_contr']),
 ('standby-pay', 'NUMERIC'),
 ('shift-differential', 'NUMERIC'),
 ('education-allowance', ['yes', 'no']),
 ('statutory-holidays', 'NUMERIC'),
 ('vacation', ['below_average', 'average', 'generous']),
 ('longterm-disability-assistance', ['yes', 'no']),
 ('contribution-to-dental-plan', ['none', 'half', 'full']),
 ('bereavement-assistance', ['yes', 'no']),
 ('contribution-to-health-plan', ['none', 'half', 'full']),
 ('class', ['bad', 'good'])]

In [21]:
new_df.head()

Unnamed: 0,duration,wage-increase-first-year,wage-increase-second-year,wage-increase-third-year,cost-of-living-adjustment,working-hours,pension,standby-pay,shift-differential,education-allowance,statutory-holidays,vacation,longterm-disability-assistance,contribution-to-dental-plan,bereavement-assistance,contribution-to-health-plan,class
0,1.0,5.0,,,,40.0,,,2.0,,11.0,average,,,yes,,good
1,2.0,4.5,5.8,,,35.0,ret_allw,,,yes,11.0,below_average,,full,,full,good
2,,,,,,38.0,empl_contr,,5.0,,11.0,generous,yes,half,yes,half,good
3,3.0,3.7,4.0,5.0,tc,,,,,yes,,,,,yes,,good
4,3.0,4.5,4.5,5.0,,40.0,,,,,12.0,average,,half,yes,half,good


SimpleImputer - Column by column

In [22]:
from sklearn.impute import SimpleImputer


df=new_df
# Create a SimpleImputer object
imputer = SimpleImputer(strategy='mean')
imputer2= SimpleImputer(strategy='most_frequent')

# Fit the imputer to the dataset
for column in df:
  try:
    # If the column is not of type object (categorical), then impute using the mean
    if df[column].dtype != "object":
      print(df[column].dtype, column)
      df[column] = imputer.fit_transform(df[[column]])
    # Else impute using the most frequent value
    else:
      df[column] = imputer2.fit_transform(df[[column]])
  except:
      pass      

df

float64 duration
float64 wage-increase-first-year
float64 wage-increase-second-year
float64 wage-increase-third-year
float64 working-hours
float64 standby-pay
float64 shift-differential
float64 statutory-holidays


Unnamed: 0,duration,wage-increase-first-year,wage-increase-second-year,wage-increase-third-year,cost-of-living-adjustment,working-hours,pension,standby-pay,shift-differential,education-allowance,statutory-holidays,vacation,longterm-disability-assistance,contribution-to-dental-plan,bereavement-assistance,contribution-to-health-plan,class
0,1.0,5.0,3.971739,3.913333,,40.0,,7.444444,2.0,,11.0,average,,,yes,,good
1,2.0,4.5,5.8,3.913333,,35.0,ret_allw,7.444444,4.870968,yes,11.0,below_average,,full,,full,good
2,2.160714,3.803571,3.971739,3.913333,,38.0,empl_contr,7.444444,5.0,,11.0,generous,yes,half,yes,half,good
3,3.0,3.7,4.0,5.0,tc,38.039216,,7.444444,4.870968,yes,11.09434,,,,yes,,good
4,3.0,4.5,4.5,5.0,,40.0,,7.444444,4.870968,,12.0,average,,half,yes,half,good
5,2.0,2.0,2.5,3.913333,,35.0,,7.444444,6.0,yes,12.0,average,,,,,good
6,3.0,4.0,5.0,5.0,tc,38.039216,empl_contr,7.444444,4.870968,,12.0,generous,yes,none,yes,half,good
7,3.0,6.9,4.8,2.3,,40.0,,7.444444,3.0,,12.0,below_average,,,,,good
8,2.0,3.0,7.0,3.913333,,38.0,,12.0,25.0,yes,11.0,below_average,yes,half,yes,,good
9,1.0,5.7,3.971739,3.913333,none,40.0,empl_contr,7.444444,4.0,,11.0,generous,yes,full,,,good


OneHotEncoder

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB,CategoricalNB
from sklearn.neighbors import KNeighborsClassifier


X = new_df.drop(columns='class')
y = new_df['class']
print('Original size', X.shape)

from sklearn import preprocessing

le = preprocessing.OneHotEncoder()
X = le.fit_transform(X)
X=X.toarray()
# Lead to increased dimensionality, as a separate column is created for each category in the variable. 
# New columns are created for empty values
# Can also lead to overfitting, especially if there are many categories in the variable and the sample size is relatively small
print('Transformed size', X.shape)

model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5).fit(X, y)
# model = MultinomialNB(force_alpha=True).fit(X, y)
# model = CategoricalNB(force_alpha=True).fit(X, y)
# model = KNeighborsClassifier().fit(X, y)
# model = RandomForestClassifier().fit(X, y)

# Calculate the AUC score using cross-validation
auc_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

# Print the AUC scores for each fold of cross-validation
print("AUC score Classification Tree:", auc_scores.mean())

Original size (57, 16)
Transformed size (57, 112)
AUC score Classification Tree: 0.8267857142857142


In [24]:
X

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])