In [1]:
# import packages

import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.svm
import sklearn.tree
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.linear_model

In [3]:
# Load every raw dataset used in this notebook.

# ADULT dataset: column names are supplied explicitly via `names`.
ADULT_df = pd.read_csv(
    './dataset/adult.data',
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hoursperweek', 'country', 'label',
    ],
)
# Data scientists dataset.
AUG_df = pd.read_csv('./dataset/aug_train.csv')
# Bank dataset: fields are separated by ';' instead of ','.
BANK_df = pd.read_csv('./dataset/bank-full.csv', sep=';')
# Insurance dataset.
INS_df = pd.read_csv('./dataset/insurance_train.csv')
# Australian weather dataset.
AUS_df = pd.read_csv('./dataset/weatherAUS.csv')

# Preprocess ADULT

In [4]:
# Display the ADULT frame as loaded (rich output of the cell's last expression).
ADULT_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hoursperweek,country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Encode categorical values and label

In [5]:
# Ordinal-encode the categorical ADULT columns on a copy (ADULT_df2), so
# ADULT_df itself is not modified by this cell. 'education' is not encoded here.
adult_categorical_cols = ['workclass', 'marital-status', 'occupation',
                          'relationship', 'race', 'sex', 'country']
ADULT_df2 = ADULT_df.copy()
labelenc = sklearn.preprocessing.OrdinalEncoder()
ADULT_df2[adult_categorical_cols] = labelenc.fit_transform(ADULT_df2[adult_categorical_cols])

In [6]:
# Keep the encoded categorical columns of ADULT_df2 in a frame of their own.
ADULT_categorical = ADULT_df2.loc[:, [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'country',
]]

In [7]:
# Remove the encoded categorical columns and 'education' from ADULT_df2;
# all other columns are left as they are.
ADULT_df2 = ADULT_df2.drop(
    columns=['workclass', 'education', 'marital-status', 'occupation',
             'relationship', 'race', 'sex', 'country'],
)

In [8]:
# Binarise the income label: 1 when the label starts with '>', 0 otherwise.
# Surrounding whitespace is stripped first so the result does not depend on
# whether the parsed values keep a leading character before the comparison
# sign (presumably the space after each comma in adult.data — TODO confirm).
# A fixed character position would silently map every row to 0 if that
# leading character were ever absent.
ADULT_df['label'] = (
    ADULT_df2['label']
    .str.strip()
    .str.startswith('>')
    .astype('int64')
)

In [9]:
# Remove the raw (un-encoded) categorical columns from ADULT_df.
# 'education' is not in this list and therefore stays in the frame.
ADULT_df = ADULT_df.drop(
    columns=['workclass', 'marital-status', 'occupation', 'relationship',
             'race', 'sex', 'country'],
)

In [10]:
# Attach the ordinal-encoded categorical columns to ADULT_df, aligning rows
# on the index (left join, which is DataFrame.join's default).
ADULT_df = ADULT_df.join(ADULT_categorical, how='left')

In [54]:
# Write the processed ADULT data to disk without the raw 'education' column.
# drop() returns a new frame, so ADULT_df itself keeps that column.
ADULT_df.drop(columns=['education']).to_csv('./dataset/ADULT_processed.csv')

## ADULT - Train-test split

Train size 5000, test size 27561

In [12]:
# Use 5000 rows for training; all remaining rows form the test set.
# random_state pins the shuffle so the same split is produced on every run
# (without it, each execution of this cell yields a different split).
# NOTE(review): ADULT_df still contains the raw 'education' column at this
# point (it is only left out of the CSV export) — confirm downstream code
# handles or drops it.
adult_train, adult_test = sklearn.model_selection.train_test_split(
    ADULT_df,
    train_size=5000,
    random_state=42,
)

# Preprocess AUG

In [33]:
# Display the AUG frame as loaded (rich output of the cell's last expression).
AUG_df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [34]:
# Names of the AUG columns that are treated as categorical features.
categorical_cols = [
    'city',
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]

In [35]:
# Pull the categorical columns out of AUG_df into a frame of their own.
AUG_categorical = AUG_df.loc[:, categorical_cols]

In [36]:
# Remove the categorical columns from AUG_df. The name is rebound to the new
# frame returned by drop() instead of mutating the loaded frame in place, so
# the original object is never altered behind any other reference to it.
AUG_df = AUG_df.drop(columns=categorical_cols)

In [37]:
# Display AUG_df after the categorical columns were dropped.
AUG_df

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
0,8949,0.920,36,1.0
1,29725,0.776,47,0.0
2,11561,0.624,83,0.0
3,33241,0.789,52,1.0
4,666,0.767,8,0.0
...,...,...,...,...
19153,7386,0.878,42,1.0
19154,31398,0.920,52,1.0
19155,24576,0.920,44,0.0
19156,5756,0.802,97,0.0


In [38]:
# Replace missing categorical entries with the placeholder value 'Unknown'.
AUG_categorical = AUG_categorical.fillna(value='Unknown')

In [39]:
# Display the categorical frame after missing values were filled with 'Unknown'.
AUG_categorical

Unnamed: 0,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Unknown,Unknown,1
1,city_40,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4
2,city_21,Unknown,No relevent experience,Full time course,Graduate,STEM,5,Unknown,Unknown,never
3,city_115,Unknown,No relevent experience,Unknown,Graduate,Business Degree,<1,Unknown,Pvt Ltd,never
4,city_162,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4
...,...,...,...,...,...,...,...,...,...,...
19153,city_173,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,Unknown,Unknown,1
19154,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,Unknown,Unknown,4
19155,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4
19156,city_65,Male,Has relevent experience,no_enrollment,High School,Unknown,<1,500-999,Pvt Ltd,2


In [43]:
# Fit a fresh ordinal encoder on the categorical frame, then overwrite every
# categorical column with its numeric codes.
labelenc = sklearn.preprocessing.OrdinalEncoder().fit(AUG_categorical)
AUG_categorical[categorical_cols] = labelenc.transform(AUG_categorical)

In [45]:
# Attach the encoded categorical columns to AUG_df, aligning rows on the
# index (left join, which is DataFrame.join's default).
AUG_df = AUG_df.join(AUG_categorical, how='left')

In [46]:
# Display AUG_df after the encoded categorical columns were joined on.
AUG_df

Unnamed: 0,enrollee_id,city_development_index,training_hours,target,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,8949,0.920,36,1.0,5.0,1.0,0.0,3.0,0.0,5.0,21.0,8.0,6.0,0.0
1,29725,0.776,47,0.0,77.0,1.0,1.0,3.0,0.0,5.0,6.0,4.0,5.0,4.0
2,11561,0.624,83,0.0,64.0,3.0,1.0,0.0,0.0,5.0,15.0,8.0,6.0,6.0
3,33241,0.789,52,1.0,14.0,3.0,1.0,2.0,0.0,1.0,20.0,8.0,5.0,6.0
4,666,0.767,8,0.0,50.0,1.0,0.0,3.0,2.0,5.0,21.0,4.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,0.878,42,1.0,55.0,1.0,1.0,3.0,0.0,2.0,5.0,8.0,6.0,0.0
19154,31398,0.920,52,1.0,5.0,1.0,0.0,3.0,0.0,5.0,5.0,8.0,6.0,3.0
19155,24576,0.920,44,0.0,5.0,1.0,0.0,3.0,0.0,5.0,21.0,4.0,5.0,3.0
19156,5756,0.802,97,0.0,94.0,1.0,0.0,3.0,1.0,6.0,20.0,5.0,5.0,1.0


In [49]:
# Persist the preprocessed AUG frame to disk as CSV.
AUG_df.to_csv(path_or_buf='./dataset/AUG_processed.csv')

# Preprocess BANK

In [56]:
# Display the BANK frame as loaded (rich output of the cell's last expression).
BANK_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
