#### Import libraris and load dataset

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('great_customers.csv')
df.head()

Unnamed: 0,user_id,age,workclass,salary,education_rank,marital-status,occupation,race,sex,mins_beerdrinking_year,mins_exercising_year,works_hours,tea_per_year,coffee_per_year,great_customer_class
0,1004889,14.0,private,70773.0,9,Never-married,sales,not_caucasian,Male,0.0,0.0,40,399.0,,0
1,1012811,25.0,private,76597.0,9,Divorced,sales,caucasian,Female,0.0,0.0,30,256.0,,0
2,1006870,21.0,private,47947.25,10,Never-married,clerical,caucasian,Female,0.0,0.0,10,442.0,276.0,0
3,1022149,23.0,private,41740.25,7,Divorced,sales,caucasian,Female,0.0,0.0,20,,,0
4,1029558,26.0,private,,9,Married,sales,not_caucasian,Male,,0.0,36,,120.0,0


In [6]:
df['workclass'].value_counts()

workclass
self_employed    5972
private          5957
government       1127
Name: count, dtype: int64

#### Processing categorical features

In [7]:
encoder = LabelEncoder()

df['workclass']= encoder.fit_transform(df['workclass'])
df['marital-status']= encoder.fit_transform(df['marital-status'])
df['occupation']= encoder.fit_transform(df['occupation'])
df['race']= encoder.fit_transform(df['race'])
df['sex']= encoder.fit_transform(df['race'])

le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

In [8]:
df.isna().sum()

user_id                      0
age                        421
workclass                    0
salary                     422
education_rank               0
marital-status               0
occupation                   0
race                         0
sex                          0
mins_beerdrinking_year     424
mins_exercising_year       421
works_hours                  0
tea_per_year              2429
coffee_per_year           2411
great_customer_class         0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13599 entries, 0 to 13598
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 13599 non-null  int64  
 1   age                     13178 non-null  float64
 2   workclass               13599 non-null  int32  
 3   salary                  13177 non-null  float64
 4   education_rank          13599 non-null  int64  
 5   marital-status          13599 non-null  int32  
 6   occupation              13599 non-null  int32  
 7   race                    13599 non-null  int32  
 8   sex                     13599 non-null  int64  
 9   mins_beerdrinking_year  13175 non-null  float64
 10  mins_exercising_year    13178 non-null  float64
 11  works_hours             13599 non-null  int64  
 12  tea_per_year            11170 non-null  float64
 13  coffee_per_year         11188 non-null  float64
 14  great_customer_class    13599 non-null

#### Impute missing values with iterative imputer, as there are missing values in multiple columns

In [10]:
imputer = IterativeImputer(max_iter=10, random_state=0)

In [11]:
data = df.values
X = data[:, :-1]
y = data[:, -1]
imputer.fit(X)

#### Transform the dataset

In [12]:
X_transform = imputer.transform(X)
df_transform = pd.DataFrame(data=X_transform)


In [13]:
df_transform

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1004889.0,14.0,1.0,70773.000000,9.0,2.0,9.0,1.0,1.0,0.000000,0.00000,40.0,399.000000,119.350213
1,1012811.0,25.0,1.0,76597.000000,9.0,0.0,9.0,0.0,0.0,0.000000,0.00000,30.0,256.000000,252.505812
2,1006870.0,21.0,1.0,47947.250000,10.0,2.0,1.0,0.0,0.0,0.000000,0.00000,10.0,442.000000,276.000000
3,1022149.0,23.0,1.0,41740.250000,7.0,0.0,9.0,0.0,0.0,0.000000,0.00000,20.0,248.243165,281.162515
4,1029558.0,26.0,1.0,53577.192543,9.0,1.0,9.0,1.0,1.0,210.062586,0.00000,36.0,259.203687,120.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13594,1016807.0,42.0,1.0,55293.000000,13.0,1.0,4.0,0.0,0.0,0.000000,96.57369,40.0,277.000000,268.000000
13595,1038859.0,58.0,2.0,25928.250000,14.0,1.0,9.0,0.0,0.0,0.000000,0.00000,40.0,337.000000,408.143910
13596,1041214.0,75.0,2.0,16590.000000,7.0,1.0,4.0,0.0,0.0,283.668101,0.00000,35.0,200.348570,389.903305
13597,1038013.0,45.0,1.0,25536.750000,11.0,0.0,4.0,0.0,0.0,0.000000,0.00000,40.0,99.000000,79.000000


In [14]:
df_transform.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64