In [230]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [231]:
df = pd.read_csv("bike_buyers.csv")

In [232]:
df.head()

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000.0,1.0,Bachelors,Skilled Manual,Yes,0.0,0-1 Miles,Europe,42.0,No
1,24107,Married,Male,30000.0,3.0,Partial College,Clerical,Yes,1.0,0-1 Miles,Europe,43.0,No
2,14177,Married,Male,80000.0,5.0,Partial College,Professional,No,2.0,2-5 Miles,Europe,60.0,No
3,24381,Single,,70000.0,0.0,Bachelors,Professional,Yes,1.0,5-10 Miles,Pacific,41.0,Yes
4,25597,Single,Male,30000.0,0.0,Bachelors,Clerical,No,0.0,0-1 Miles,Europe,36.0,Yes


In [233]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1000 non-null   int64  
 1   Marital Status    993 non-null    object 
 2   Gender            989 non-null    object 
 3   Income            994 non-null    float64
 4   Children          992 non-null    float64
 5   Education         1000 non-null   object 
 6   Occupation        1000 non-null   object 
 7   Home Owner        996 non-null    object 
 8   Cars              991 non-null    float64
 9   Commute Distance  1000 non-null   object 
 10  Region            1000 non-null   object 
 11  Age               992 non-null    float64
 12  Purchased Bike    1000 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 101.7+ KB


In [234]:
df.shape

(1000, 13)

# DATA SPLIT

### Bagi dataset menjadi training set dan testing set dengan proporsi 70:30

In [235]:
x = df.iloc[:,:-1] #target
y = df.iloc[:,-1] #feature

In [236]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [237]:
print("dimensi x_train : ",X_train.shape)
print("dimensi x_test : ",X_test.shape)
print("dimensi y_train : ",y_train.shape)
print("dimensi y_test : ",y_test.shape)

dimensi x_train :  (700, 12)
dimensi x_test :  (300, 12)
dimensi y_train :  (700,)
dimensi y_test :  (300,)


## NORMALISASI DAN STANDARISASI

* membuat nilai sebelum scalling

In [238]:
# membuat dummy data
data_dummy = df[['ID', 'Income', 'Children',	'Cars','Age']]

print("Nilai data sebelum scalling : \n", data_dummy.head())
print("\n")
print("Nilai Standar Deviasi : \n", np.std(data_dummy))

Nilai data sebelum scalling : 
       ID   Income  Children  Cars   Age
0  12496  40000.0       1.0   0.0  42.0
1  24107  30000.0       3.0   1.0  43.0
2  14177  80000.0       5.0   2.0  60.0
3  24381  70000.0       0.0   1.0  41.0
4  25597  30000.0       0.0   0.0  36.0


Nilai Standar Deviasi : 
 ID           5344.659613
Income      31052.185855
Children        1.626090
Cars            1.121189
Age            11.356278
dtype: float64


In [239]:
data_normalization = pd.DataFrame(x_scaled)

In [240]:
data_normalization = pd.DataFrame(x)

* membuat nilai sesudah scalling

In [241]:
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(data_dummy)

In [242]:
data_normalization = pd.DataFrame(x_scaled)

In [243]:
print("Nilai data sesudah scalling : \n", data_normalization.head())
print("\n")
print("Nilai standar deviasi : ", np.std(x_scaled))

Nilai data sesudah scalling : 
           0       1    2     3         4
0  0.081097  0.1875  0.2  0.00  0.265625
1  0.710522  0.1250  0.6  0.25  0.281250
2  0.172223  0.4375  1.0  0.50  0.546875
3  0.725375  0.3750  0.0  0.25  0.250000
4  0.791294  0.1250  0.0  0.00  0.171875


Nilai standar deviasi :  nan


# DATA CLEANING

* Menangani data Null

Menggunakan data yang sudah di edit

In [244]:
dataframe = pd.read_csv("bike_buyers_edit.csv")
dataframe = df[['ID', 'Marital Status', 'Gender', 'Income',	'Children',	'Education','Occupation',	'Home Owner',	'Cars',	'Commute Distance',	'Region',	'Age',	'Purchased Bike']]

In [245]:
#mengecek data null
dataframe.isna().sum()

ID                   0
Marital Status       7
Gender              11
Income               6
Children             8
Education            0
Occupation           0
Home Owner           4
Cars                 9
Commute Distance     0
Region               0
Age                  8
Purchased Bike       0
dtype: int64

* Imputer menggunakan pandas

In [246]:
df.isna().sum()

ID                   0
Marital Status       7
Gender              11
Income               6
Children             8
Education            0
Occupation           0
Home Owner           4
Cars                 9
Commute Distance     0
Region               0
Age                  8
Purchased Bike       0
dtype: int64

In [247]:
dataframe['Age'] = df['Age'].fillna(df['Age'].mean())

In [248]:
dataframe = dataframe.fillna(df.mean)
dataframe.isna().sum()

ID                  0
Marital Status      0
Gender              0
Income              0
Children            0
Education           0
Occupation          0
Home Owner          0
Cars                0
Commute Distance    0
Region              0
Age                 0
Purchased Bike      0
dtype: int64

In [249]:
# untuk melihat data duplikat
dataframe.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

Karena tidak ada nilai duplikat maka untuk membuat nilai duplikat dengan cara seperti berikut:

In [250]:
dataframe = dataframe.append(df.iloc[0:900])
dataframe.duplicated()

  dataframe = dataframe.append(df.iloc[0:900])


0      False
1      False
2      False
3      False
4      False
       ...  
895     True
896     True
897     True
898     True
899     True
Length: 1900, dtype: bool

* Melakukan data cleaning pada data duplikat

In [251]:
dataframe.drop_duplicates(inplace=True)
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

## Mengganti tipe data

In [252]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1000 non-null   int64  
 1   Marital Status    993 non-null    object 
 2   Gender            989 non-null    object 
 3   Income            994 non-null    float64
 4   Children          992 non-null    float64
 5   Education         1000 non-null   object 
 6   Occupation        1000 non-null   object 
 7   Home Owner        996 non-null    object 
 8   Cars              991 non-null    float64
 9   Commute Distance  1000 non-null   object 
 10  Region            1000 non-null   object 
 11  Age               992 non-null    float64
 12  Purchased Bike    1000 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 101.7+ KB


mengubah tipe data Cars yang awalnya float64 menjadi int64

In [253]:
dfedit= df[['ID']].astype('float64')

In [254]:
dfedit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      1000 non-null   float64
dtypes: float64(1)
memory usage: 7.9 KB


## ONE HOT ENCODING

In [258]:
OneHotEncoder = OneHotEncoder(sparse=False)

In [261]:
# Encoding pada atribut Gender
encoder = OneHotEncoder.fit_transform(df[['Gender']])
df_encoder = pd.DataFrame(encoder)

In [263]:
df_encoder.head(10)

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
5,1.0,0.0,0.0
6,0.0,1.0,0.0
7,0.0,1.0,0.0
8,0.0,1.0,0.0
9,0.0,1.0,0.0
