In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Missing Values


In [3]:
# Load the dataset used for the missing-value demo (plain string: the path
# has no placeholders, so no f-string is needed).
data = pd.read_csv("data/preprocessing/missing_values.csv")

# Replace every NaN with the mean of the column it sits in.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Columns 1..3 (boy, kilo, yas) are the numeric features; column 0 is the
# country code and the last column is the gender label.
# fit_transform learns the column means and fills the gaps in one call.
# `numeric_data` is reused later when the final feature frame is assembled.
numeric_data = imputer.fit_transform(data.iloc[:, 1:4].values)

# Write the imputed values back so the displayed frame has no missing entries.
data.iloc[:, 1:4] = numeric_data

data

Unnamed: 0,ulke,boy,kilo,yas,cinsiyet
0,tr,130,30,10.0,e
1,tr,125,36,11.0,e
2,tr,135,34,10.0,k
3,tr,133,30,9.0,k
4,tr,129,38,12.0,e
5,tr,180,90,30.0,e
6,tr,190,80,25.0,e
7,tr,175,90,35.0,e
8,tr,177,60,22.0,k
9,us,185,105,33.0,e


In [4]:
# Read the main dataset; `data` is rebound here, replacing the frame from the
# missing-value cell.
data = pd.read_csv(filepath_or_buffer=r"data/preprocessing/data.csv")
data

Unnamed: 0,ulke,boy,kilo,yas,cinsiyet
0,tr,130,30,10,e
1,tr,125,36,11,e
2,tr,135,34,10,k
3,tr,133,30,9,k
4,tr,129,38,12,e
5,tr,180,90,30,e
6,tr,190,80,25,e
7,tr,175,90,35,e
8,tr,177,60,22,k
9,us,185,105,33,e


In [5]:
# Correlation of every numeric column with age ("yas"); the two text columns
# are dropped first because they cannot take part in a numeric correlation.
(
    data
    .drop(["ulke", "cinsiyet"], axis=1)
    .corr()
    .loc[:, "yas"]
)

boy     0.508706
kilo    0.423259
yas     1.000000
Name: yas, dtype: float64

# Data Types

-   **Categorical**: Data that can be divided into categories or groups.

    -   **Nominal**: Categories without any intrinsic order or ranking. Examples: gender, color, country.
    -   **Ordinal**: Categories with a meaningful order or ranking, but without a consistent interval between them. Examples: education level (high school, bachelor's, master's, PhD), customer satisfaction (low, medium, high).

-   **Numerical**: Data that represents quantities and can be measured.
    -   **Ratio**: Data with a true zero point and meaningful intervals between values. Examples: weight, height, age.
    -   **Interval**: Data with meaningful intervals between values but no true zero point. Examples: temperature (Celsius), IQ scores.


# Categorical


In [6]:
# Re-read the dataset so this section starts from the file contents.
data = pd.read_csv(r"data/preprocessing/data.csv")

# Encoder instances shared by the next cells.
le, ohe = LabelEncoder(), OneHotEncoder()

# Slice (rather than index) the first column so the result stays 2-D: one
# country code per row, one column wide.
country = data.iloc[:, :1].values
country

array([['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['tr'],
       ['us'],
       ['us'],
       ['us'],
       ['us'],
       ['us'],
       ['us'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr'],
       ['fr']], dtype=object)

In [7]:
# Map each country code to an integer label, then reshape the flat result into
# a single column (n_rows, 1), the layout the one-hot encoder is given next.
country = le.fit_transform(data.iloc[:, 0]).reshape(-1, 1)
country

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [8]:
# Learn the set of integer labels, then expand each one into its own 0/1
# column. The encoder returns a sparse matrix, so convert it to a dense array.
ohe.fit(country)
country = ohe.transform(country).toarray()
country

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

# Data Merging


In [9]:
# Name the one-hot columns from the fitted LabelEncoder instead of a
# hard-coded list. `le.classes_` holds the category names in the same sorted
# order as the integer codes that were one-hot encoded, so the labels stay
# attached to the right columns even if the set of countries changes.
# For this dataset it evaluates to ["fr", "tr", "us"].
country_df = pd.DataFrame(data=country, columns=le.classes_)
country_df

Unnamed: 0,fr,tr,us
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
5,0.0,1.0,0.0
6,0.0,1.0,0.0
7,0.0,1.0,0.0
8,0.0,1.0,0.0
9,0.0,0.0,1.0


In [10]:
# Wrap the imputed numeric array (produced in the missing-values cell) in a
# frame with named columns: height, weight, age.
numeric_df = pd.DataFrame(numeric_data, columns=["boy", "kilo", "yas"])
numeric_df

Unnamed: 0,boy,kilo,yas
0,130.0,30.0,10.0
1,125.0,36.0,11.0
2,135.0,34.0,10.0
3,133.0,30.0,9.0
4,129.0,38.0,12.0
5,180.0,90.0,30.0
6,190.0,80.0,25.0
7,175.0,90.0,35.0
8,177.0,60.0,22.0
9,185.0,105.0,33.0


In [11]:
# The last column of the dataset is the gender label; it is kept in its own
# frame and used as the target in the train/test split.
gender_data = data.iloc[:, -1].values
gender_df = pd.DataFrame(gender_data, columns=["cinsiyet"])
gender_df

Unnamed: 0,cinsiyet
0,e
1,e
2,k
3,k
4,e
5,e
6,e
7,e
8,k
9,e


In [12]:
# Build the feature matrix by placing the one-hot country columns next to the
# numeric columns (column-wise concatenation, rows aligned on the index).
final_data = pd.concat(objs=[country_df, numeric_df], axis="columns")
final_data

Unnamed: 0,fr,tr,us,boy,kilo,yas
0,0.0,1.0,0.0,130.0,30.0,10.0
1,0.0,1.0,0.0,125.0,36.0,11.0
2,0.0,1.0,0.0,135.0,34.0,10.0
3,0.0,1.0,0.0,133.0,30.0,9.0
4,0.0,1.0,0.0,129.0,38.0,12.0
5,0.0,1.0,0.0,180.0,90.0,30.0
6,0.0,1.0,0.0,190.0,80.0,25.0
7,0.0,1.0,0.0,175.0,90.0,35.0
8,0.0,1.0,0.0,177.0,60.0,22.0
9,0.0,0.0,1.0,185.0,105.0,33.0


# Train/Test Split


In [13]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    final_data,
    gender_df,
    test_size=0.33,
    random_state=0,
)

# Feature Scaling


In [14]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so the test rows never influence it.
sc = StandardScaler()
sc.fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)