<a href="https://colab.research.google.com/github/ku1esh00v/AI_Machine_Learning_5/blob/main/AI_%26_ML_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Лабораторная работа 5. Разработка единого шаблона предварительной обработки данных**

**Подключение библиотек**

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

**Загрузка данных и разделение на матрицу признаков и зависимую переменную**

In [19]:
# Same dataset layout as before, but with self-generated values and country
# names, plus one additional missing value.
dataset = pd.read_csv(filepath_or_buffer='/content/dataset.csv')
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,Russia,65.0,31000.0,Yes
1,France,39.0,26000.0,No
2,UK,35.0,34000.0,No
3,USA,45.0,51000.0,No
4,UK,37.0,,Yes


In [20]:
# Split the frame into the feature matrix (every column except the last) and
# the dependent variable (the last column). The target is addressed as -1 so
# that it always agrees with the `:-1` slice used for the features.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print("Матрица признаков")
print(X)
print("Зависимая переменная")
print(y)

Матрица признаков
[['Russia' 65.0 31000.0]
 ['France' 39.0 26000.0]
 ['UK' 35.0 34000.0]
 ['USA' 45.0 51000.0]
 ['UK' 37.0 nan]
 ['France' nan 49000.0]
 ['USA' 47.0 52000.0]
 ['France' 48.0 63000.0]
 ['UK' 45.0 89000.0]
 ['France' 39.0 65000.0]]
Зависимая переменная
['Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No']


**Обработка пропущенных значений**

In [21]:
# Deprecated approach, kept for reference only. `Imputer` cannot be imported
# from `sklearn.preprocessing` in the installed scikit-learn, so the import
# error is caught and printed instead of aborting a "Restart & Run All".
try:
    from sklearn.preprocessing import Imputer
except ImportError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Legacy path: only reached on a scikit-learn that still ships Imputer.
    imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
    imputer = imputer.fit(X[:, 1:3])
    X_without_nan = X.copy()
    X_without_nan[:, 1:3] = imputer.transform(X[:, 1:3])
    print(X_without_nan)

ImportError: cannot import name 'Imputer' from 'sklearn.preprocessing' (/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/__init__.py)

In [22]:
# Current transformer class; the older Imputer is obsolete.
from sklearn.impute import SimpleImputer

# Replace NaN in columns 1 and 2 with the mean of the respective column,
# working on a copy so that X itself keeps its gaps.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X_without_nan = X.copy()
X_without_nan[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X_without_nan

array([['Russia', 65.0, 31000.0],
       ['France', 39.0, 26000.0],
       ['UK', 35.0, 34000.0],
       ['USA', 45.0, 51000.0],
       ['UK', 37.0, 51111.11111111111],
       ['France', 44.44444444444444, 49000.0],
       ['USA', 47.0, 52000.0],
       ['France', 48.0, 63000.0],
       ['UK', 45.0, 89000.0],
       ['France', 39.0, 65000.0]], dtype=object)

**Обработка категориальных данных**

Замена категории кодом (LabelEncoder)

In [23]:
from sklearn.preprocessing import LabelEncoder

# Take the raw labels from the frame (last column) instead of from `y`:
# this cell overwrites `y`, so reading `y` here would show already-encoded
# values as "before" whenever the cell is executed a second time.
y_raw = dataset.iloc[:, -1].values

labelencoder_y = LabelEncoder()
print("Зависимая переменная до обработки")
print(y_raw)
# Each distinct label is replaced with an integer code.
y = labelencoder_y.fit_transform(y_raw)
print("Зависимая переменная после обработки")
print(y)

Зависимая переменная до обработки
['Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No']
Зависимая переменная после обработки
[1 0 0 0 1 1 1 1 0 0]


**Применение OneHotEncoder**

In [24]:
# Deprecated way of using OneHotEncoder, kept for reference only. The
# installed scikit-learn rejects the `categorical_features` argument, so the
# resulting TypeError is caught and printed instead of aborting a
# "Restart & Run All".
from sklearn.preprocessing import OneHotEncoder

# Integer-encode the categorical column (index 0) on a copy of the imputed data.
labelencoder_X = LabelEncoder()
X_encoded = X_without_nan.copy()
X_encoded[:, 0] = labelencoder_X.fit_transform(X_encoded[:, 0])

try:
    onehotencoder = OneHotEncoder(categorical_features = [0])
except TypeError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Legacy path: only reached on a scikit-learn that still accepts the argument.
    X_encoded = onehotencoder.fit_transform(X_encoded).toarray()
    print("Перекодировка категориального признака")
    print(X_encoded)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'categorical_features'

In [25]:
# Keep a separate "dirty" copy of the features: the missing values are still
# present and the categorical column is not encoded.
X_dirty = X.copy(order='C')
X_dirty

array([['Russia', 65.0, 31000.0],
       ['France', 39.0, 26000.0],
       ['UK', 35.0, 34000.0],
       ['USA', 45.0, 51000.0],
       ['UK', 37.0, nan],
       ['France', nan, 49000.0],
       ['USA', 47.0, 52000.0],
       ['France', 48.0, 63000.0],
       ['UK', 45.0, 89000.0],
       ['France', 39.0, 65000.0]], dtype=object)

In [26]:
# Modern way of transforming the features: a single ColumnTransformer applies
# a dedicated transformer to each group of columns.
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# List of (name, transformer, column indices) triples.
transformers = [
    # column 0 is categorical -> one-hot encode it
    ('onehot', OneHotEncoder(), [0]),
    # columns 1 and 2 are numeric -> fill NaN with the column mean
    ('imp', SimpleImputer(missing_values=np.nan, strategy='mean'), [1, 2])
]

# sparse_threshold=0 makes the result a dense array regardless of how sparse
# the one-hot part is, so the output type does not depend on the data.
ct = ColumnTransformer(transformers, sparse_threshold=0)

# Fit the transformers on the raw features and transform them in one step.
X_transformed = ct.fit_transform(X_dirty)
print(X_transformed.shape)
X_transformed

(10, 6)


array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50000000e+01, 3.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.90000000e+01, 2.60000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        3.50000000e+01, 3.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        4.50000000e+01, 5.10000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        3.70000000e+01, 5.11111111e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.44444444e+01, 4.90000000e+04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        4.70000000e+01, 5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.80000000e+01, 6.30000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        4.50000000e+01, 

In [27]:
# The transformed multi-dimensional array can be wrapped back into a DataFrame
# by supplying one label per output column.
X_data = pd.DataFrame(
    data=X_transformed,
    columns=[
        'C1', 'C2', 'C3', 'C4',
        'Age', 'Salary',
    ],
)
X_data

Unnamed: 0,C1,C2,C3,C4,Age,Salary
0,0.0,1.0,0.0,0.0,65.0,31000.0
1,1.0,0.0,0.0,0.0,39.0,26000.0
2,0.0,0.0,1.0,0.0,35.0,34000.0
3,0.0,0.0,0.0,1.0,45.0,51000.0
4,0.0,0.0,1.0,0.0,37.0,51111.111111
5,1.0,0.0,0.0,0.0,44.444444,49000.0
6,0.0,0.0,0.0,1.0,47.0,52000.0
7,1.0,0.0,0.0,0.0,48.0,63000.0
8,0.0,0.0,1.0,0.0,45.0,89000.0
9,1.0,0.0,0.0,0.0,39.0,65000.0
