In [65]:
import numpy as np
import pandas as pd
from io import StringIO
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler ,OneHotEncoder, StandardScaler

## 4.1

In [8]:
csv_data = '''A, B, C, D
              1.0, 2.0, 3.0, 4.0
              5.0, 6.0,, 8.0
              10.0, 11.0, 12.0,'''
df = pd.read_csv(StringIO(csv_data))
print(df)
print(df.isnull().sum())
print(df.dropna())
print(df.dropna(axis=1))

imr = SimpleImputer(missing_values=np.nan, strategy="mean")
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
print(imputed_data)

print(df.fillna(df.mean()))

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN
A     0
 B    0
 C    1
 D    1
dtype: int64
     A    B    C    D
0  1.0  2.0  3.0  4.0
      A     B
0   1.0   2.0
1   5.0   6.0
2  10.0  11.0
[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]
      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   7.5  8.0
2  10.0  11.0  12.0  6.0


## 4.2

In [41]:
df = pd.DataFrame([
    ["green", "M", 10.1, "class2"],
    ["red", "L", 13.5, "class1"],
    ["blue", "XL", 15.3, "class2"]
])
df.columns = ["color", "size", "price", "classlabel"]
print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


In [42]:
size_mapping = {"XL": 3, "L": 2, "M": 1}
df["size"] = df["size"].map(size_mapping)
print(df)
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df["size"].map(inv_size_mapping)
print(df)

   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2
   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2


In [44]:
class_mapping = {label: idx for idx, label in enumerate(np.unique(df["classlabel"]))}
print(class_mapping)
df["classlabel"] = df["classlabel"].map(class_mapping)
print(df)

inv_class_mappping = {v: k for k, v in class_mapping.items()}
df["classlabel"] = df["classlabel"].map(inv_class_mappping)
print(df)

class_le = LabelEncoder()
y = class_le.fit_transform(df["classlabel"].values)
print(y)
print(class_le.inverse_transform(y))

{'class1': 0, 'class2': 1}
   color  size  price  classlabel
0  green     1   10.1           1
1    red     2   13.5           0
2   blue     3   15.3           1
   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2
[1 0 1]
['class2' 'class1' 'class2']


In [52]:
X = df[["color", "size", "price"]].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
print(X)

X = df[["color", "size", "price"]].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()

# X = df[["color", "size", "price"]].values
c_transf = ColumnTransformer([("onhot", OneHotEncoder(), [0]), ("nothing", "passthrough", [1, 2])])
print(c_transf.fit_transform(X).astype(float))

print(pd.get_dummies(df[["price", "color", "size"]]))

print(pd.get_dummies(df[["price", "color", "size"]], drop_first=True))

[[1 1 10.1]
 [2 2 13.5]
 [0 3 15.3]]
[[ 0.   1.   0.   1.  10.1]
 [ 0.   0.   1.   2.  13.5]
 [ 1.   0.   0.   3.  15.3]]
   price  size  color_blue  color_green  color_red
0   10.1     1           0            1          0
1   13.5     2           0            0          1
2   15.3     3           1            0          0
   price  size  color_green  color_red
0   10.1     1            1          0
1   13.5     2            0          1
2   15.3     3            0          0


## 4.3

In [62]:
df_wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
df_wine.columns = ["Class label", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phnols", "Flavanoids", "Nonflavanoid phnols", "Proanthocyanins", "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"]
print("Class labels", np.unique(df_wine["Class label"]))
print(df_wine.head())

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

Class labels [1 2 3]
   Class label  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0            1    14.23        1.71  2.43               15.6        127   
1            1    13.20        1.78  2.14               11.2        100   
2            1    13.16        2.36  2.67               18.6        101   
3            1    14.37        1.95  2.50               16.8        113   
4            1    13.24        2.59  2.87               21.0        118   

   Total phnols  Flavanoids  Nonflavanoid phnols  Proanthocyanins  \
0          2.80        3.06                 0.28             2.29   
1          2.65        2.76                 0.26             1.28   
2          2.80        3.24                 0.30             2.81   
3          3.85        3.49                 0.24             2.18   
4          2.80        2.69                 0.39             1.82   

   Color intensity   Hue  OD280/OD315 of diluted wines  Proline  
0             5.64  1.04                       

## 4.4

In [66]:
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
