In [50]:
import numpy as np
import pandas as pd

In [51]:
df = pd.read_csv("./Dataset/customer.csv")

In [52]:
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        50 non-null     int64 
 1   gender     50 non-null     object
 2   review     50 non-null     object
 3   education  50 non-null     object
 4   purchased  50 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


In [54]:
df['review'].value_counts()

review
Poor       18
Good       18
Average    14
Name: count, dtype: int64

In [55]:
df['education'].value_counts()

education
PG        18
School    16
UG        16
Name: count, dtype: int64

### Train Test Split

In [56]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1:]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


### Column Transformer

In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [58]:
transformer = ColumnTransformer(transformers=[
    # (name, transformer columns)
    ('trnf1', OneHotEncoder(drop='first'), ['gender']),
    ('trnf2', OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']]), ['review', 'education']),
], remainder='passthrough')

In [59]:
transformer.fit(X_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [60]:
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)



In [63]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [64]:
print(X_train)

      0    1    2     3
0   1.0  2.0  0.0  39.0
1   1.0  2.0  0.0  18.0
2   0.0  2.0  1.0  39.0
3   0.0  2.0  0.0  57.0
4   1.0  0.0  1.0  75.0
5   0.0  2.0  2.0  30.0
6   1.0  2.0  0.0  19.0
7   1.0  0.0  2.0  97.0
8   0.0  0.0  2.0  64.0
9   1.0  0.0  1.0  59.0
10  1.0  0.0  0.0  51.0
11  0.0  2.0  1.0  25.0
12  0.0  2.0  2.0  72.0
13  1.0  1.0  0.0  86.0
14  1.0  1.0  1.0  73.0
15  1.0  1.0  2.0  32.0
16  0.0  0.0  1.0  22.0
17  0.0  1.0  1.0  83.0
18  1.0  2.0  1.0  74.0
19  0.0  0.0  0.0  60.0
20  0.0  1.0  2.0  16.0
21  1.0  2.0  2.0  23.0
22  0.0  0.0  0.0  22.0
23  0.0  1.0  1.0  65.0
24  0.0  0.0  2.0  18.0
25  1.0  0.0  0.0  74.0
26  0.0  2.0  1.0  34.0
27  0.0  2.0  2.0  38.0
28  1.0  0.0  2.0  27.0
29  1.0  0.0  2.0  15.0
30  1.0  1.0  2.0  94.0
31  0.0  1.0  0.0  31.0
32  0.0  2.0  2.0  70.0
33  1.0  2.0  1.0  74.0
34  0.0  2.0  0.0  96.0
35  0.0  1.0  0.0  57.0
36  1.0  1.0  1.0  92.0
37  1.0  0.0  2.0  61.0
38  0.0  0.0  1.0  68.0
39  0.0  1.0  1.0  16.0
