In [1]:
### Tutorial Overview
# 1. Nominal and Ordinal Variables
# 2. Encoding Categorical Data
# 3. Breast Cancer Data
# 4. OrdinalEncoder Transform
# 5. OneHotEncoder Transform

### Encoding Categorical Data

In [2]:
# Ordinal Encoding
# One Hot Encoding
# Dummy Variable Encoding

#### Ordinal Encoding

In [3]:
import numpy as np
import pandas as pd

# Read the raw table; header=None makes pandas treat the first row as data
# rather than as column names.
dataset = pd.read_csv('../dataset/breast-cancer.csv', header=None)

# Work on the underlying NumPy array from here on.
data = dataset.to_numpy()

# Every column except the last is an input feature; the last column is used
# as the target.
X, y = data[:, :-1], data[:, -1]

# Sanity-check the dimensions of the split.
print(X.shape, y.shape)


(286, 9) (286,)


In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

# Reload the table so this cell does not depend on arrays left over from
# earlier cells.
dataset = pd.read_csv('../dataset/breast-cancer.csv', header=None)
data = dataset.to_numpy()

# Features are cast to strings so every column is a uniform set of category
# labels; the last column is kept as the target.
X, y = data[:, :-1].astype('str'), data[:, -1]
print(X.shape, y.shape)

# One encoder for the input columns, one for the target column.
ordinal_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()

# Replace each category with an integer code. The encoders are fitted on the
# whole dataset here because this cell only demonstrates the transform.
X = ordinal_encoder.fit_transform(X)
y = label_encoder.fit_transform(y)

# Shapes after encoding, one per line.
print(X.shape, y.shape, sep='\n')

# A few encoded feature rows and the first encoded targets.
print(X[2:5, :])
print(y[:5])

(286, 9) (286,)
(286, 9)
(286,)
[[3. 0. 6. 0. 0. 1. 0. 1. 0.]
 [2. 2. 6. 0. 1. 2. 1. 1. 1.]
 [2. 2. 5. 4. 1. 1. 0. 4. 0.]]
[1 0 1 0 1]


In [7]:
## Evaluating the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the raw table and separate the string-typed features from the target
# column (the last one).
dataset = pd.read_csv('../dataset/breast-cancer.csv', header=None)
data = dataset.to_numpy()
X, y = data[:, :-1].astype('str'), data[:, -1]

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Fit both encoders on the training split only, so nothing about the test
# split influences the learned category-to-integer mapping.
ordinal_enc = OrdinalEncoder().fit(X_train)
label_enc = LabelEncoder().fit(y_train)

# Apply the fitted mappings to both splits.
X_train, X_test = ordinal_enc.transform(X_train), ordinal_enc.transform(X_test)
y_train, y_test = label_enc.transform(y_train), label_enc.transform(y_test)

# Train the classifier on the encoded training data and predict the test rows.
model = LogisticRegression().fit(X_train, y_train)
yhat = model.predict(X_test)

# Report accuracy as a percentage.
accr = 100 * accuracy_score(y_test, yhat)
print("Accuracy : %.3f" % accr)

Accuracy : 75.789


#### One Hot Encoding

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Reload the table so this cell does not depend on arrays left over from
# earlier cells.
dataset = pd.read_csv('../dataset/breast-cancer.csv', header=None)

data = dataset.values

# Features are cast to strings so every column is a uniform set of category
# labels; the last column is kept as the target.
X = data[:, :-1].astype('str')
y = data[:, -1]

print(X.shape, y.shape)

# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed to `sparse_output`). To stay compatible with both old and new
# releases, keep the encoder's default sparse output and densify it explicitly
# with `.toarray()` instead of passing either keyword.
onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

# Each feature column expands into one binary column per category. The
# encoders are fitted on the whole dataset here because this cell only
# demonstrates the transform.
X = onehot_encoder.fit_transform(X).toarray()
y = label_encoder.fit_transform(y)

print(X.shape)
print(y.shape)

# A few encoded feature rows and the first encoded targets.
print(X[2:5, :])
print(y[:5])

(286, 9) (286,)
(286, 43)
(286,)
[[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]]
[1 0 1 0 1]


In [10]:
# Load the raw table and separate the string-typed features from the target
# column (the last one).
dataset = pd.read_csv('../dataset/breast-cancer.csv', header=None)

data = dataset.values

X = data[:, :-1].astype('str')
y = data[:, -1]

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed to `sparse_output`). To stay compatible with both old and new
# releases, keep the encoder's default sparse output and densify it explicitly
# with `.toarray()` instead of passing either keyword.
onehot_enc = OneHotEncoder()
label_enc = LabelEncoder()

# Fit both encoders on the training split only, so nothing about the test
# split influences the learned categories.
onehot_enc.fit(X_train)
label_enc.fit(y_train)

# Apply the fitted encoders to both splits, converting the sparse one-hot
# matrices to dense arrays as the original `sparse=False` call did.
X_train = onehot_enc.transform(X_train).toarray()
X_test = onehot_enc.transform(X_test).toarray()
y_train = label_enc.transform(y_train)
y_test = label_enc.transform(y_test)

model = LogisticRegression()

model.fit(X_train, y_train)

yhat = model.predict(X_test)

# Report accuracy as a percentage.
accr = accuracy_score(y_test, yhat) * 100

print("Accuracy : %.3f" % (accr))

Accuracy : 70.526
