## Linear regression on categorical variable using one-hot and dummy coding

In [5]:
import pandas as pd
from sklearn import linear_model

df = pd.DataFrame({ 'City': ['SF', 'SF', 'SF', 'NYC', 'NYC', 'NYC', 'Seattle', 'Seattle', 'Seattle'],
                    'Rent': [3999, 4000, 4001, 3499, 3500, 3501, 2499, 2500, 2501]})
df['Rent'].mean()

3333.3333333333335

### Converting the categorical variables in the DataFrame to one-hot encoding


In [2]:
one_hot_df = pd.get_dummies(df, prefix=['city'])
one_hot_df

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,False,True,False
1,4000,False,True,False
2,4001,False,True,False
3,3499,True,False,False
4,3500,True,False,False
5,3501,True,False,False
6,2499,False,False,True
7,2500,False,False,True
8,2501,False,False,True


### Fitting a linear regression model

In [8]:
model = linear_model.LinearRegression()
model.fit(one_hot_df[['city_NYC', 'city_SF', 'city_Seattle']], one_hot_df['Rent'])
model.coef_

array([ 166.66666667,  666.66666667, -833.33333333])

In [9]:
model.intercept_

3333.3333333333335

### Training a linear regression model on dummy code

In [10]:
dummy_df = pd.get_dummies(df, prefix=['city'], drop_first=True)
dummy_df

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,True,False
1,4000,True,False
2,4001,True,False
3,3499,False,False
4,3500,False,False
5,3501,False,False
6,2499,False,True
7,2500,False,True
8,2501,False,True


In [11]:
model.fit(dummy_df[['city_SF', 'city_Seattle']], dummy_df['Rent'])
model.coef_

array([  500., -1000.])

In [12]:
model.intercept_

3500.0

In [15]:
effect_df = dummy_df.copy()
effect_df.loc[3:5, ['city_SF', 'city_Seattle']] = -1.0
effect_df

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,True,False
1,4000,True,False
2,4001,True,False
3,3499,-1.0,-1.0
4,3500,-1.0,-1.0
5,3501,-1.0,-1.0
6,2499,False,True
7,2500,False,True
8,2501,False,True


In [16]:
model.fit(effect_df[['city_SF', 'city_Seattle']], effect_df['Rent'])
model.coef_

array([ 666.66666667, -833.33333333])

In [17]:
model.intercept_

3333.3333333333335

## Ordinal Encoding


In [21]:
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

data = asarray([["green"], ["red"], ["blue"]])

print(data)

encoder = OrdinalEncoder()

results = encoder.fit_transform(data)

results

[['green']
 ['red']
 ['blue']]


array([[1.],
       [2.],
       [0.]])

## One-Hot Encoding

In [23]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)

results = encoder.fit_transform(data)

results


array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

## Dummy Variable Encoding

In [27]:
encoder = OneHotEncoder(drop="first", sparse_output=False)

results = encoder.fit_transform(data)

results

array([[1., 0.],
       [0., 1.],
       [0., 0.]])

# Breast Cancer Dataset

In [43]:
from pandas import read_csv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"

dataset = read_csv(url, header=None)

data = dataset.values

X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

print("Input: ", X.shape)
print("Output: ", y.shape)

ordinal = OrdinalEncoder()

X = ordinal.fit_transform(X)

label = LabelEncoder()

y = label.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

ordinal.fit(X_train)
X_train = ordinal.transform(X_train)
X_test = ordinal.transform(X_test)

label.fit(y_train)
y_train = label.transform(y_train)
y_test = label.transform(y_test)

model = LogisticRegression()
model.fit(X_train, y_train)

yhat = model.predict(X_test)

accuracy = accuracy_score(y_test, yhat)

print('Accuracy: %.2f' % (accuracy * 100))

Input:  (286, 9)
Output:  (286,)
Accuracy: 75.79


### OneHotEncoder Transform

In [44]:
onehot_encoder = OneHotEncoder(sparse_output=False)
X = onehot_encoder.fit_transform(X)

print("Input: ", X.shape)

onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)


model.fit(X_train, y_train)

yhat = model.predict(X_test)

accuracy = accuracy_score(y_test, yhat)

print('Accuracy: %.2f' % (accuracy * 100))

Input:  (286, 43)
Accuracy: 70.53
