# Data Science and Visualization (RUC F2023)

## Lecture 8: Clustering II

 # One-Hot-Encoding 

In [1]:
import mglearn
import pandas as pd

## 1. Original Data

In [2]:
# Location of the adults CSV file; adjust this to wherever the file lives on your machine.
ADULTS_CSV = 'C:/Data/adults.csv'

# The first column of the file holds the row labels, so it becomes the DataFrame index.
data = pd.read_csv(ADULTS_CSV, index_col=0)
data.head()

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [3]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(32561, 7)

Values of gender:

In [4]:
# Frequency of each distinct value in the gender column.
data['gender'].value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

Values of income:

In [6]:
# Frequency of each distinct value in the income column (the prediction target).
data['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [7]:
# Column labels of the loaded table, in file order.
data.columns

Index(['age', 'workclass', 'education', 'gender', 'hours-per-week',
       'occupation', 'income'],
      dtype='object')

### Feature selection

We select all attributes but the last one (`income`), which is the target we want to predict.

In [5]:
# Features are every column except the target. Dropping 'income' by name avoids
# the positional magic number 0:6 and stays correct if columns are added or reordered.
X = data.drop(columns=['income'])
X.head()

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation
0,39,State-gov,Bachelors,Male,40,Adm-clerical
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial
2,38,Private,HS-grad,Male,40,Handlers-cleaners
3,53,Private,11th,Male,40,Handlers-cleaners
4,28,Private,Bachelors,Female,40,Prof-specialty


In [6]:
# Target vector: the raw income labels as an array of strings
# (note that the labels carry a leading space, e.g. ' <=50K').
y = data['income'].values
y

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

### Train a model

We may want to train a classification model, e.g., a logistic regression model, to predict the income of a new adult.

However, training such a model on the original data fails because of the data types: logistic regression needs numeric input, but several columns (e.g., *workclass*) contain strings:

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

logreg = LogisticRegression(max_iter=10000)
try:
    # Fitting on the raw table is expected to fail: the string-valued columns
    # cannot be converted to floats.
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    print(y_pred)
    print("Test score: {:.2f}".format(logreg.score(X_test, y_test)))
except ValueError as err:
    # The error is the point of this cell; show it without aborting the notebook,
    # so "Restart & Run All" can continue to the one-hot-encoding section.
    print("ValueError: {}".format(err))

ValueError: could not convert string to float: ' Private'

## 2. One-Hot-Encoding

Apply the **pd.get_dummies(.)** function on the whole dataset and store the result in `data_dummies`:

In [11]:
# One-hot encode every string-valued column; numeric columns (age, hours-per-week)
# pass through unchanged.
# dtype=int keeps the indicator columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce True/False booleans).
data_dummies = pd.get_dummies(data, dtype=int)
data_dummies.columns

Index(['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov',
       'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th',
       'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
       'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
       'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors',
       'education_ Doctorate', 'education_ HS-grad', 'education_ Masters',
       'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'gender_ Female', 'gender_ Male',
       'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces',
       'occupation_ Craft-repair', 'occupation_ Exec-managerial',
       'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners',
       'occupation_ Machine-op-inspct', 'occupation_ Other-service',
   

In [12]:
# Dimensions after encoding: same number of rows, one column per numeric
# feature plus one indicator column per category value.
data_dummies.shape

(32561, 46)

In [13]:
# Preview the first rows of the encoded table.
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


### Feature selection on the encoded data 

In [14]:
# Predictors: every column from 'age' through the last occupation indicator,
# i.e. everything except the two income_* columns.
# (Encoded column names contain a space after '_' because the raw category
# values start with a space.)
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']

# Target: the binary indicator column for the ' >50K' income class.
y = data_dummies['income_ >50K'].values
# Predictor matrix as a plain NumPy array.
X = features.values

print("X.shape: {}  y.shape: {}".format(X.shape, y.shape))

X.shape: (32561, 44)  y.shape: (32561,)


### Modelling on the encoded data

In [15]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# The train/test split is done after one-hot-encoding, so both parts share
# the same encoded column layout.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit on the training part, then predict the held-out part.
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(y_pred)

# Test accuracy (metrics.accuracy_score(y_test, y_pred) gives the same number).
test_accuracy = logreg.score(X_test, y_test)
print("Test score: {:.2f}".format(test_accuracy))

[0 0 0 ... 0 1 0]
Test score: 0.81


## 3. One-Hot-Encoding on Selected Columns

This example shows one-hot-encoding on selected columns instead of a whole dataset.

In [9]:
# Toy table with an integer-coded feature (Gender) and a categorical string
# feature (Department), written one row per person.
staff_rows = [
    ('Alex Adam', 1, 'IMT'),
    ('Babara Brian', 0, 'IKH'),
    ('Cindy Carlsen', 0, 'INM'),
    ('David Dickens', 1, 'ISE'),
]
demo_df = pd.DataFrame(staff_rows, columns=['Name', 'Gender', 'Department'])
demo_df

Unnamed: 0,Name,Gender,Department
0,Alex Adam,1,IMT
1,Babara Brian,0,IKH
2,Cindy Carlsen,0,INM
3,David Dickens,1,ISE


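If we apply one-hot-encoding on the whole dataset, we run into two problems:
* *Gender* is left unencoded: it is stored as integers, and `get_dummies` only encodes string (object/categorical) columns by default.
* *Name* is unnecessarily transformed: every name becomes its own indicator column.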

In [10]:
# Encode the whole frame: the string columns Name and Department are expanded
# into indicator columns, while the integer column Gender passes through as-is.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas >= 2.0 defaults to True/False booleans).
pd.get_dummies(demo_df, dtype=int)

Unnamed: 0,Gender,Name_Alex Adam,Name_Babara Brian,Name_Cindy Carlsen,Name_David Dickens,Department_IKH,Department_IMT,Department_INM,Department_ISE
0,1,1,0,0,0,0,1,0,0
1,0,0,1,0,0,1,0,0,0
2,0,0,0,1,0,0,0,1,0
3,1,0,0,0,1,0,0,0,1


We want to treat the numeric feature *Gender* as a string feature so that it can be one-hot-encoded:

In [11]:
# Recast the integer-coded Gender column as strings so that it is handled
# like a categorical feature.
demo_df['Gender'] = demo_df['Gender'].astype(str)

We specify the columns to be included for the one-hot-encoding:

In [12]:
# Encode only the listed columns; Name is not listed and is therefore kept unchanged.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas >= 2.0 defaults to True/False booleans).
pd.get_dummies(demo_df, columns=['Gender', 'Department'], dtype=int)

Unnamed: 0,Name,Gender_0,Gender_1,Department_IKH,Department_IMT,Department_INM,Department_ISE
0,Alex Adam,0,1,0,1,0,0
1,Babara Brian,1,0,1,0,0,0
2,Cindy Carlsen,1,0,0,0,1,0
3,David Dickens,0,1,0,0,0,1
