# Categorical Variables

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Dataset

In [None]:
# Preview the first five rows of the raw file. header=None tells pandas not to
# treat the first line as a header, so the column names are supplied explicitly.
pd.read_csv(
    "data/adult.data.txt",
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
    header=None,
    index_col=False,
    nrows=5,
)

## One-Hot-Encoding (Dummy Variables)

In [None]:
# The CSV file carries no header row: header=None stops pandas from consuming
# the first record as column labels, and "names" supplies the labels instead.
# index_col=False keeps the first column as data rather than as the index.
data = pd.read_csv(
    "data/adult.data.txt",
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
    header=None,
    index_col=False,
)

In [None]:
# Keep only a handful of columns so the example stays easy to follow.
# Re-running this cell is safe: selecting the same columns again is a no-op.
data = data.loc[
    :,
    [
        'age',
        'workclass',
        'education',
        'gender',
        'hours-per-week',
        'occupation',
        'income',
    ],
]

In [None]:
# Show the first five rows of the reduced frame.
data.head(n=5)

In [None]:
# Count how often each distinct value occurs in the "gender" column.
data.loc[:, "gender"].value_counts()

In [None]:
# Print the column names before and after encoding to show how get_dummies
# changes the set of features.
print("Original features:\n", data.columns.tolist(), "\n")

data_dummies = pd.get_dummies(data=data)

print("Features after get_dummies:\n", data_dummies.columns.tolist())

In [None]:
# Show the first five rows of the encoded frame.
data_dummies.head(n=5)

In [None]:
# Use every column except the one-hot encoded target as a feature. Excluding
# the "income_*" columns by name (instead of slicing from 'age' up to a specific
# occupation column) does not depend on the column order produced by
# get_dummies, nor on one particular category being the last feature column.
features = data_dummies.loc[:, ~data_dummies.columns.str.startswith('income_')]

# Extract NumPy arrays. The explicit float dtype yields a homogeneous numeric
# matrix even when the dummy columns are boolean (the get_dummies default in
# pandas >= 2.0), where a mixed int/bool frame would otherwise be converted to
# an object-dtype array.
X = features.to_numpy(dtype=float)
y = data_dummies['income_ >50K'].values
print(f"X.shape: {X.shape} y.shape: {y.shape}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split into training and test sets; random_state=0 fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

print(f"Test score: {logreg.score(X_test, y_test):.2f}")

## Numbers Can Encode Categoricals

In [None]:
# Small example frame: one column of integers and one column of strings.
demo_df = pd.DataFrame(
    data={
        'Integer Feature': [0, 1, 2, 1],
        'Categorical Feature': ['socks', 'fox', 'socks', 'box'],
    }
)
demo_df

In [None]:
# With no "columns" argument, get_dummies encodes only the string column and
# passes the integer column through unchanged.
pd.get_dummies(data=demo_df)

In [None]:
# Listing the columns explicitly makes get_dummies encode the integer column
# as a categorical too.
pd.get_dummies(
    data=demo_df,
    columns=['Integer Feature', 'Categorical Feature'],
)