The goal is to convert categorical features into numeric values that a machine-learning model can use.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.compose import make_column_selector as col_selector

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("population.csv")

# Optional quick look at the first rows and the (rows, columns) shape:
#print(df.head())
#print(df.shape)

In [3]:
# List the column names of the loaded dataframe.
print(df.columns)
# Uncomment to also see the dtype of each column:
#print(df.dtypes)

Index(['Age', 'Job Type', 'Final Weight', 'Education', 'Education Number',
       'Marital Status', 'Job Title', 'Relationship', 'Race', 'Gender',
       'Capital Gain', 'Capital Loss', 'Hours per week', 'Country', 'Income'],
      dtype='object')


In [4]:
#df.isnull().sum()

In [5]:
# Frequency of each distinct value in a column.
# Uncomment one of the lines below to inspect a different column.
#print(df["Job Type"].value_counts())
#print(df["Country"].value_counts())
#print(df["Job Title"].value_counts())
#print(df["Marital Status"].value_counts())
print(df["Race"].value_counts())

 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: Race, dtype: int64


In [6]:
# Relabel the "?" entries (presumably placeholders for unknown values -- confirm
# against the data source) as "Other" so they form an ordinary category.
# regex=False makes pandas treat "?" as plain text: "?" is a regular-expression
# quantifier, and the default of `regex` differs between pandas versions.
for column in ("Job Type", "Country", "Job Title"):
    df[column] = df[column].str.replace("?", "Other", regex=False)

#print(df["Job Type"].unique())

In [7]:
# Separate the target column from the predictors.
dfy = df["Income"]
df = df.drop(columns=["Income"])

# Independent (deep) copy of the predictors; it is modified later by the
# cat.codes conversion while `df` itself stays untouched.
df1 = df.copy()


### Creating categorical_cols and int_cols objects

We use `col_selector` (scikit-learn's `make_column_selector`) to do this.
Calling the selector objects on the dataframe gives us the lists `categorical_features` and `numeric_features`.

In [8]:
# Build one selector per dtype: `object` columns hold the categorical
# (string) data, `int64` columns hold the numeric data.
categorical_cols_obj = col_selector(dtype_include=object)
int_cols_obj = col_selector(dtype_include="int64")

# Calling a selector on a dataframe returns the matching column names as a list.
categorical_features = categorical_cols_obj(df)
numeric_features = int_cols_obj(df)

# Show the type of the selector object and of the value it returns,
# followed by the two lists of column names.
print(type(categorical_cols_obj))
print(type(categorical_features))
print(categorical_features, numeric_features, sep="\n")

<class 'sklearn.compose._column_transformer.make_column_selector'>
<class 'list'>
['Job Type', 'Education', 'Marital Status', 'Job Title', 'Relationship', 'Race', 'Gender', 'Country']
['Age', 'Final Weight', 'Education Number', 'Capital Gain', 'Capital Loss', 'Hours per week']


### Creating data frame with categorical_features

Creating an OrdinalEncoder object and performing a fit_transform on the df_cat dataframe.

In the below example, we are not scaling columns with numeric_features. 

We use `np.hstack` to place the encoded categorical columns side by side with the numeric columns.

Since our target variable is categorical, we encode it with LabelEncoder.

In [9]:
# Ordinal-encode the categorical columns: each category is mapped to a number.
# Note that the encoder is fit on all rows, before the train/test split.
df_cat = df[categorical_features]
ob_encoded = OrdinalEncoder()
cat_encoded = ob_encoded.fit_transform(df_cat)

# Append the (unscaled) numeric columns to the right of the encoded ones.
numeric_values = df[numeric_features].to_numpy()
x = np.hstack((cat_encoded, numeric_values))
print(x.shape)

# Encode the target labels as integer class ids.
yb = preprocessing.LabelEncoder()
y = yb.fit_transform(dfy)

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

(32561, 14)


### Pandas cat.codes

Using Pandas `cat.codes` for categorical feature conversion.
Then using StandardScaler for standardizing the numeric features.

In [10]:
"""
Recall - we already have categorical features and numeric_features from the previous cell.
Here it is again.


categorical_cols_obj = col_selector(dtype_include=object)
print(type(categorical_cols_obj))
categorical_features = categorical_cols_obj(df)
print(type(categorical_features))

int_cols_obj = col_selector(dtype_include="int64")
numeric_features = int_cols_obj(df)

print(categorical_features)
print(numeric_features)

"""


# Replace every categorical column with the integer codes pandas assigns to
# its categories (convert to the `category` dtype, then take `.cat.codes`).
for column in categorical_features:
    df1[column] = df1[column].astype('category').cat.codes


# Standardize only the numeric columns; the category codes are used as-is.
# Note that the scaler is fit on all rows, before the train/test split.
sc = StandardScaler()

x = np.hstack([df1[categorical_features], sc.fit_transform(df1[numeric_features])])

print(x.shape)


"""
Recall

our y is 

yb = preprocessing.LabelEncoder()
y = yb.fit_transform(dfy)
"""


# Hold out 20% of the rows for testing and train on the remainder.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(max_iter=500).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy: the fraction of test rows that were classified correctly.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

(32561, 14)
Accuracy: 0.8277291570704745


### Using Standard Scaler and make_pipeline

Performing StandardScaler only on numeric columns and then using 
make_pipeline.

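Syntax: make_pipeline(transformer_1, ..., transformer_n, model) — the steps are passed as separate positional arguments (not as a list), and the final step is the model.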

The numeric columns in our dataset are:
'Age', 'Final Weight', 'Education Number', 'Capital Gain', 'Capital Loss', 'Hours per week'

In [11]:
# The numeric columns, spelled out explicitly.
lst = [
    'Age',
    'Final Weight',
    'Education Number',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]

# Use only the numeric columns as predictors; hold out 20% for testing.
x_train, x_test, y_train, y_test = train_test_split(
    df[lst], y, test_size=0.2, random_state=1
)

# Scaler instance that becomes the first step of the pipeline.
st = StandardScaler()

"""
Example of Pipeline. Here we have to give the name of the estimator as a string 
followed by the estimator. 
In make_pipeline, we can just provide the list of estimators. 


clf = Pipeline([('preprocessor', preprocessor),
                  ('classifier', LogisticRegression(max_iter=500))])
"""


# Chain the scaler and the classifier; fitting the pipeline fits the scaler on
# the training rows only and then trains the model on the scaled values.
mpipe = make_pipeline(st, LogisticRegression()).fit(x_train, y_train)

# predict() applies the already-fitted scaler to the test rows before classifying.
ym_pred = mpipe.predict(x_test)

accuracy = metrics.accuracy_score(y_test, ym_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8182097343773991


### Column Transform  

#### Encoding using OrdinalEncoder and scaling the numeric columns

Performing StandardScaler and OrdinalEncoder 
on numeric columns and categorical columns respectively and 
then using make_pipeline.


In [12]:
np.random.seed(0)

X = df

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Each transformer entry is (name, transformer, columns it applies to):
# standardize the numeric columns, ordinal-encode the categorical ones.
num_step = ('num', StandardScaler(), numeric_features)
cat_step = ('cat', OrdinalEncoder(), categorical_features)
preprocessor = ColumnTransformer(transformers=[num_step, cat_step])

# Alternative using Pipeline, which requires naming every step:
#clf = Pipeline(steps=[('preprocessor', preprocessor),
#                      ('classifier', LogisticRegression())])
clf = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.818


### OneHotEncoder


Performing StandardScaler and OneHotEncoder 
on numeric columns and categorical columns respectively and 
then using make_pipeline.


In [13]:
np.random.seed(0)


X = df
print(X.shape)


# Each transformer entry is (name, transformer, columns it applies to):
# standardize the numeric columns, one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, so scoring does not fail when a rare category
# ends up only in the test split.
preprocessor = ColumnTransformer(
               transformers=[
               ('num', StandardScaler(), numeric_features),
               ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])


# Alternative using Pipeline, which requires naming every step:
#clf = Pipeline(steps=[('preprocessor', preprocessor),
#                      ('classifier', LogisticRegression())])

clf = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

(32561, 14)
model score: 0.847
