# Transforming Numerical Variables

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats

## Transforming variables with the logarithm

In [None]:
import scipy.stats as stats
from sklearn.datasets import load_boston
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Load the Boston housing data from the local CSV file and preview the first rows.
data = pd.read_csv("data/boston.csv")
data.head()

In [None]:
def diagnostic_plots(df, variable):
    """Show a histogram and a normal probability (Q-Q style) plot of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to inspect.
    variable : str
        Name of the column in ``df`` to plot.
    """
    values = df[variable]
    plt.figure(figsize=(12, 4))

    # left panel: distribution of the values in 30 bins
    plt.subplot(121)
    values.hist(bins=30)

    # right panel: ordered values against the quantiles of a normal distribution
    plt.subplot(122)
    stats.probplot(values, dist="norm", plot=plt)

    plt.show()

In [None]:
# Histogram and Q-Q plot of LSTAT before any transformation.
diagnostic_plots(data, 'LSTAT')

In [None]:
# Work on a copy so that `data` keeps the original values.
data_tf = data.copy()

In [None]:
# Overwrite the four selected columns of the copy with their natural logarithm.
data_tf[['LSTAT', 'NOX', 'DIS', 'RM']] = np.log(data[['LSTAT', 'NOX', 'DIS', 'RM']])

In [None]:
# Same plots for LSTAT after the log transform.
diagnostic_plots(data_tf, 'LSTAT')

In [None]:
# The same transformation wrapped as a scikit-learn transformer.
transformer = FunctionTransformer(np.log)

In [None]:
# NOTE: this rebinds data_tf to the transform of the four-column slice only.
data_tf = transformer.transform(data[['LSTAT', 'NOX', 'DIS', 'RM']])

## Transforming variables with the reciprocal function

In [None]:
from sklearn.preprocessing import FunctionTransformer


# Reload the dataset from the CSV so `data` holds the values as stored on disk.
data = pd.read_csv("data/boston.csv")

In [None]:
def diagnostic_plots(df, variable):
    """Plot a histogram and a Q-Q plot side by side for one variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the variable.
    variable : str
        Column name of the variable to plot.
    """
    column = df[variable]
    n_rows, n_cols = 1, 2

    plt.figure(figsize=(10, 4))

    # histogram with 30 bins in the first panel
    plt.subplot(n_rows, n_cols, 1)
    column.hist(bins=30)

    # probability plot against the normal distribution in the second panel
    plt.subplot(n_rows, n_cols, 2)
    stats.probplot(column, dist="norm", plot=plt)

    plt.show()

In [None]:
# Histogram and Q-Q plot of DIS before the transformation.
diagnostic_plots(data, 'DIS')

In [None]:
# Element-wise reciprocal wrapped as a scikit-learn transformer.
transformer = FunctionTransformer(np.reciprocal)

In [None]:
# Apply the transformer to the four selected columns only.
data_tf = transformer.transform(data[['LSTAT', 'NOX', 'DIS', 'RM']])

In [None]:
# Rebuild a dataframe with the four column names, then plot the transformed DIS.
data_tf = pd.DataFrame(data_tf, columns=['LSTAT', 'NOX', 'DIS', 'RM'])
diagnostic_plots(data_tf, 'DIS')

## Using power transformations on numerical variables

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Reload the dataset from the CSV so `data` holds the values as stored on disk.
data = pd.read_csv("data/boston.csv")

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Histograms of the dataframe's columns, 30 bins each.
data.hist(bins=30, figsize=(10,10))
plt.show()

In [None]:
# Histogram and Q-Q plot of LSTAT before the transformation.
diagnostic_plots(data, 'LSTAT')

In [None]:
# Copy of the dataframe intended to hold the modified variables.
# NOTE(review): data_tf is rebound to the transformer output in the next cell
# before this copy is read, so the copy is currently unused.
data_tf = data.copy()

In [None]:
# variables that receive the power transformation
cols = ['LSTAT', 'NOX', 'DIS', 'RM']

# raise every value to the power 0.3
transformer = FunctionTransformer(lambda values: np.power(values, 0.3))

# transform the selected slice and rebuild a dataframe with the same column names
data_tf = pd.DataFrame(transformer.transform(data[cols]), columns=cols)

In [None]:
# Visualize the transformation (not in book): histogram and Q-Q plot of the
# transformed LSTAT.
diagnostic_plots(data_tf, 'LSTAT')

## Using square and cube root to transform variables

In [None]:
# Reload the dataset from the CSV so `data` holds the values as stored on disk.
data = pd.read_csv("data/boston.csv")

# variables to transform
cols = ['LSTAT', 'NOX', 'DIS', 'RM']

# element-wise square root wrapped as a scikit-learn transformer
transformer = FunctionTransformer(np.sqrt)

# Transform only the selected slice and rebuild a dataframe with the same
# column names. No up-front copy of `data` is needed: data_tf is built
# directly from the transformer output.
data_tf = pd.DataFrame(transformer.transform(data[cols]), columns=cols)

In [None]:
# Histogram and Q-Q plot of LSTAT after the square-root transform.
diagnostic_plots(data_tf, 'LSTAT')

## Example: Customer churn dataset

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# read the file and lower-case every column name in one chained expression
df = pd.read_csv('data/sampleSubmission.csv').rename(columns=str.lower)

# encode the target: 'yes' -> 1, 'no' -> 0
df['churn'] = df['churn'].replace(('yes', 'no'), (1, 0))
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# (rows, columns) of the dataframe.
df.shape

In [None]:
# Separate the predictors (every column except 'churn') from the target.
X = df.drop(['churn'], axis=1)
y = df['churn']

### Simple baseline

In [None]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Print a short evaluation report for an already fitted model.

    The report contains the model's class name, ``model.score`` on the training
    and on the test split (two decimals), the accuracy of the test predictions
    and their confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels
    """
    y_pred = model.predict(X_test)

    banner = f"------------- {type(model).__name__} -------------"
    print(banner)

    # same line format for both splits, training first
    for label, features, target in (("Training", X_train, y_train),
                                    ("Test", X_test, y_test)):
        print(f"{label} set score: {model.score(features, target):.2f}")

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# One-hot encode the categorical columns for the baseline models only.
# The encoded frame gets its own name so that X, X_train and X_test keep the
# raw columns: the feature-engineering pipeline later in the notebook picks
# its categorical columns from X_train by dtype and encodes them itself, which
# only works if they have not been replaced by dummy columns here.
X_encoded = pd.get_dummies(X)

# Split the encoded and the raw frame in a single call so both contain exactly
# the same rows; y_train / y_test are valid for either representation.
(X_train_encoded, X_test_encoded,
 X_train, X_test,
 y_train, y_test) = train_test_split(X_encoded, X, y, test_size=0.3, random_state=0)

# k-nearest neighbours baseline
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train_encoded, y_train)
get_score(knn, X_train_encoded, X_test_encoded, y_train, y_test)

# logistic regression baseline (high max_iter so the solver can converge)
log = LogisticRegression(max_iter=10000)
log.fit(X_train_encoded, y_train)
get_score(log, X_train_encoded, X_test_encoded, y_train, y_test)

# decision tree baseline
tree = DecisionTreeClassifier(max_depth=10)
tree.fit(X_train_encoded, y_train)
get_score(tree, X_train_encoded, X_test_encoded, y_train, y_test)

### With feature engineering


In [None]:
def get_total_net_minutes(df):
    """Return a copy of ``df`` with a ``total_net_minutes`` column added.

    The new column is the row-wise sum of ``total_day_minutes``,
    ``total_eve_minutes`` and ``total_night_minutes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three source columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus ``total_net_minutes``.
    """
    # assign() builds a new frame, so the caller's frame (often a slice handed
    # over by a ColumnTransformer) is not modified in place.
    return df.assign(
        total_net_minutes=df['total_day_minutes'] + df['total_eve_minutes'] + df['total_night_minutes']
    )

def get_total_net_calls(df):
    """Return a copy of ``df`` with a ``total_net_calls`` column added.

    The new column is the row-wise sum of ``total_day_calls``,
    ``total_eve_calls`` and ``total_night_calls``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three source columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus ``total_net_calls``.
    """
    # assign() builds a new frame, so the caller's frame is not modified in place.
    return df.assign(
        total_net_calls=df['total_day_calls'] + df['total_eve_calls'] + df['total_night_calls']
    )

def get_total_net_charge(df):
    """Return a copy of ``df`` with a ``total_net_charge`` column added.

    The new column is the row-wise sum of ``total_day_charge``,
    ``total_eve_charge`` and ``total_night_charge``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three source columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus ``total_net_charge``.
    """
    # assign() builds a new frame, so the caller's frame is not modified in place.
    return df.assign(
        total_net_charge=df['total_day_charge'] + df['total_eve_charge'] + df['total_night_charge']
    )

def cs_calls_per_month(df):
    """Return a copy of ``df`` with a ``cs_calls_per_month`` column added.

    The new column is
    ``(number_customer_service_calls + number_vmail_messages) / account_length``,
    computed row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``number_customer_service_calls``,
        ``number_vmail_messages`` and ``account_length``.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus ``cs_calls_per_month``.
    """
    contacts = df['number_customer_service_calls'] + df['number_vmail_messages']
    # NOTE: rows whose account_length is 0 yield inf (or NaN for 0 / 0), as
    # pandas does not raise on division by zero.
    # assign() builds a new frame, so the caller's frame is not modified in place.
    return df.assign(cs_calls_per_month=contacts / df['account_length'])

In [None]:
# One FunctionTransformer per derived feature; each step receives only the
# source columns its function reads.
feature_engineering = ColumnTransformer([
    (step_name, FunctionTransformer(feature_func, validate=False), source_columns)
    for step_name, feature_func, source_columns in (
        ('total_net_minutes', get_total_net_minutes,
         ['total_day_minutes', 'total_eve_minutes', 'total_night_minutes']),
        ('total_net_calls', get_total_net_calls,
         ['total_day_calls', 'total_eve_calls', 'total_night_calls']),
        ('total_net_charge', get_total_net_charge,
         ['total_day_charge', 'total_eve_charge', 'total_night_charge']),
        ('cs_calls_per_month', cs_calls_per_month,
         ['account_length', 'number_customer_service_calls', 'number_vmail_messages']),
    )
])

In [None]:
# Partition the training columns by dtype: object columns are treated as
# categorical, every other column as numeric.
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X_train.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
# numeric columns: fill missing entries with the imputer's default constant
numeric_transformer = SimpleImputer(strategy='constant')

# categorical columns: label missing entries as 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Three branches, each applied to its own column list:
# derived features, numeric imputation and categorical encoding.
preprocessor = ColumnTransformer([
    ('feature_engineering', feature_engineering, numeric_columns),
    ('numeric_transformers', numeric_transformer, numeric_columns),
    ('categorical_transformers', categorical_transformer, categorical_columns),
])

In [None]:
# Decision tree with the same depth limit as the baseline tree (max_depth=10).
model = DecisionTreeClassifier(max_depth=10)

# Chain the preprocessing step and the classifier into a single estimator.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [None]:
# Fit the preprocessing steps and the tree on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Print train/test scores, accuracy and the confusion matrix for the pipeline.
get_score(pipeline, X_train, X_test, y_train, y_test)