In [1]:
import numpy as np
import pandas as pd 
from scipy import stats
import plotly.graph_objects as go
from sklearn.datasets import load_wine
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from plotly.subplots import make_subplots

## Quantization

In [2]:
# Fixed seed so the sampled ages are identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# 15 random integers from the discrete uniform distribution over [0, 100)
ages = rng.integers(0, 100, 15)
# evenly spaced bins: integer division by 10 maps each age to its decade (0-9)
ages_binned = np.floor_divide(ages, 10)

print(f"Ages: {ages} \nAges Binned: {ages_binned} \n")


# counts spanning several orders of magnitude
views = [300, 5936, 2, 350, 10000, 743, 2854, 9113, 25, 20000, 160, 683, 7245, 224]

# exponential-width bins: floor(log10(count)) is the count's order of magnitude,
# so 1-9 -> 0, 10-99 -> 1, 100-999 -> 2, and so on
views_exponential_bins = np.floor(np.log10(views))

print(f"Views: {views} \nViews Binned: {views_exponential_bins}")

Ages: [63 74 29 95 64 20 98 35 11 30 58  2 96 55 40] 
Ages Binned: [6 7 2 9 6 2 9 3 1 3 5 0 9 5 4] 

Views: [300, 5936, 2, 350, 10000, 743, 2854, 9113, 25, 20000, 160, 683, 7245, 224] 
Views Binned: [2. 3. 0. 2. 4. 2. 3. 3. 1. 4. 2. 2. 3. 2.]


In [3]:
# adaptive (quantile) binning: split the counts into 5 quantile-based bins and
# keep only the integer bin index of each value instead of interval labels
views_adaptive_bin = pd.qcut(x=views, q=5, labels=False)
print(f"Adaptive bins: {views_adaptive_bin}")

Adaptive bins: [1 3 0 1 4 2 3 4 0 4 0 2 3 1]


## Feature Scaling

In [4]:
# load the wine dataset
wine_json = load_wine()
# build a DataFrame from the feature matrix, one column per feature name
df = pd.DataFrame(wine_json["data"], columns=wine_json["feature_names"])
# append the target labels as an extra column
df["Target"] = wine_json["target"]
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [5]:
# the two features every scaler below is fitted on and applied to
scale_cols = ["alcohol", "malic_acid"]

# standardization
std_scaler = StandardScaler()
df_std = std_scaler.fit_transform(df[scale_cols])

# minmax scaling
minmax_scaler = MinMaxScaler()
df_minmax = minmax_scaler.fit_transform(df[scale_cols])

# l2 normalization
l2norm = Normalizer()
df_l2norm = l2norm.fit_transform(df[scale_cols])

In [6]:
def _marker_trace(x_values, y_values, label):
    """Return a markers-only scatter trace named `label`."""
    return go.Scatter(x=x_values, y=y_values, mode="markers", name=label)


# one trace per scaling method, plus the unscaled data for comparison
trace1 = _marker_trace(df_std[:, 0], df_std[:, 1], "Standardized Scale")
trace2 = _marker_trace(df_minmax[:, 0], df_minmax[:, 1], "MinMax Scale")
trace3 = _marker_trace(df_l2norm[:, 0], df_l2norm[:, 1], "L2 Norm Scale")
trace4 = _marker_trace(df["alcohol"], df["malic_acid"], "Original Scale")

layout = go.Layout(
    title="Effects of Feature scaling",
    xaxis={"title": "Alcohol"},
    yaxis={"title": "Malic Acid"},
)

# overlay all four traces on a single set of axes
data = [trace1, trace2, trace3, trace4]
fig = go.Figure(data=data, layout=layout)

fig.show()

## Feature Interactions

In [10]:
# dummy data: the integers 0..9 arranged as 5 rows of 2 columns
X = np.arange(0, 10).reshape((5, 2))
X.shape

(5, 2)

In [11]:
# interaction-only expansion: no pure powers of a single feature are generated
interactions = PolynomialFeatures(interaction_only=True)
X_interactions = interactions.fit(X).transform(X)
X_interactions.shape

(5, 4)

In [12]:
# full polynomial expansion up to the 2nd degree
polynomial = PolynomialFeatures(degree=2)
X_poly = polynomial.fit(X).transform(X)
X_poly.shape

(5, 6)

# Feature engineering for House prices data

In [5]:
# read the raw training CSV
# NOTE: this rebinds `df`, which earlier cells used for the wine dataset
train_csv_path = "../data/raw/train.csv"
df = pd.read_csv(train_csv_path)

In [6]:
# take one copy of the column and derive every transform from it
x = df["GrLivArea"].copy()  # original data
x_log = np.log(x)  # log transformation
x_square_root = np.sqrt(x)  # square root transformation
x_boxcox, _ = stats.boxcox(x)  # boxcox transformation (fitted lambda is discarded)

panel_titles = (
    "Original Skewed Data",
    "Log Transformation",
    "Square root transformation",
    "Boxcox Transformation",
)

# 2x2 grid, one histogram per version of the data
fig = make_subplots(
    rows=2,
    cols=2,
    horizontal_spacing=0.125,
    vertical_spacing=0.125,
    subplot_titles=panel_titles,
)

# traces are listed in the same order as the titles and fill the grid row by row
histograms = [
    go.Histogram(x=values, hoverinfo="x", showlegend=False)
    for values in (x, x_log, x_square_root, x_boxcox)
]
fig.add_traces(histograms, rows=[1, 1, 2, 2], cols=[1, 2, 1, 2])

fig.update_layout(
    title={
        "text": "GrLivArea Distribution with various Power Transforms",
        "font": {"family": "Arial", "size": 20},
    },
    showlegend=False,
    width=800,
    height=500,
)

fig.show()