# Entry 8 notebook - Centering and Scaling Notebook

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so pick whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from sklearn.preprocessing import StandardScaler

### Custom functions

In [2]:
def convert_categoricals(df):
    """Return a copy of ``df`` with every object-dtype column integer-encoded.

    Each object column is cast to pandas' ``category`` dtype and replaced by
    its category codes; all other columns are carried over unchanged. The
    input frame itself is not modified.
    """
    object_cols = list(df.select_dtypes('object').columns)
    encoded = df.copy()
    for name in object_cols:
        # One step per column: cast to category, then keep only the integer codes.
        encoded[name] = df[name].astype('category').cat.codes
    return encoded

In [3]:
def feature_corr(df, target, corr_type='spearman', threshold=0.5):
    """Keep the columns most correlated with ``target`` and cross-correlate them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are each correlated against ``df[target]``.
    target : label
        Column of ``df`` used as the reference series.
    corr_type : str, default 'spearman'
        Correlation method, passed as ``method`` to both
        ``DataFrame.corrwith`` and ``DataFrame.corr``.
    threshold : float, default 0.5
        Minimum absolute correlation with ``target`` a column needs in order
        to be kept.

    Returns
    -------
    df_top : pandas.DataFrame
        The columns of ``df`` whose absolute correlation with ``target`` is at
        least ``threshold``, ordered from strongest to weakest.
    corr_matrix : pandas.DataFrame
        Pairwise correlation matrix of ``df_top`` using ``corr_type``.
    """
    target_corr = df.corrwith(df[target], method=corr_type)
    strength = target_corr.abs()
    # Work on the Series directly (no reset_index round-trip) so the lookup does
    # not depend on what the columns index is called. NaN never satisfies ">=",
    # so columns with an undefined correlation are dropped here.
    top_features = strength[strength >= threshold].sort_values(ascending=False).index.tolist()
    df_top = df[top_features]
    corr_matrix = df_top.corr(method=corr_type)
    return df_top, corr_matrix

### Get data and preprocess

In [4]:
planets = pd.read_excel('../data/planets_moons.xlsx', index_col=0)
planets_num = convert_categoricals(planets)
# Bind the correlation matrix to its own name. Assigning it to `feature_corr`
# would overwrite the function, and re-running this cell would then fail with
# "'DataFrame' object is not callable".
df_top, top_feature_corr = feature_corr(planets_num, 'atmospheric_mass_kg')

#### Define features

I'll move the correlated/collinear feature selection into a separate notebook that consolidates all of the data prep. For now, the feature list is hand-coded.

In [5]:
# Hand-picked feature columns, listed one per line so additions and removals
# show up cleanly in a diff.
select_features = [
    'atmospheric_mass_kg',
    'diameter_km',
    'mass_1024kg',
    'V(1,0) (mag)',
    'day_len_hr',
    'surface_pressure_bars',
    'rings',
    'geometric_albedo',
    'density_kg_m3',
    'magnetic_field',
    'orbital_inclination_degrees',
    'type',
    'rotation_period_hr',
]
select_df = df_top[select_features]

#### Standardize features

By default, `sklearn.preprocessing.StandardScaler` centers each feature by subtracting its mean and then divides the centered values by the feature's standard deviation (i.e., standardization as defined in Entry 8).

In [6]:
columns = select_df.columns.tolist()
scaler = StandardScaler()
# Fit and transform in one step, then wrap the scaled values back in a DataFrame.
# Only the column labels are passed along, so the rows get a fresh 0..n-1 index.
planets_stdz = pd.DataFrame(scaler.fit_transform(select_df), columns=columns)
planets_stdz

Unnamed: 0,atmospheric_mass_kg,diameter_km,mass_1024kg,"V(1,0) (mag)",day_len_hr,surface_pressure_bars,rings,geometric_albedo,density_kg_m3,magnetic_field,orbital_inclination_degrees,type,rotation_period_hr
0,-0.436378,-0.681977,-0.442264,1.224812,2.56817,-0.638729,-0.755929,-1.374401,1.429504,0.912871,0.690697,0.565685,0.959204
1,-0.436377,-0.53059,-0.43397,0.075589,1.514217,-0.429988,-0.755929,1.732669,1.323681,-1.095445,-0.075454,0.565685,-3.070742
2,-0.436375,-0.516928,-0.431961,0.256733,-0.546799,-0.636428,-0.755929,0.116307,1.47954,0.912871,-0.799041,0.565685,0.189016
3,-0.436378,-0.711396,-0.442733,1.37923,-0.038815,-0.638729,-0.755929,-1.29444,0.229214,-1.095445,0.286339,-0.989949,0.540685
4,-0.436378,-0.641894,-0.441694,0.951612,-0.546279,-0.638706,-0.755929,-1.123094,0.570264,-1.095445,-0.394684,0.565685,0.189406
5,3.036693,2.211778,3.024239,-1.388408,-0.557259,-0.634191,1.322876,0.990171,-0.929091,0.912871,-0.522376,0.565685,0.181224
6,0.550706,1.741418,0.594708,-1.233991,-0.556666,1.6302,1.322876,0.704595,-1.296597,0.912871,-0.266992,0.565685,0.181669
7,-0.436377,-0.676311,-0.442636,-1.002364,-0.281196,-0.635099,-0.755929,-0.780402,-0.609321,-1.095445,-0.735195,-0.989949,0.38834
8,-0.279175,0.286882,-0.284308,-0.732134,-0.551843,1.6302,1.322876,0.933056,-0.960723,0.912871,-0.628786,0.565685,0.166139
9,-0.253584,0.253567,-0.256541,-0.637107,-0.55266,1.6302,1.322876,0.361903,-0.749652,0.912871,-0.415966,0.565685,0.184675
