# Entry 10 - Reorder Pre-processing and Make Prediction

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce


from sklearn.compose import ColumnTransformer
# from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# A trailing comma after the last imported name is a SyntaxError unless the names are
# parenthesised, so the not-yet-used encoder is kept purely as a comment.
from sklearn.preprocessing import StandardScaler  # OneHotEncoder (currently unused)
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, GridSearchCV

### Custom functions

In [2]:
def feature_corr(df, target, corr_type='spearman', threshold=0.5):
    """Keep the features most strongly correlated with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate features. Only numeric and boolean columns are correlated
        with the target; other columns (e.g. strings) are ignored instead of
        being handed to ``corrwith``, which raises on them in current pandas.
    target : pandas.Series
        Target values; aligned with ``df`` by ``corrwith``.
    corr_type : str or callable, default 'spearman'
        Correlation method forwarded to ``corrwith`` and ``corr``.
    threshold : float, default 0.5
        Minimum absolute correlation with the target for a feature to be kept.

    Returns
    -------
    df_top : pandas.DataFrame
        The columns of ``df`` that met the threshold, ordered from the
        strongest to the weakest absolute correlation.
    corr_matrix : pandas.DataFrame
        Pairwise correlation matrix of ``df_top`` using ``corr_type``.
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    strength = numeric.corrwith(target, axis=0, method=corr_type).abs()

    # NaN correlations compare False against the threshold and are dropped.
    # A stable sort keeps tied features in their original column order, so the
    # result (and anything that depends on column order) is reproducible.
    top_features = (
        strength[strength >= threshold]
        .sort_values(ascending=False, kind='mergesort')
        .index
        .tolist()
    )

    df_top = df[top_features]
    corr_matrix = df_top.corr(method=corr_type)
    return df_top, corr_matrix

In [3]:
def feature_coll(correlation_matrix, df_top, threshold=0.9):
    """Drop features that are collinear with a feature appearing before them.

    Parameters
    ----------
    correlation_matrix : pandas.DataFrame
        Square feature-by-feature correlation matrix.
    df_top : pandas.DataFrame
        Data whose columns are named like the matrix columns.
    threshold : float, default 0.9
        A feature is flagged when its absolute correlation with any earlier
        column of the matrix is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``df_top`` without the flagged columns.
    """
    collinear_features = []
    for i, colname in enumerate(correlation_matrix.columns):
        # Only the part of the row left of the diagonal: each pair is
        # inspected once and the later column of the pair is the one dropped.
        earlier = correlation_matrix.iloc[i, :i]
        # Test for the existence of a strong correlation rather than summing
        # the strong values: a sum is zero (falsy) when e.g. +0.95 and -0.95
        # cancel, which would hide a collinear feature.
        if (earlier.abs() > threshold).any():
            collinear_features.append(colname)

    # A list (not a set) keeps the report in matrix column order.
    print('Collinear features:', collinear_features)
    return df_top.drop(collinear_features, axis=1)

In [None]:
def preprocess_data(df, scaler=None, encoder=ce.OrdinalEncoder):
    """Encode the categorical columns, then scale every column.

    NOTE: a later cell defines another function with this same name; whichever
    cell ran last is the one that is actually called.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; object-dtype columns are treated as categorical.
    scaler : object with ``fit_transform``, optional
        Fitted on the encoded frame. When omitted, a new ``StandardScaler`` is
        created for this call, so separate calls no longer share (and refit)
        one instance created when the function was defined.
    encoder : callable, default ``ce.OrdinalEncoder``
        Encoder class; called as ``encoder(cols=<categorical column names>)``.

    Returns
    -------
    scaler, cat_encoder, df
        The fitted scaler, the fitted encoder, and the encoded-then-scaled
        data as a DataFrame with the encoded frame's column names and the
        original row labels.
    """
    if scaler is None:
        scaler = StandardScaler()

    cat_features = df.select_dtypes('object').columns.tolist()

    cat_encoder = encoder(cols=cat_features)
    df = cat_encoder.fit_transform(df)
    columns = df.columns.tolist()
    index = df.index.tolist()
    # The scaler returns a bare array, so the labels are re-attached afterwards.
    df = scaler.fit_transform(df)
    df = pd.DataFrame(df, columns=columns, index=index)
    return scaler, cat_encoder, df

# stdz_scaler, cat_encoder, stdz_df = preprocess_data(raw_df)
# planets = stdz_df.loc[:, 'type':'atmospheric_mass_kg']
# planets

In [15]:
# Standardise only the numeric columns of `planets` and show the result as a
# DataFrame that keeps the original row labels and column names.
num_features = planets.select_dtypes('number').columns.tolist()
index = planets.index.tolist()
scaler = StandardScaler()
pd.DataFrame(
    scaler.fit_transform(planets[num_features]),
    columns=num_features,
    index=index,
)

Unnamed: 0,mass_1024kg,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,aphelion_106 km,...,orbital_eccentricity,obliquity_to_orbit_degrees,mean_temp_c,surface_pressure_bars,nbr_moons,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
Mercury,-0.442264,-0.681977,1.429504,-0.606357,-0.686002,0.959204,2.56817,-0.809602,-0.8551,-0.762454,...,1.74137,-0.823535,1.083267,-0.638729,-0.636013,-0.681959,-0.68545,1.224812,-1.374401,-0.436378
Venus,-0.43397,-0.53059,1.323681,0.241653,-0.3311,-3.070742,1.514217,-0.783215,-0.817084,-0.744951,...,-0.890624,2.320273,2.605431,-0.429988,-0.636013,-0.530622,-0.529849,0.075589,1.732669,-0.436377
Earth,-0.431961,-0.516928,1.47954,0.388424,-0.284556,0.189016,-0.546799,-0.761496,-0.792606,-0.725612,...,-0.757695,-0.409373,0.304247,-0.636428,-0.602698,-0.516946,-0.516099,0.256733,0.116307,-0.436375
Moon,-0.442733,-0.711396,0.229214,-0.948822,-0.796545,0.540685,-0.038815,-0.761496,-0.792606,-0.725612,...,-0.252565,-0.70538,0.124867,-0.638729,-0.636013,-0.711421,-0.715688,1.37923,-1.29444,-0.436378
Mars,-0.441694,-0.641894,0.570264,-0.606357,-0.645276,0.189406,-0.546279,-0.720419,-0.755826,-0.682144,...,0.265858,-0.377468,-0.105764,-0.638706,-0.569383,-0.64191,-0.644525,0.951612,-1.123094,-0.436378
Jupiter,3.024239,2.211778,-0.929091,2.557372,2.525566,0.181224,-0.557259,-0.43152,-0.425797,-0.42814,...,-0.332322,-0.76919,-0.336395,-0.634191,1.99587,2.211788,2.220835,-1.388408,0.990171,3.036693
Saturn,0.594708,1.741418,-1.296597,0.257961,1.129232,0.181669,-0.556666,-0.087957,-0.047429,-0.115717,...,-0.225979,-0.35088,-0.490149,1.6302,2.095815,1.741423,1.717771,-1.233991,0.704595,0.550706
Titan,-0.442636,-0.676311,-0.609321,-0.981438,-0.784909,0.38834,-0.281196,-0.087957,-0.047429,-0.115717,...,-0.59818,-0.818465,-0.690029,-0.635099,-0.636013,-0.676337,-0.679622,-1.002364,-0.780402,-0.436377
Uranus,-0.284308,0.286882,-0.960723,0.209037,0.303068,0.166139,-0.551843,0.666948,0.810993,0.550897,...,-0.372201,0.909365,-0.772031,1.6302,0.263491,0.286869,0.301922,-0.732134,0.933056,-0.279175
Neptune,-0.256541,0.253567,-0.749652,0.584119,0.431065,0.184675,-0.55266,1.51817,1.863822,1.241237,...,-0.837453,-0.32252,-0.797657,1.6302,-0.169604,0.253553,0.270047,-0.637107,0.361903,-0.253584


In [None]:
def preprocess_data(df, scaler=None, encoder=ce.OrdinalEncoder):
    """Scale the numeric columns and encode the categorical columns separately.

    Unlike encoding first and scaling everything, the integer category codes
    produced by the encoder are left unscaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Columns selected by ``select_dtypes('number')`` are scaled;
        object-dtype columns are encoded. Columns of any other dtype are not
        part of the result.
    scaler : object with ``fit_transform``, optional
        When omitted, a new ``StandardScaler`` is created for this call rather
        than sharing one instance created when the function was defined.
    encoder : callable, default ``ce.OrdinalEncoder``
        Encoder class; called as ``encoder(cols=<categorical column names>)``.

    Returns
    -------
    scaler, cat_encoder, df
        The fitted scaler, the fitted encoder, and the processed data with the
        columns in the same relative order as in the input.
    """
    if scaler is None:
        scaler = StandardScaler()
    index = df.index.tolist()

    num_features = df.select_dtypes('number').columns.tolist()
    num_scale = scaler.fit_transform(df[num_features])
    num_df = pd.DataFrame(num_scale, columns=num_features, index=index)

    cat_features = df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    cat_encode = cat_encoder.fit_transform(df[cat_features])
    cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)

    # The original body stopped here and implicitly returned None. Recombine
    # the two halves, restore the input's column order, and return the fitted
    # transformers so the same transformation can be applied to new rows.
    processed = pd.concat([num_df, cat_df], axis=1)
    ordered = [col for col in df.columns if col in processed.columns]
    return scaler, cat_encoder, processed[ordered]
    
    

In [16]:
# Work on a copy so that X_train itself is left untouched.
df = X_train.copy()

# Scale the numeric columns only. `scaler` is not created in this cell; it is
# the StandardScaler instance left in the namespace by an earlier cell.
num_features = df.select_dtypes('number').columns.tolist()
index = df.index.tolist()
num_scale = scaler.fit_transform(df[num_features])
num_df = pd.DataFrame(data=num_scale, index=index, columns=num_features)

In [17]:
# Show the names of the numeric columns selected from the X_train copy.
num_features

['diameter_km',
 'density_kg_m3',
 'gravity_m_s2',
 'escape_vel_km_s',
 'rotation_period_hr',
 'day_len_hr',
 'distance_from_sun_106_km',
 'perihelion_106\xa0km',
 'aphelion_106\xa0km',
 'orbital_period_days',
 'orbital_velocity_km_s',
 'orbital_inclination_degrees',
 'orbital_eccentricity',
 'obliquity_to_orbit_degrees',
 'mean_temp_c',
 'surface_pressure_bars',
 'nbr_moons',
 'equatorial_radius_km',
 'mean_radius_km',
 'V(1,0) (mag)',
 'geometric_albedo',
 'atmospheric_mass_kg']

In [18]:
# Show the raw array returned by scaler.fit_transform for the numeric columns.
num_scale

array([[-0.68197725,  1.4295041 , -0.6063566 , -0.68600203,  0.95920392,
         2.5681697 , -0.80960212, -0.85510048, -0.76245439, -0.61412582,
         2.12770588,  0.69069669,  1.7413697 , -0.82353459,  1.08326682,
        -0.63872894, -0.63601331, -0.68195941, -0.68545025,  1.22481202,
        -1.37440101, -0.43637751],
       [-0.53058968,  1.32368075,  0.24165312, -0.33110044, -3.07074188,
         1.51421747, -0.78321454, -0.8170844 , -0.74495079, -0.60942549,
         1.27109702, -0.07545426, -0.8906242 ,  2.3202734 ,  2.60543141,
        -0.42998752, -0.63601331, -0.53062153, -0.52984903,  0.07558902,
         1.7326695 , -0.43637663],
       [-0.51692813,  1.47954014,  0.38842403, -0.28455597,  0.18901622,
        -0.54679851, -0.76149594, -0.79260576, -0.72561178, -0.6045945 ,
         0.91187395, -0.79904127, -0.75769521, -0.40937281,  0.30424655,
        -0.63642825, -0.60269833, -0.51694571, -0.51609939,  0.25673269,
         0.11630745, -0.43637495],
       [-0.71139568

In [19]:
# Show the scaled values re-wrapped as a DataFrame with the numeric column names and row labels.
num_df

Unnamed: 0,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,aphelion_106 km,orbital_period_days,...,orbital_eccentricity,obliquity_to_orbit_degrees,mean_temp_c,surface_pressure_bars,nbr_moons,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
Mercury,-0.681977,1.429504,-0.606357,-0.686002,0.959204,2.56817,-0.809602,-0.8551,-0.762454,-0.614126,...,1.74137,-0.823535,1.083267,-0.638729,-0.636013,-0.681959,-0.68545,1.224812,-1.374401,-0.436378
Venus,-0.53059,1.323681,0.241653,-0.3311,-3.070742,1.514217,-0.783215,-0.817084,-0.744951,-0.609425,...,-0.890624,2.320273,2.605431,-0.429988,-0.636013,-0.530622,-0.529849,0.075589,1.732669,-0.436377
Earth,-0.516928,1.47954,0.388424,-0.284556,0.189016,-0.546799,-0.761496,-0.792606,-0.725612,-0.604594,...,-0.757695,-0.409373,0.304247,-0.636428,-0.602698,-0.516946,-0.516099,0.256733,0.116307,-0.436375
Moon,-0.711396,0.229214,-0.948822,-0.796545,0.540685,-0.038815,-0.761496,-0.792606,-0.725612,-0.616213,...,-0.252565,-0.70538,0.124867,-0.638729,-0.636013,-0.711421,-0.715688,1.37923,-1.29444,-0.436378
Mars,-0.641894,0.570264,-0.606357,-0.645276,0.189406,-0.546279,-0.720419,-0.755826,-0.682144,-0.59353,...,0.265858,-0.377468,-0.105764,-0.638706,-0.569383,-0.64191,-0.644525,0.951612,-1.123094,-0.436378
Jupiter,2.211778,-0.929091,2.557372,2.525566,0.181224,-0.557259,-0.43152,-0.425797,-0.42814,-0.468233,...,-0.332322,-0.76919,-0.336395,-0.634191,1.99587,2.211788,2.220835,-1.388408,0.990171,3.036693
Saturn,1.741418,-1.296597,0.257961,1.129232,0.181669,-0.556666,-0.087957,-0.047429,-0.115717,-0.247624,...,-0.225979,-0.35088,-0.490149,1.6302,2.095815,1.741423,1.717771,-1.233991,0.704595,0.550706
Titan,-0.676311,-0.609321,-0.981438,-0.784909,0.38834,-0.281196,-0.087957,-0.047429,-0.115717,-0.616601,...,-0.59818,-0.818465,-0.690029,-0.635099,-0.636013,-0.676337,-0.679622,-1.002364,-0.780402,-0.436377
Uranus,0.286882,-0.960723,0.209037,0.303068,0.166139,-0.551843,0.666948,0.810993,0.550897,0.434629,...,-0.372201,0.909365,-0.772031,1.6302,0.263491,0.286869,0.301922,-0.732134,0.933056,-0.279175
Neptune,0.253567,-0.749652,0.584119,0.431065,0.184675,-0.55266,1.51817,1.863822,1.241237,1.439029,...,-0.837453,-0.32252,-0.797657,1.6302,-0.169604,0.253553,0.270047,-0.637107,0.361903,-0.253584


In [20]:
# Encoder class (not an instance); it is instantiated below with the categorical columns.
encoder=ce.OrdinalEncoder

# Object-dtype columns of `df` are treated as the categorical features.
cat_features = df.select_dtypes('object').columns.tolist()
cat_encoder = encoder(cols=cat_features)
# Fit the encoder on the categorical columns only and encode them in one step.
cat_encode = cat_encoder.fit_transform(df[cat_features])
# Re-wrap the encoded values using `index`, the list of row labels built in an earlier cell.
cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)

In [21]:
# Show the names of the object-dtype (categorical) columns.
cat_features

['type', 'rings', 'magnetic_field']

In [22]:
# Show the fitted encoder; its repr includes the category-to-integer mapping of each column.
cat_encoder

OrdinalEncoder(cols=['type', 'rings', 'magnetic_field'], drop_invariant=False,
               handle_missing='value', handle_unknown='value',
               mapping=[{'col': 'type', 'data_type': dtype('O'),
                         'mapping': planet    1
moon      2
dwarf     3
NaN      -2
dtype: int64},
                        {'col': 'rings', 'data_type': dtype('O'),
                         'mapping': No     1
Yes    2
NaN   -2
dtype: int64},
                        {'col': 'magnetic_field', 'data_type': dtype('O'),
                         'mapping': Yes    1
No     2
NaN   -2
dtype: int64}],
               return_df=True, verbose=0)

In [23]:
# Show the encoder's direct output for the categorical columns.
cat_encode

Unnamed: 0_level_0,type,rings,magnetic_field
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mercury,1,1,1
Venus,1,1,2
Earth,1,1,1
Moon,2,1,2
Mars,1,1,2
Jupiter,1,2,1
Saturn,1,2,1
Titan,2,1,2
Uranus,1,2,1
Neptune,1,2,1


In [24]:
# Show the encoded categorical columns after re-wrapping them with the list of row labels.
cat_df

Unnamed: 0,type,rings,magnetic_field
Mercury,1,1,1
Venus,1,1,2
Earth,1,1,1
Moon,2,1,2
Mars,1,1,2
Jupiter,1,2,1
Saturn,1,2,1
Titan,2,1,2
Uranus,1,2,1
Neptune,1,2,1


In [2]:
def convert_categoricals(df):
    """Return a copy of ``df`` with each object-dtype column replaced by integer codes.

    Every object column is cast to the pandas ``category`` dtype on its own and
    then replaced by that categorical's ``.cat.codes``. Columns of other dtypes
    are copied unchanged, and the input frame is not modified.
    """
    converted = df.copy()
    for col in df.select_dtypes('object').columns.tolist():
        converted[col] = df[col].astype('category').cat.codes
    return converted

In [4]:
def feature_stdz(df, scaler=None):
    """Scale every column of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed unchanged to ``scaler.fit_transform``.
    scaler : object with ``fit_transform``, optional
        When omitted, a new ``StandardScaler`` is created for this call. The
        previous default was a single instance built when the function was
        defined, so every call refitted (and overwrote) the same shared object.

    Returns
    -------
    pandas.DataFrame
        The scaled values with the column names and row labels of ``df``.
    """
    if scaler is None:
        scaler = StandardScaler()
    columns = df.columns.tolist()
    index = df.index.tolist()
    # fit_transform returns a bare array, so the labels are re-attached here.
    stdz_values = scaler.fit_transform(df)
    return pd.DataFrame(stdz_values, columns=columns, index=index)

### Get and 'split' data

The 'split' portion is currently a cheating version where I just create two new rows because I decided not to split it into train and test sets for this tiny dataset. But I want a placeholder so I don't forget to incorporate it in the next iteration when I start using bigger datasets.

In [4]:
# Load the planets/moons spreadsheet; the first column becomes the row index.
planets = pd.read_excel('data/planets_moons.xlsx', index_col=0)
planets

Unnamed: 0_level_0,type,mass_1024kg,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,...,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mercury,planet,0.33,4879.0,5427,3.7,4.3,1407.6,4222.6,57.9,46.0,...,167,1e-14,0,No,Yes,2440.53,2439.4,-0.6,0.106,1000.0
Venus,planet,4.87,12104.0,5243,8.9,10.4,-5832.5,2802.0,108.2,107.5,...,464,92.0,0,No,No,6051.8,6051.8,-4.47,0.65,4.8e+20
Earth,planet,5.97,12756.0,5514,9.8,11.2,23.9,24.0,149.6,147.1,...,15,1.014,1,No,Yes,6378.1366,6371.0084,-3.86,0.367,1.4e+21
Moon,moon,0.073,3475.0,3340,1.6,2.4,655.7,708.7,149.6,147.1,...,-20,3e-15,0,No,No,1737.5,1737.4,-0.08,0.12,100000.0
Mars,planet,0.642,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,...,-65,0.01,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16
Jupiter,planet,1898.0,142984.0,1326,23.1,59.5,9.9,9.9,778.6,740.5,...,-110,2.0,79,Yes,Yes,71492.0,69911.0,-9.4,0.52,1.9e+27
Saturn,planet,568.0,120536.0,687,9.0,35.5,10.7,10.7,1433.5,1352.6,...,-140,1000.0,82,Yes,Yes,60268.0,58232.0,-8.88,0.47,5.4e+26
Titan,moon,0.126,5149.4,1882,1.4,2.6,382.0,382.0,1433.5,1352.6,...,-179,1.6,0,No,No,2574.7,2574.7,-8.1,0.21,9.1e+18
Uranus,planet,86.8,51118.0,1271,8.7,21.3,-17.2,17.2,2872.5,2741.3,...,-195,1000.0,27,Yes,Yes,25559.0,25362.0,-7.19,0.51,8.6e+25
Neptune,planet,102.0,49528.0,1638,11.0,23.5,16.1,16.1,4495.1,4444.5,...,-200,1000.0,14,Yes,Yes,24764.0,24622.0,-6.87,0.41,1e+26


In [5]:
# Build the stand-in 'test' set used for the prediction: two copies of the Mars
# row, without the target column, that differ only in surface pressure.

predict_on = pd.concat([planets.loc[['Mars'], :].drop('mass_1024kg', axis=1)] * 2)
predict_on.index = ['Mars_low', 'Mars_high']
# Low-end surface pressure, based on conditions on Mt Everest
predict_on.loc['Mars_low', 'surface_pressure_bars'] = 0.47
# High-end surface pressure, based on nitrogen narcosis experienced by divers
predict_on.loc['Mars_high', 'surface_pressure_bars'] = 4
predict_on

Unnamed: 0,type,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,aphelion_106 km,...,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
Mars_low,planet,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,249.2,...,-65,0.47,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16
Mars_high,planet,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,249.2,...,-65,4.0,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16


### Separate the target value

In [6]:
# Features: every column except the mass. Target: the mass column itself.
X_train = planets.drop('mass_1024kg', axis=1)
y_train = planets['mass_1024kg']

### Determine collinearity and narrow features

In [7]:
# Keep the features strongly correlated with the target, then drop those that
# are collinear with one another.
# The correlation matrix is bound to `top_corr` rather than `feature_corr`:
# reusing the function's name replaces the function with a DataFrame, so the
# cell would fail with a TypeError the next time it is run.
df_top, top_corr = feature_corr(X_train, y_train)
non_coll = feature_coll(top_corr, df_top)
non_coll_features = non_coll.columns.tolist()

Collinear features: ['gravity_m_s2', 'diameter_km', 'atmospheric_mass_kg', 'mean_radius_km', 'equatorial_radius_km', 'nbr_moons']


NameError: name 'non_coll' is not defined

In [None]:
# Replace the object columns of the full table with integer category codes.
planets_num = convert_categoricals(planets)

# NOTE(review): `select_df` is not defined in any cell visible in this notebook,
# and `planets_num` is not used afterwards. Presumably `select_df` was meant to
# be derived from `planets_num` (e.g. restricted to the selected features) - confirm.
planets_stdz = feature_stdz(select_df)
planets_stdz

### Missing values

Look for missing values. As seen below, there are none in this dataset (hand curated, I got rid of them all). Will need to address in the future.

In [6]:
# Print the per-column non-null counts and dtypes to check for missing values.
planets_stdz.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, Mercury to Pluto
Data columns (total 13 columns):
atmospheric_mass_kg            11 non-null float64
diameter_km                    11 non-null float64
mass_1024kg                    11 non-null float64
V(1,0) (mag)                   11 non-null float64
day_len_hr                     11 non-null float64
surface_pressure_bars          11 non-null float64
rings                          11 non-null float64
geometric_albedo               11 non-null float64
density_kg_m3                  11 non-null float64
magnetic_field                 11 non-null float64
orbital_inclination_degrees    11 non-null float64
type                           11 non-null float64
rotation_period_hr             11 non-null float64
dtypes: float64(13)
memory usage: 1.2+ KB


## Train model

In [7]:
# Fit a linear regression on the standardised table.
# NOTE(review): the target here is 'atmospheric_mass_kg', while the earlier
# "Separate the target value" cell uses 'mass_1024kg' - confirm which is intended.
model = LinearRegression()
y = planets_stdz['atmospheric_mass_kg']
X = planets_stdz.drop(columns='atmospheric_mass_kg')

model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Show the fitted coefficients (one per column of X).
model.coef_

array([ 0.14238478,  0.69706561,  0.0096957 ,  0.00509335, -0.32938854,
        0.31222586, -0.03180231, -0.0297368 ,  0.0237083 , -0.00429627,
       -0.00544434, -0.04911224])

In [9]:
# Score on the same X, y the model was fitted on, so this is a training score, not a held-out one.
model.score(X, y)

1.0

## Make Predictions

### Create test set

In [13]:
# NOTE(review): `select_df` is not defined in any cell visible in this notebook,
# and this cell rebinds the `predict_on` built earlier from `planets` - confirm
# which version is intended.
predict_on = select_df.loc[['Mars'], :]
predict_on = pd.concat([predict_on]*2)
# predict_on = predict_on.loc[np.repeat(predict_on.index, 2)]
predict_on.index = ['Mars_low', 'Mars_high']
predict_on.loc['Mars_low', 'surface_pressure_bars'] = 0.47
# Low-end surface pressure, based on conditions on Mt Everest
predict_on.loc['Mars_high', 'surface_pressure_bars'] = 4
# High-end surface pressure, based on nitrogen narcosis experienced by divers
predict_on

Unnamed: 0,atmospheric_mass_kg,diameter_km,mass_1024kg,"V(1,0) (mag)",day_len_hr,surface_pressure_bars,rings,geometric_albedo,density_kg_m3,magnetic_field,orbital_inclination_degrees,type,rotation_period_hr
Mars_low,2.5e+16,6792.0,0.642,-1.52,24.7,0.47,0,0.15,3933,0,1.9,2,24.6
Mars_high,2.5e+16,6792.0,0.642,-1.52,24.7,4.0,0,0.15,3933,0,1.9,2,24.6
