# Entry 10 notebook - Reorder Pre-processing and Make Prediction - Mass

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later
# dropped the old names, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce

### Custom functions

In [2]:
def feature_corr_coll(df, target, corr_type='spearman',
                      target_threshold=0.5, collinear_threshold=0.9):
    """Pick features that correlate with the target but not with each other.

    Steps:
      1. Correlate every numeric column of ``df`` with ``target``.
      2. Keep the columns whose absolute correlation is at least
         ``target_threshold``, ordered from strongest to weakest.
      3. Walk that ordered list and drop any column whose absolute correlation
         with an earlier column in the list exceeds ``collinear_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate features. Non-numeric columns are ignored.
    target : pandas.Series
        Target values, aligned with ``df`` on the index.
    corr_type : str, default 'spearman'
        Correlation method forwarded to ``DataFrame.corrwith`` / ``DataFrame.corr``.
    target_threshold : float, default 0.5
        Minimum absolute correlation with the target for a feature to be kept.
    collinear_threshold : float, default 0.9
        Absolute feature-to-feature correlation above which the later
        (weaker) feature is dropped.

    Returns
    -------
    select_features : list
        Names of the retained columns, strongest target correlation first.
    df : pandas.DataFrame
        ``df`` restricted to ``select_features``.
    """
    # Correlations are only defined for numeric data; selecting these columns
    # explicitly avoids depending on corrwith silently skipping text columns.
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    corr_with_target = numeric_df.corrwith(target, axis=0, method=corr_type)
    # Fixed column names, so the lookup below works even if df.columns is named.
    target_corr = pd.DataFrame({
        'index': corr_with_target.index,
        'abs': corr_with_target.abs().values,
    })
    strong = target_corr[target_corr['abs'] >= target_threshold]
    top_features = strong.sort_values('abs', ascending=False)['index'].tolist()

    df_top = numeric_df[top_features]
    feature_corr = df_top.corr(method=corr_type)

    collinear_features = []
    for i, colname in enumerate(feature_corr.columns):
        # Only compare against features ranked ahead of this one.
        earlier = feature_corr.iloc[i, :i]
        # Test the magnitudes with .any(): summing the signed values would let
        # e.g. +1 and -1 cancel to 0 and wrongly keep a collinear feature.
        if (earlier.abs() > collinear_threshold).any():
            collinear_features.append(colname)

    df = df_top.drop(collinear_features, axis=1)
    select_features = df.columns.tolist()

    return select_features, df

In [3]:
def train_preprocess(df, scaler=None, encoder=ce.OrdinalEncoder):
    """Fit the scaler and categorical encoder on ``df`` and transform it.

    Numeric columns go through ``scaler.fit_transform``; object-dtype columns
    go through a newly constructed ``encoder(cols=...)``. The two results are
    joined column-wise (numeric first) and remaining missing values are
    replaced with -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features.
    scaler : object with ``fit_transform``, optional
        Scaler to fit. Defaults to a new ``StandardScaler`` per call; a default
        instance in the signature would be created once and shared, so every
        call would re-fit the scaler handed back by earlier calls.
    encoder : callable, default ``ce.OrdinalEncoder``
        Encoder class, called as ``encoder(cols=<object columns>)``.

    Returns
    -------
    df : pandas.DataFrame
        Transformed features, indexed like the input.
    scaler
        The fitted scaler.
    cat_encoder
        The fitted categorical encoder.
    """
    if scaler is None:
        scaler = StandardScaler()

    index = df.index.tolist()

    num_features = df.select_dtypes('number').columns.tolist()
    num_scale = scaler.fit_transform(df[num_features])
    num_df = pd.DataFrame(num_scale, columns=num_features, index=index)

    cat_features = df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    cat_encode = cat_encoder.fit_transform(df[cat_features])
    cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)

    df = pd.concat([num_df, cat_df], axis=1).fillna(-1)

    return df, scaler, cat_encoder

In [4]:
def test_preprocess(df, select_features, scaler, cat_encoder):
    df = df[select_features]
    index = df.index.tolist()
    
    num_features = df.select_dtypes('number').columns.tolist()
    num_scale = scaler.transform(df[num_features])
    num_df = pd.DataFrame(num_scale, columns=num_features, index=index)
    
    cat_features = df.select_dtypes('object').columns.tolist()
    cat_encode = cat_encoder.transform(df[cat_features])
    cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)
    
    df = pd.concat([num_df, cat_df], axis=1).fillna(-1)    
    return df

### Get and 'split' data

The 'split' here is a stand-in rather than a real train/test split: because the dataset is so tiny, I decided not to split it and instead train on every row, then build the 'test' set by creating two new rows. I'm keeping the step as a placeholder so I don't forget to incorporate a proper split in the next iteration, when I start using bigger datasets.

In [5]:
# Load the planets/moons table; the first spreadsheet column becomes the row index.
planets = pd.read_excel('../data/planets_moons.xlsx', index_col=0)
planets

Unnamed: 0_level_0,type,mass_1024kg,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,...,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mercury,planet,0.33,4879.0,5427,3.7,4.3,1407.6,4222.6,57.9,46.0,...,167,1e-14,0,No,Yes,2440.53,2439.4,-0.6,0.106,1000.0
Venus,planet,4.87,12104.0,5243,8.9,10.4,-5832.5,2802.0,108.2,107.5,...,464,92.0,0,No,No,6051.8,6051.8,-4.47,0.65,4.8e+20
Earth,planet,5.97,12756.0,5514,9.8,11.2,23.9,24.0,149.6,147.1,...,15,1.014,1,No,Yes,6378.1366,6371.0084,-3.86,0.367,1.4e+21
Moon,moon,0.073,3475.0,3340,1.6,2.4,655.7,708.7,149.6,147.1,...,-20,3e-15,0,No,No,1737.5,1737.4,-0.08,0.12,100000.0
Mars,planet,0.642,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,...,-65,0.01,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16
Jupiter,planet,1898.0,142984.0,1326,23.1,59.5,9.9,9.9,778.6,740.5,...,-110,2.0,79,Yes,Yes,71492.0,69911.0,-9.4,0.52,1.9e+27
Saturn,planet,568.0,120536.0,687,9.0,35.5,10.7,10.7,1433.5,1352.6,...,-140,1000.0,82,Yes,Yes,60268.0,58232.0,-8.88,0.47,5.4e+26
Titan,moon,0.126,5149.4,1882,1.4,2.6,382.0,382.0,1433.5,1352.6,...,-179,1.6,0,No,No,2574.7,2574.7,-8.1,0.21,9.1e+18
Uranus,planet,86.8,51118.0,1271,8.7,21.3,-17.2,17.2,2872.5,2741.3,...,-195,1000.0,27,Yes,Yes,25559.0,25362.0,-7.19,0.51,8.6e+25
Neptune,planet,102.0,49528.0,1638,11.0,23.5,16.1,16.1,4495.1,4444.5,...,-200,1000.0,14,Yes,Yes,24764.0,24622.0,-6.87,0.41,1e+26


In [7]:
# Build the stand-in 'test' set used for the prediction:
# two copies of the Mars row, without the target column
X_test = pd.concat([planets.loc[['Mars']].drop(columns='mass_1024kg')] * 2)
X_test.index = ['Mars_low', 'Mars_high']

# Low-end surface pressure, based on conditions on Mt Everest
X_test.loc['Mars_low', 'surface_pressure_bars'] = 0.47
# High-end surface pressure, based on nitrogen narcosis experienced by divers
X_test.loc['Mars_high', 'surface_pressure_bars'] = 4
X_test

Unnamed: 0,type,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,aphelion_106 km,...,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
Mars_low,planet,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,249.2,...,-65,0.47,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16
Mars_high,planet,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,249.2,...,-65,4.0,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16


### Separate the target value

In [8]:
# Mass is the target; every other column is a candidate feature
y_train = planets['mass_1024kg']
X_train = planets.drop(columns='mass_1024kg')

### Determine collinearity and narrow features

In [9]:
# Keep the features that correlate with mass and are not collinear with one another
# (Spearman correlation by default); `select_features` is reused for the test rows.
select_features, non_coll = feature_corr_coll(X_train, y_train)
non_coll

Unnamed: 0_level_0,escape_vel_km_s,day_len_hr,surface_pressure_bars,"V(1,0) (mag)",geometric_albedo,density_kg_m3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mercury,4.3,4222.6,1e-14,-0.6,0.106,5427
Venus,10.4,2802.0,92.0,-4.47,0.65,5243
Earth,11.2,24.0,1.014,-3.86,0.367,5514
Moon,2.4,708.7,3e-15,-0.08,0.12,3340
Mars,5.0,24.7,0.01,-1.52,0.15,3933
Jupiter,59.5,9.9,2.0,-9.4,0.52,1326
Saturn,35.5,10.7,1000.0,-8.88,0.47,687
Titan,2.6,382.0,1.6,-8.1,0.21,1882
Uranus,21.3,17.2,1000.0,-7.19,0.51,1271
Neptune,23.5,16.1,1000.0,-6.87,0.41,1638


In [10]:
# Fit the scaler/encoder on the selected training features and keep the fitted
# objects so the identical transformation can be applied to the prediction rows.
planets_stdz, scaler, cat_encoder = train_preprocess(non_coll)
planets_stdz

Unnamed: 0,escape_vel_km_s,day_len_hr,surface_pressure_bars,"V(1,0) (mag)",geometric_albedo,density_kg_m3
Mercury,-0.686002,2.56817,-0.638729,1.224812,-1.374401,1.429504
Venus,-0.3311,1.514217,-0.429988,0.075589,1.732669,1.323681
Earth,-0.284556,-0.546799,-0.636428,0.256733,0.116307,1.47954
Moon,-0.796545,-0.038815,-0.638729,1.37923,-1.29444,0.229214
Mars,-0.645276,-0.546279,-0.638706,0.951612,-1.123094,0.570264
Jupiter,2.525566,-0.557259,-0.634191,-1.388408,0.990171,-0.929091
Saturn,1.129232,-0.556666,1.6302,-1.233991,0.704595,-1.296597
Titan,-0.784909,-0.281196,-0.635099,-1.002364,-0.780402,-0.609321
Uranus,0.303068,-0.551843,1.6302,-0.732134,0.933056,-0.960723
Neptune,0.431065,-0.55266,1.6302,-0.637107,0.361903,-0.749652


In [15]:
# NOTE(review): the saved output of this cell looks stale. It lists a Pluto row that is
# not in the `planets` frame displayed above, and the execution count is out of sequence.
# Restart the kernel and run all cells before relying on it.
y_train

name
Mercury       0.3300
Venus         4.8700
Earth         5.9700
Moon          0.0730
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Titan         0.1260
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
Name: mass_1024kg, dtype: float64

## Train model

In [11]:
# Ordinary least-squares fit on the standardized training features
model = LinearRegression()
model.fit(planets_stdz, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [41]:
# One coefficient per column of planets_stdz, in column order
model.coef_

array([ 578.7745756 ,   45.3732546 , -256.66857878,   42.3611936 ,
        -21.23685918, -144.45464598])

In [42]:
# R^2 computed on the same rows the model was fitted on. This is a training score,
# not an estimate of how well the model predicts unseen bodies.
model.score(planets_stdz, y_train)

0.9996082335061778

## Make Predictions

In [13]:
# Apply the scaler/encoder fitted on the training data to the two Mars variants
test_stdz = test_preprocess(X_test, select_features, scaler, cat_encoder)
test_stdz

Unnamed: 0,escape_vel_km_s,day_len_hr,surface_pressure_bars,"V(1,0) (mag)",geometric_albedo,density_kg_m3
Mars_low,-0.645276,-0.546279,-0.637663,0.951612,-1.123094,0.570264
Mars_high,-0.645276,-0.546279,-0.629653,0.951612,-1.123094,0.570264


In [14]:
# Predicted mass (in the units of mass_1024kg) for Mars_low and Mars_high.
# NOTE(review): the saved output shows negative values, which are not physically
# meaningful masses. Treat these predictions with caution.
mass_pred = model.predict(test_stdz)
mass_pred

array([-10.3639144 , -12.41965455])