# Entry 10 notebook - Reorder Pre-processing and Make Prediction - Atmospheric Mass

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old
# name was later removed; use the old name when it exists, otherwise the renamed one,
# so the notebook still runs top to bottom on both old and current matplotlib.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce

### Custom functions

In [2]:
def feature_corr_coll(df, target, corr_type='spearman', target_threshold=0.5, collinear_threshold=0.9):
    """Pick features that correlate with the target and drop mutually collinear ones.

    Steps:
      1. Correlate every numeric column of ``df`` with ``target``.
      2. Keep columns whose absolute correlation is >= ``target_threshold``,
         ordered from strongest to weakest.
      3. Walk the kept columns in that order and drop any column whose absolute
         correlation with an earlier column (kept or already dropped) exceeds
         ``collinear_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate features. Non-numeric columns are ignored.
    target : pandas.Series
        Target values, aligned to ``df`` by index.
    corr_type : str, default 'spearman'
        Correlation method passed to ``corrwith`` / ``corr``.
    target_threshold : float, default 0.5
        Minimum absolute correlation with the target for a feature to be kept.
    collinear_threshold : float, default 0.9
        Absolute feature-to-feature correlation above which the later
        (weaker) feature is dropped.

    Returns
    -------
    select_features : list
        Names of the surviving columns, strongest target correlation first.
    selected_df : pandas.DataFrame
        ``df`` restricted to ``select_features``.
    """
    # Correlations are only defined for numeric data; selecting explicitly keeps the
    # function working on pandas versions where corrwith/corr no longer skip
    # non-numeric columns silently.
    numeric_df = df.select_dtypes('number')

    strength = numeric_df.corrwith(target, axis=0, method=corr_type).abs()
    # NaN correlations (e.g. constant columns) compare False here and are dropped.
    strength = strength[strength >= target_threshold].sort_values(ascending=False)
    top_features = strength.index.tolist()

    df_top = numeric_df[top_features]
    feature_corr = df_top.corr(method=corr_type)

    collinear_features = []
    for position, colname in enumerate(feature_corr.columns):
        # Only look at features ranked before this one (lower triangle of the matrix).
        earlier = feature_corr.iloc[position, :position]
        # Test each correlation on its own: summing them (as before) lets a strong
        # positive and a strong negative correlation cancel to 0 and go unnoticed.
        if (earlier.abs() > collinear_threshold).any():
            collinear_features.append(colname)

    selected_df = df_top.drop(columns=collinear_features)
    select_features = selected_df.columns.tolist()

    return select_features, selected_df

In [3]:
def _as_frame(values, columns, index):
    """Wrap transformer output in a DataFrame carrying the given row index."""
    if isinstance(values, pd.DataFrame):
        # Keep the transformer's own column names (an encoder may rename or expand columns).
        frame = values.copy()
        frame.index = index
        return frame
    return pd.DataFrame(values, columns=columns, index=index)


def preprocess_data(train_df, test_df, select_features, scaler=None, encoder=None):
    """Scale numeric features and encode categorical features for train and test data.

    Both transformers are fitted on the training data only and then applied to the
    test data, so no information from the test rows leaks into the fit.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training features.
    test_df : pandas.DataFrame
        Test features; must contain every column that is processed.
    select_features : list-like or None
        Columns of ``train_df`` to process. ``None`` or an empty sequence means
        "use every column of ``train_df``".
    scaler : object, optional
        Instance exposing ``fit_transform`` / ``transform``. Defaults to a fresh
        ``StandardScaler()`` per call, so no fitted state is shared between calls.
    encoder : callable, optional
        Encoder class, called as ``encoder(cols=cat_features)``. Defaults to
        ``ce.OrdinalEncoder``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Processed train and test frames: numeric columns first, then categorical
        columns, with any remaining missing values replaced by -1.
    """
    if scaler is None:
        scaler = StandardScaler()
    if encoder is None:
        encoder = ce.OrdinalEncoder

    # Previously this argument was accepted but ignored.
    if select_features is not None and len(select_features) > 0:
        train_df = train_df[list(select_features)]

    index = train_df.index.tolist()
    test_index = test_df.index.tolist()

    train_parts = []
    test_parts = []

    num_features = train_df.select_dtypes('number').columns.tolist()
    if num_features:
        num_scale = scaler.fit_transform(train_df[num_features])
        train_parts.append(_as_frame(num_scale, num_features, index))

        test_num_scale = scaler.transform(test_df[num_features])
        test_parts.append(_as_frame(test_num_scale, num_features, test_index))

    cat_features = train_df.select_dtypes('object').columns.tolist()
    # Skip encoding entirely when there is nothing to encode, rather than
    # fitting an encoder on an empty frame.
    if cat_features:
        cat_encoder = encoder(cols=cat_features)
        cat_encode = cat_encoder.fit_transform(train_df[cat_features])
        train_parts.append(_as_frame(cat_encode, cat_features, index))

        test_cat_encode = cat_encoder.transform(test_df[cat_features])
        test_parts.append(_as_frame(test_cat_encode, cat_features, test_index))

    if not train_parts:
        # No numeric or object columns at all: return empty frames that keep the row labels.
        return pd.DataFrame(index=index), pd.DataFrame(index=test_index)

    train_out = pd.concat(train_parts, axis=1).fillna(-1)
    test_out = pd.concat(test_parts, axis=1).fillna(-1)

    return train_out, test_out

### Get and 'split' data

The 'split' here is only a stand-in: rather than splitting this tiny dataset into train and test sets, I create two new rows to predict on. I'm keeping the step as a placeholder so I don't forget to add a real train/test split in the next iteration, when I start using bigger datasets.

In [4]:
# Load the planets/moons table; index_col=0 turns the first spreadsheet column into
# the row index, which later cells rely on for label lookups such as 'Mars'.
planets = pd.read_excel('../data/planets_moons.xlsx', index_col=0)
# planets

In [5]:
# Stand-in 'test' set used to make a prediction: two copies of the Mars row
# (without the target column), one per hypothetical surface pressure.
mars_features = planets.drop(columns='atmospheric_mass_kg').loc[['Mars']]
X_test = pd.concat([mars_features, mars_features])
X_test.index = ['Mars_low', 'Mars_high']

# Low end: surface pressure based on conditions on Mt Everest.
X_test.at['Mars_low', 'surface_pressure_bars'] = 0.47
# High end: surface pressure based on nitrogen narcosis experienced by divers.
X_test.at['Mars_high', 'surface_pressure_bars'] = 4
X_test

Unnamed: 0,type,mass_1024kg,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,...,obliquity_to_orbit_degrees,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo
Mars_low,planet,0.642,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,...,25.2,-65,0.47,2,No,No,3396.19,3389.5,-1.52,0.15
Mars_high,planet,0.642,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,...,25.2,-65,4.0,2,No,No,3396.19,3389.5,-1.52,0.15


### Separate the target value

In [6]:
# Target: atmospheric mass. Features: every other column of the table.
y_train = planets['atmospheric_mass_kg']
X_train = planets.drop(columns=['atmospheric_mass_kg'])

### Determine collinearity and narrow features

In [7]:
# Keep the features that correlate with the target and are not flagged as collinear
# with a stronger feature (see feature_corr_coll for the thresholds).
select_features, non_coll = feature_corr_coll(df=X_train, target=y_train)
non_coll

Unnamed: 0_level_0,diameter_km,day_len_hr,"V(1,0) (mag)",surface_pressure_bars,geometric_albedo,density_kg_m3,orbital_inclination_degrees,rotation_period_hr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mercury,4879.0,4222.6,-0.6,1e-14,0.106,5427,7.0,1407.6
Venus,12104.0,2802.0,-4.47,92.0,0.65,5243,3.4,-5832.5
Earth,12756.0,24.0,-3.86,1.014,0.367,5514,0.0,23.9
Moon,3475.0,708.7,-0.08,3e-15,0.12,3340,5.1,655.7
Mars,6792.0,24.7,-1.52,0.01,0.15,3933,1.9,24.6
Jupiter,142984.0,9.9,-9.4,2.0,0.52,1326,1.3,9.9
Saturn,120536.0,10.7,-8.88,1000.0,0.47,687,2.5,10.7
Titan,5149.4,382.0,-8.1,1.6,0.21,1882,0.3,382.0
Uranus,51118.0,17.2,-7.19,1000.0,0.51,1271,0.8,-17.2
Neptune,49528.0,16.1,-6.87,1000.0,0.41,1638,1.8,16.1


### Preprocess train and test

In [8]:
# Fit the preprocessing on the selected training features and apply it to the test rows.
planets_stdz, test_stdz = preprocess_data(
    train_df=non_coll,
    test_df=X_test,
    select_features=select_features,
)
planets_stdz

Unnamed: 0,diameter_km,day_len_hr,"V(1,0) (mag)",surface_pressure_bars,geometric_albedo,density_kg_m3,orbital_inclination_degrees,rotation_period_hr
Mercury,-0.681977,2.56817,1.224812,-0.638729,-1.374401,1.429504,0.690697,0.959204
Venus,-0.53059,1.514217,0.075589,-0.429988,1.732669,1.323681,-0.075454,-3.070742
Earth,-0.516928,-0.546799,0.256733,-0.636428,0.116307,1.47954,-0.799041,0.189016
Moon,-0.711396,-0.038815,1.37923,-0.638729,-1.29444,0.229214,0.286339,0.540685
Mars,-0.641894,-0.546279,0.951612,-0.638706,-1.123094,0.570264,-0.394684,0.189406
Jupiter,2.211778,-0.557259,-1.388408,-0.634191,0.990171,-0.929091,-0.522376,0.181224
Saturn,1.741418,-0.556666,-1.233991,1.6302,0.704595,-1.296597,-0.266992,0.181669
Titan,-0.676311,-0.281196,-1.002364,-0.635099,-0.780402,-0.609321,-0.735195,0.38834
Uranus,0.286882,-0.551843,-0.732134,1.6302,0.933056,-0.960723,-0.628786,0.166139
Neptune,0.253567,-0.55266,-0.637107,1.6302,0.361903,-0.749652,-0.415966,0.184675


In [9]:
# Preprocessed test rows; the two Mars scenarios differ only in surface_pressure_bars.
test_stdz

Unnamed: 0,diameter_km,day_len_hr,"V(1,0) (mag)",surface_pressure_bars,geometric_albedo,density_kg_m3,orbital_inclination_degrees,rotation_period_hr
Mars_low,-0.641894,-0.546279,0.951612,-0.637663,-1.123094,0.570264,-0.394684,0.189406
Mars_high,-0.641894,-0.546279,0.951612,-0.629653,-1.123094,0.570264,-0.394684,0.189406


In [10]:
# Target on its raw scale (kg); note how many orders of magnitude the values span.
y_train

name
Mercury    1.000000e+03
Venus      4.800000e+20
Earth      1.400000e+21
Moon       1.000000e+05
Mars       2.500000e+16
Jupiter    1.900000e+27
Saturn     5.400000e+26
Titan      9.100000e+18
Uranus     8.600000e+25
Neptune    1.000000e+26
Pluto      1.300000e+14
Name: atmospheric_mass_kg, dtype: float64

## Train model

In [11]:
# Ordinary least squares on the preprocessed features.
# NOTE(review): the target is fitted on its raw scale, which spans many orders of
# magnitude, and the predictions further down come out as negative masses.
# Fitting on log10(y_train) and back-transforming the predictions may be worth trying.
model = LinearRegression().fit(X=planets_stdz, y=y_train)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Fitted coefficients, one per column of planets_stdz (same order as its columns).
model.coef_

array([ 4.65129803e+26,  1.04857854e+26,  2.15501708e+26, -4.09378543e+26,
        3.04332425e+26, -2.64312434e+26, -1.45384152e+26,  1.71555552e+26])

In [13]:
# R^2 computed on the same rows the model was fitted on (there is no held-out set),
# so this measures fit to the training data, not predictive accuracy.
model.score(X=planets_stdz, y=y_train)

0.9748029733394393

## Make Predictions

In [14]:
# Predicted atmospheric mass (kg) for the low- and high-pressure Mars scenarios.
# NOTE(review): the recorded values are negative, which is not physically meaningful.
mass_pred = model.predict(X=test_stdz)
mass_pred

array([-5.36462064e+25, -5.69250491e+25])