# Entry 9 notebook - Train a Model

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

### Custom functions

In [2]:
def convert_categoricals(df):
    """Return a copy of ``df`` with every object-dtype column integer-encoded.

    Each object column is cast to pandas' ``category`` dtype and replaced by
    its category codes. Columns of any other dtype are copied unchanged, and
    the input frame itself is not modified.
    """
    encoded = df.copy()
    for name in df.select_dtypes('object').columns.tolist():
        # Categories are derived per column, so codes are only comparable
        # within a single column.
        encoded[name] = df[name].astype('category').cat.codes
    return encoded

In [3]:
def feature_corr(df, target, corr_type='spearman', threshold=0.5):
    """Select the features most correlated with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; must contain the ``target`` column.
    target : str
        Name of the column every other column is correlated against.
    corr_type : str, default 'spearman'
        Correlation method, passed to ``DataFrame.corrwith`` and
        ``DataFrame.corr``.
    threshold : float, default 0.5
        Minimum absolute correlation with ``target`` for a column to be kept.

    Returns
    -------
    df_top : pandas.DataFrame
        The columns of ``df`` whose absolute correlation with ``target`` is at
        least ``threshold``, ordered from strongest to weakest. ``target``
        itself is normally included, since it correlates perfectly with itself.
    top_corr : pandas.DataFrame
        Pairwise correlation matrix of ``df_top``.
    """
    target_corr = df.corrwith(df[target], axis=0, method=corr_type)
    strength = target_corr.abs()
    # Work on the Series index directly rather than on a reset_index() frame:
    # the latter only yields an 'index' column when df.columns is unnamed.
    top_features = (
        strength[strength >= threshold]
        .sort_values(ascending=False)
        .index
        .tolist()
    )
    df_top = df[top_features]
    top_corr = df_top.corr(method=corr_type)
    return df_top, top_corr

In [4]:
def feature_stdz(df, scaler=None, fit=True):
    """Scale the columns of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data to scale.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. When omitted, a
        fresh ``StandardScaler`` is created for this call, so no fitted state
        is shared between calls.
    fit : bool, default True
        If True, fit ``scaler`` on ``df`` and transform it. If False, only
        apply the already-fitted ``scaler``; use this to scale new rows with
        the statistics learned from the training data.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` is supplied.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted scaler")
        scaler = StandardScaler()
    scaled = scaler.fit_transform(df) if fit else scaler.transform(df)
    # Reuse the original index/columns so their labels and names survive.
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)

### Get data and preprocess

In [5]:
planets = pd.read_excel('../data/planets_moons.xlsx', index_col=0)
planets_num = convert_categoricals(planets)
# Bind the correlation matrix to its own name. Assigning it to `feature_corr`
# would overwrite the function of that name and make this cell fail on re-run.
df_top, top_feature_corr = feature_corr(planets_num, 'atmospheric_mass_kg')

select_features = ['atmospheric_mass_kg', 'diameter_km', 'mass_1024kg', 'V(1,0) (mag)', 'day_len_hr', 'surface_pressure_bars', 'rings', 'geometric_albedo', 'density_kg_m3', 'magnetic_field', 'orbital_inclination_degrees', 'type', 'rotation_period_hr']
select_df = df_top[select_features]

# Keep the fitted scaler in a named variable so the same scaling can later be
# applied to new rows with scaler.transform(...).
scaler = StandardScaler()
planets_stdz = feature_stdz(select_df, scaler=scaler)
planets_stdz

Unnamed: 0,atmospheric_mass_kg,diameter_km,mass_1024kg,"V(1,0) (mag)",day_len_hr,surface_pressure_bars,rings,geometric_albedo,density_kg_m3,magnetic_field,orbital_inclination_degrees,type,rotation_period_hr
Mercury,-0.436378,-0.681977,-0.442264,1.224812,2.56817,-0.638729,-0.755929,-1.374401,1.429504,0.912871,0.690697,0.565685,0.959204
Venus,-0.436377,-0.53059,-0.43397,0.075589,1.514217,-0.429988,-0.755929,1.732669,1.323681,-1.095445,-0.075454,0.565685,-3.070742
Earth,-0.436375,-0.516928,-0.431961,0.256733,-0.546799,-0.636428,-0.755929,0.116307,1.47954,0.912871,-0.799041,0.565685,0.189016
Moon,-0.436378,-0.711396,-0.442733,1.37923,-0.038815,-0.638729,-0.755929,-1.29444,0.229214,-1.095445,0.286339,-0.989949,0.540685
Mars,-0.436378,-0.641894,-0.441694,0.951612,-0.546279,-0.638706,-0.755929,-1.123094,0.570264,-1.095445,-0.394684,0.565685,0.189406
Jupiter,3.036693,2.211778,3.024239,-1.388408,-0.557259,-0.634191,1.322876,0.990171,-0.929091,0.912871,-0.522376,0.565685,0.181224
Saturn,0.550706,1.741418,0.594708,-1.233991,-0.556666,1.6302,1.322876,0.704595,-1.296597,0.912871,-0.266992,0.565685,0.181669
Titan,-0.436377,-0.676311,-0.442636,-1.002364,-0.281196,-0.635099,-0.755929,-0.780402,-0.609321,-1.095445,-0.735195,-0.989949,0.38834
Uranus,-0.279175,0.286882,-0.284308,-0.732134,-0.551843,1.6302,1.322876,0.933056,-0.960723,0.912871,-0.628786,0.565685,0.166139
Neptune,-0.253584,0.253567,-0.256541,-0.637107,-0.55266,1.6302,1.322876,0.361903,-0.749652,0.912871,-0.415966,0.565685,0.184675


### Missing values

Check for missing values. As the output below shows, there are none in this dataset (it was hand-curated, and I removed them all). Handling missing values will still need to be addressed in the future.

In [6]:
planets_stdz.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, Mercury to Pluto
Data columns (total 13 columns):
atmospheric_mass_kg            11 non-null float64
diameter_km                    11 non-null float64
mass_1024kg                    11 non-null float64
V(1,0) (mag)                   11 non-null float64
day_len_hr                     11 non-null float64
surface_pressure_bars          11 non-null float64
rings                          11 non-null float64
geometric_albedo               11 non-null float64
density_kg_m3                  11 non-null float64
magnetic_field                 11 non-null float64
orbital_inclination_degrees    11 non-null float64
type                           11 non-null float64
rotation_period_hr             11 non-null float64
dtypes: float64(13)
memory usage: 1.2+ KB


## Train model

In [7]:
# Target: standardized planetary mass; features: every other standardized column.
y = planets_stdz.loc[:, 'mass_1024kg']
X = planets_stdz.drop(columns=['mass_1024kg'])
model = LinearRegression()

# Left as the last expression so the fitted estimator is displayed.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Learned coefficients of the fitted model; presumably one per column of X,
# in X.columns order (12 values are shown for the 12 feature columns).
model.coef_

array([ 0.14238478,  0.69706561,  0.0096957 ,  0.00509335, -0.32938854,
        0.31222586, -0.03180231, -0.0297368 ,  0.0237083 , -0.00429627,
       -0.00544434, -0.04911224])

In [9]:
# Score on the same X, y the model was fitted on (for scikit-learn regressors
# this should be R^2 -- confirm against the LinearRegression docs).
# NOTE(review): X has 12 feature columns but only 11 rows, so a perfect score
# here is expected and says nothing about how the model generalizes.
model.score(X, y)

1.0

## Make Predictions

### Create test set

In [13]:
# Build two copies of the (unscaled) Mars row that will differ only in
# surface pressure.
predict_on = pd.concat([select_df.loc[['Mars'], :] for _ in range(2)])
predict_on.index = ['Mars_low', 'Mars_high']

# Low scenario: based on pressure on Mt Everest
predict_on.loc['Mars_low', 'surface_pressure_bars'] = 0.47

# High scenario: based on nitrogen narcosis experienced by divers
predict_on.loc['Mars_high', 'surface_pressure_bars'] = 4

predict_on

Unnamed: 0,atmospheric_mass_kg,diameter_km,mass_1024kg,"V(1,0) (mag)",day_len_hr,surface_pressure_bars,rings,geometric_albedo,density_kg_m3,magnetic_field,orbital_inclination_degrees,type,rotation_period_hr
Mars_low,2.5e+16,6792.0,0.642,-1.52,24.7,0.47,0,0.15,3933,0,1.9,2,24.6
Mars_high,2.5e+16,6792.0,0.642,-1.52,24.7,4.0,0,0.15,3933,0,1.9,2,24.6


#### Stymied

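There is no way to apply the same scaling by fitting a scaler on these two rows alone: their mean and standard deviation would differ from those of the full dataset. To scale them consistently, the scaler that was fitted on the full dataset has to be kept and its `transform` method applied to these rows, rather than fitting a new one.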