In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [3]:
source = 'planets_moons.xlsx'
target = 'mass_1024kg'

def load_data(source, sheet_name=0, index_col=0, target=target):
    if source.endswith('.xlsx'):
        df = pd.read_excel(source, sheet_name, index_col=index_col)
    else:
        df = pd.read_csv(source, index_col=index_col)
    df_target = df[[target]]
    df_features = df.drop(target, axis=1)
    df_features = df_features[['type', 'density_kg_m3', 'gravity_m_s2',
       'escape_vel_km_s', 'rotation_period_hr', 'day_len_hr',
       'distance_from_sun_106_km', 'orbital_period_days', 'orbital_velocity_km_s',
       'orbital_inclination_degrees', 'orbital_eccentricity',
       'obliquity_to_orbit_degrees', 'mean_temp_c', 'surface_pressure_bars',
       'nbr_moons', 'rings', 'magnetic_field', 'V(1,0) (mag)', 'geometric_albedo']]
    return df_features, df_target

raw_features, raw_target = load_data(source)
raw_features

Unnamed: 0_level_0,type,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,orbital_period_days,orbital_velocity_km_s,orbital_inclination_degrees,orbital_eccentricity,obliquity_to_orbit_degrees,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,"V(1,0) (mag)",geometric_albedo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Mercury,planet,5427,3.7,4.3,1407.6,4222.6,57.9,88.0,47.4,7.0,0.205,0.034,167,1e-14,0,No,Yes,-0.6,0.106
Venus,planet,5243,8.9,10.4,-5832.5,2802.0,108.2,224.7,35.0,3.4,0.007,177.4,464,92.0,0,No,No,-4.47,0.65
Earth,planet,5514,9.8,11.2,23.9,24.0,149.6,365.2,29.8,0.0,0.017,23.4,15,1.014,1,No,Yes,-3.86,0.367
Moon,moon,3340,1.6,2.4,655.7,708.7,149.6,27.3,1.0,5.1,0.055,6.7,-20,3e-15,0,No,No,-0.08,0.12
Mars,planet,3933,3.7,5.0,24.6,24.7,227.9,687.0,24.1,1.9,0.094,25.2,-65,0.01,2,No,No,-1.52,0.15
Jupiter,planet,1326,23.1,59.5,9.9,9.9,778.6,4331.0,13.1,1.3,0.049,3.1,-110,2.0,79,Yes,Yes,-9.4,0.52
Saturn,planet,687,9.0,35.5,10.7,10.7,1433.5,10747.0,9.7,2.5,0.057,26.7,-140,1000.0,82,Yes,Yes,-8.88,0.47
Titan,moon,1882,1.4,2.6,382.0,382.0,1433.5,16.0,5.6,0.3,0.029,0.32,-179,1.6,0,No,No,-8.1,0.21
Uranus,planet,1271,8.7,21.3,-17.2,17.2,2872.5,30589.0,6.8,0.8,0.046,97.8,-195,1000.0,27,Yes,Yes,-7.19,0.51
Neptune,planet,1638,11.0,23.5,16.1,16.1,4495.1,59800.0,5.4,1.8,0.011,28.3,-200,1000.0,14,Yes,Yes,-6.87,0.41


In [11]:
raw_features.select_dtypes(np.number).columns

Index(['density_kg_m3', 'gravity_m_s2', 'escape_vel_km_s',
       'rotation_period_hr', 'day_len_hr', 'distance_from_sun_106_km',
       'orbital_period_days', 'orbital_velocity_km_s',
       'orbital_inclination_degrees', 'orbital_eccentricity',
       'obliquity_to_orbit_degrees', 'mean_temp_c', 'surface_pressure_bars',
       'nbr_moons', 'V(1,0) (mag)', 'geometric_albedo'],
      dtype='object')

In [6]:
def first_func(feature_df, target_df, encoder=ce.OrdinalEncoder, target=target, corr_type='spearman'):
    binary_features = list(set(feature_df.select_dtypes('object').columns.tolist()) & set(feature_df.nunique()[feature_df.nunique() <= 2].index.tolist()))
    binary_encoder = encoder(cols=binary_features)
    cat_features = list(set(feature_df.select_dtypes('object').columns.tolist()) & set(feature_df.nunique()[feature_df.nunique() > 2].index.tolist()))
    cat_encoder = encoder(cols=cat_features)
    feature_df = binary_encoder.fit_transform(feature_df)
    feature_df = cat_encoder.fit_transform(feature_df)
    numerical_features = feature_df.select_dtypes(np.number).columns
    categorical_features = feature_df.select_dtypes('category').columns
    
    # change this so you get the correlations of the numberical features only
    target_corr = pd.DataFrame(feature_df.corrwith(raw_target, axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type}).sort_values(corr_type, ascending=False)
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr.sort_values('abs', ascending=False).iloc[:21,:].sort_values(corr_type, ascending=False)['index'].tolist()
    df_top = df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    return target_corr, df_top, feature_corr
    return df

def feature_corr(df, target, corr_type='spearman'):
    target_corr = pd.DataFrame(df.corrwith(df[target], axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type}).sort_values(corr_type, ascending=False)
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr.sort_values('abs', ascending=False).iloc[:21,:].sort_values(corr_type, ascending=False)['index'].tolist()
    df_top = df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    return target_corr, df_top, feature_corr

target_corr, df_top, feature_corr = feature_corr(planets, 'mass_1024kg')

In [8]:
trial = first_func(raw_features)
trial

Unnamed: 0_level_0,type,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,orbital_period_days,orbital_velocity_km_s,orbital_inclination_degrees,orbital_eccentricity,obliquity_to_orbit_degrees,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,"V(1,0) (mag)",geometric_albedo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Mercury,planet,5427,3.7,4.3,1407.6,4222.6,57.9,88.0,47.4,7.0,0.205,0.034,167,1e-14,0,1,1,-0.6,0.106
Venus,planet,5243,8.9,10.4,-5832.5,2802.0,108.2,224.7,35.0,3.4,0.007,177.4,464,92.0,0,1,2,-4.47,0.65
Earth,planet,5514,9.8,11.2,23.9,24.0,149.6,365.2,29.8,0.0,0.017,23.4,15,1.014,1,1,1,-3.86,0.367
Moon,moon,3340,1.6,2.4,655.7,708.7,149.6,27.3,1.0,5.1,0.055,6.7,-20,3e-15,0,1,2,-0.08,0.12
Mars,planet,3933,3.7,5.0,24.6,24.7,227.9,687.0,24.1,1.9,0.094,25.2,-65,0.01,2,1,2,-1.52,0.15
Jupiter,planet,1326,23.1,59.5,9.9,9.9,778.6,4331.0,13.1,1.3,0.049,3.1,-110,2.0,79,2,1,-9.4,0.52
Saturn,planet,687,9.0,35.5,10.7,10.7,1433.5,10747.0,9.7,2.5,0.057,26.7,-140,1000.0,82,2,1,-8.88,0.47
Titan,moon,1882,1.4,2.6,382.0,382.0,1433.5,16.0,5.6,0.3,0.029,0.32,-179,1.6,0,1,2,-8.1,0.21
Uranus,planet,1271,8.7,21.3,-17.2,17.2,2872.5,30589.0,6.8,0.8,0.046,97.8,-195,1000.0,27,2,1,-7.19,0.51
Neptune,planet,1638,11.0,23.5,16.1,16.1,4495.1,59800.0,5.4,1.8,0.011,28.3,-200,1000.0,14,2,1,-6.87,0.41


In [None]:
def preprocess_data(df, scaler=StandardScaler(), encoder=ce.OrdinalEncoder):
    cat_features = df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    df = cat_encoder.fit_transform(df)
    columns = df.columns.tolist()
    index = df.index.tolist()
    df = scaler.fit_transform(df)
    df = pd.DataFrame(df, columns=columns, index=index)
    return scaler, cat_encoder, df

stdz_scaler, cat_encoder, stdz_df = preprocess_data(raw_df)
planets = stdz_df.loc[:, 'type':'atmospheric_mass_kg']
planets

In [None]:
def feature_corr(df, target, corr_type='spearman'):
    target_corr = pd.DataFrame(df.corrwith(df[target], axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type}).sort_values(corr_type, ascending=False)
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr.sort_values('abs', ascending=False).iloc[:21,:].sort_values(corr_type, ascending=False)['index'].tolist()
    df_top = df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    return target_corr, df_top, feature_corr

target_corr, df_top, feature_corr = feature_corr(planets, 'mass_1024kg')