# Entry 6 notebook - EDA correlation notebook

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the bare 'seaborn' alias, so pick whichever name the
# installed matplotlib actually provides instead of hard-coding the old one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

### Load data

In [2]:
# Read the planets/moons workbook; the first spreadsheet column becomes the
# row index (shown as `name` in the rendered output).
planets = pd.read_excel(
    io='../data/planets_moons.xlsx',
    index_col=0,
)
planets

Unnamed: 0_level_0,type,mass_1024kg,diameter_km,density_kg_m3,gravity_m_s2,escape_vel_km_s,rotation_period_hr,day_len_hr,distance_from_sun_106_km,perihelion_106 km,...,mean_temp_c,surface_pressure_bars,nbr_moons,rings,magnetic_field,equatorial_radius_km,mean_radius_km,"V(1,0) (mag)",geometric_albedo,atmospheric_mass_kg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mercury,planet,0.33,4879.0,5427,3.7,4.3,1407.6,4222.6,57.9,46.0,...,167,1e-14,0,No,Yes,2440.53,2439.4,-0.6,0.106,1000.0
Venus,planet,4.87,12104.0,5243,8.9,10.4,-5832.5,2802.0,108.2,107.5,...,464,92.0,0,No,No,6051.8,6051.8,-4.47,0.65,4.8e+20
Earth,planet,5.97,12756.0,5514,9.8,11.2,23.9,24.0,149.6,147.1,...,15,1.014,1,No,Yes,6378.1366,6371.0084,-3.86,0.367,1.4e+21
Moon,moon,0.073,3475.0,3340,1.6,2.4,655.7,708.7,149.6,147.1,...,-20,3e-15,0,No,No,1737.5,1737.4,-0.08,0.12,100000.0
Mars,planet,0.642,6792.0,3933,3.7,5.0,24.6,24.7,227.9,206.6,...,-65,0.01,2,No,No,3396.19,3389.5,-1.52,0.15,2.5e+16
Jupiter,planet,1898.0,142984.0,1326,23.1,59.5,9.9,9.9,778.6,740.5,...,-110,2.0,79,Yes,Yes,71492.0,69911.0,-9.4,0.52,1.9e+27
Saturn,planet,568.0,120536.0,687,9.0,35.5,10.7,10.7,1433.5,1352.6,...,-140,1000.0,82,Yes,Yes,60268.0,58232.0,-8.88,0.47,5.4e+26
Titan,moon,0.126,5149.4,1882,1.4,2.6,382.0,382.0,1433.5,1352.6,...,-179,1.6,0,No,No,2574.7,2574.7,-8.1,0.21,9.1e+18
Uranus,planet,86.8,51118.0,1271,8.7,21.3,-17.2,17.2,2872.5,2741.3,...,-195,1000.0,27,Yes,Yes,25559.0,25362.0,-7.19,0.51,8.6e+25
Neptune,planet,102.0,49528.0,1638,11.0,23.5,16.1,16.1,4495.1,4444.5,...,-200,1000.0,14,Yes,Yes,24764.0,24622.0,-6.87,0.41,1e+26


### Locate categorical variables

In [4]:
# Columns still stored with the generic `object` dtype are the categorical
# candidates to convert before running correlations.
planets.select_dtypes(include='object')

Unnamed: 0_level_0,type,rings,magnetic_field
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mercury,planet,No,Yes
Venus,planet,No,No
Earth,planet,No,Yes
Moon,moon,No,No
Mars,planet,No,No
Jupiter,planet,Yes,Yes
Saturn,planet,Yes,Yes
Titan,moon,No,No
Uranus,planet,Yes,Yes
Neptune,planet,Yes,Yes


### Convert categoricals

In [5]:
# Cast each text column to pandas' `category` dtype, one column at a time.
for column_name in ('type', 'rings', 'magnetic_field'):
    planets[column_name] = planets[column_name].astype('category')

In [6]:
# Swap every categorical column for its integer category codes so the column
# can take part in the numeric correlations run later in the notebook.
# NOTE(review): the original labels are overwritten in `planets`; the codes
# are only positions in each column's category list, not a measured scale.
for column_name, column_dtype in planets.dtypes.items():
    if isinstance(column_dtype, pd.CategoricalDtype):
        planets[column_name] = planets[column_name].cat.codes

In [7]:
# Spot-check the first five rows of the freshly encoded columns.
planets.loc[:, ['type', 'magnetic_field', 'rings']].head(5)

Unnamed: 0_level_0,type,magnetic_field,rings
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mercury,2,1,0
Venus,2,0,0
Earth,2,1,0
Moon,1,0,0
Mars,2,0,0


### Run correlations

In [26]:
# Pearson correlation of every column against atmospheric mass, reshaped into
# a two-column frame: `index` (the column name) and `pearson_cor`.
pearson_feature_corr = (
    planets
    .corrwith(other=planets['atmospheric_mass_kg'], method='pearson')
    .to_frame(name='pearson_cor')
    .reset_index()
)

In [27]:
# Spearman (rank) correlation of every column against atmospheric mass,
# reshaped into a two-column frame: `index` (the column name) and
# `spearman_cor`.
spearman_feature_corr = (
    planets
    .corrwith(other=planets['atmospheric_mass_kg'], method='spearman')
    .to_frame(name='spearman_cor')
    .reset_index()
)

### Combine and sort

I combined the Pearson and Spearman results to compare them.

Created a column with the absolute value of the spearman result. This allowed me to sort by the absolute value of the Spearman value. I don't care if it's a positive or negative relationship, just how strong the relationship is.

In [11]:
# Line the two correlation tables up by column name and add the magnitude of
# the Spearman value, so features can be ranked by strength regardless of sign.
feature_corr = (
    pearson_feature_corr
    .merge(spearman_feature_corr, how='inner', on='index')
    .assign(abs_spearman=lambda corr: corr['spearman_cor'].abs())
)
feature_corr.sort_values(by='abs_spearman', ascending=False)

Unnamed: 0,index,pearson_cor,spearman_cor,abs_spearman
25,atmospheric_mass_kg,1.0,1.0,1.0
2,diameter_km,0.862912,0.945455,0.945455
22,mean_radius_km,0.864138,0.945455,0.945455
21,equatorial_radius_km,0.862915,0.945455,0.945455
5,escape_vel_km_s,0.910232,0.918182,0.918182
1,mass_1024kg,0.999894,0.918182,0.918182
7,day_len_hr,-0.242968,-0.881818,0.881818
23,"V(1,0) (mag)",-0.570149,-0.881818,0.881818
17,surface_pressure_bars,-0.003563,0.853247,0.853247
19,rings,0.577273,0.83666,0.83666


### Correlation method comparison

The different quantitative values for correlation are interesting.

Spearman ranked diameter_km, mean_radius_km, and equatorial_radius_km all equally correlated to atmospheric_mass_kg, which would be expected as they all measure the same thing. Spearman also gave them more importance than Pearson.

Pearson ranked mass_1024kg as more important than Spearman did, which would be good for my theory that changing the mass will allow the planet to hold onto more atmosphere. But having discussed the issue with a friend, I'm wondering if atmospheric mass is included in planetary mass. If this were a real-world problem where the result mattered, I'd look into it. However, since this is a spur for a fictional story, I'm going to ignore it.

Spearman ranked escape_vel_km_s the same as mass_1024kg, and put both at about the same importance as diameter/radius. I would have expected gravity_m_s2 and density to be in with this feature set as mass, density, diameter, and gravity are [all mathematically related](https://earthobservatory.nasa.gov/features/GRACE/page2.php) and escape velocity is a direct result of gravity.

Based on the prominence of magnetic fields in the literature, I expected magnetic_field to have a stronger correlation.

Several of the lower correlations have opposite signs depending on which method is used; for example, orbital_period_days and distance_from_sun_106_km.

## Narrow features

Sabber and I originally decided 20 features was a good starting place. After the results of the visualization exercises in Entry 5 (where I ended up with 22 visualizations for 5-8 features), I decided to narrow this a little further. My cutoff is now an absolute Spearman value greater than or equal to 0.5.

I also ordered the columns by most important to least important features (as related to atmospheric_mass_kg).

In [25]:
# Keep the columns whose absolute Spearman correlation with atmospheric mass
# is at least 0.5, ordered from strongest to weakest. The target column itself
# (correlation 1.0) passes the filter and therefore comes first.
features = (
    feature_corr
    .loc[lambda corr: corr['abs_spearman'].ge(0.5)]
    .sort_values(by='abs_spearman', ascending=False)
    ['index']
    .to_list()
)
planets[features]

Unnamed: 0_level_0,atmospheric_mass_kg,diameter_km,mean_radius_km,equatorial_radius_km,mass_1024kg,escape_vel_km_s,"V(1,0) (mag)",day_len_hr,surface_pressure_bars,rings,gravity_m_s2,geometric_albedo,nbr_moons,density_kg_m3,magnetic_field,orbital_inclination_degrees,type,rotation_period_hr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Mercury,1000.0,4879.0,2439.4,2440.53,0.33,4.3,-0.6,4222.6,1e-14,0,3.7,0.106,0,5427,1,7.0,2,1407.6
Venus,4.8e+20,12104.0,6051.8,6051.8,4.87,10.4,-4.47,2802.0,92.0,0,8.9,0.65,0,5243,0,3.4,2,-5832.5
Earth,1.4e+21,12756.0,6371.0084,6378.1366,5.97,11.2,-3.86,24.0,1.014,0,9.8,0.367,1,5514,1,0.0,2,23.9
Moon,100000.0,3475.0,1737.4,1737.5,0.073,2.4,-0.08,708.7,3e-15,0,1.6,0.12,0,3340,0,5.1,1,655.7
Mars,2.5e+16,6792.0,3389.5,3396.19,0.642,5.0,-1.52,24.7,0.01,0,3.7,0.15,2,3933,0,1.9,2,24.6
Jupiter,1.9e+27,142984.0,69911.0,71492.0,1898.0,59.5,-9.4,9.9,2.0,1,23.1,0.52,79,1326,1,1.3,2,9.9
Saturn,5.4e+26,120536.0,58232.0,60268.0,568.0,35.5,-8.88,10.7,1000.0,1,9.0,0.47,82,687,1,2.5,2,10.7
Titan,9.1e+18,5149.4,2574.7,2574.7,0.126,2.6,-8.1,382.0,1.6,0,1.4,0.21,0,1882,0,0.3,1,382.0
Uranus,8.6e+25,51118.0,25362.0,25559.0,86.8,21.3,-7.19,17.2,1000.0,1,8.7,0.51,27,1271,1,0.8,2,-17.2
Neptune,1e+26,49528.0,24622.0,24764.0,102.0,23.5,-6.87,16.1,1000.0,1,11.0,0.41,14,1638,1,1.8,2,16.1
