In [1]:
import sys
import os
import subprocess
import datetime
import pandas as pd
import numpy as np
import pygeoprocessing
import matplotlib.pyplot as plt
from osgeo import gdal

## Relationship analysis - correlations, plots, breakdowns
### Dataframe cleaning & preprocessing

In [None]:
# Reload the three merged carbon tables that were previously saved to CSV.
# The *_csv_path names are defined outside this cell.
carbon_access_merged, carbon_influence_merged, carbon_population_merged = (
    pd.read_csv(saved_csv)
    for saved_csv in (
        carbon_access_csv_path,
        carbon_influence_csv_path,
        carbon_population_csv_path,
    )
)

In [None]:
# Preview the first five rows of the carbon/access table.
carbon_access_merged.head(n=5)

Clean up null or incoherent values: drop rows where `value` is negative or `carbon` is below 50.

In [None]:
# Cleaning thresholds, named so they are easy to find and adjust.
MIN_VALID_VALUE = 0    # rows whose 'value' is below this are dropped
MIN_VALID_CARBON = 50  # rows whose 'carbon' is below this are dropped

print("Number of records before cleaning:", len(carbon_access_merged))

# Collect the index labels of every row failing either threshold and drop them
# in a single in-place pass. Rows with NaN in these columns compare False and
# are therefore kept.
invalid_rows = carbon_access_merged.index[
    (carbon_access_merged['value'] < MIN_VALID_VALUE)
    | (carbon_access_merged['carbon'] < MIN_VALID_CARBON)
]
carbon_access_merged.drop(invalid_rows, inplace=True)

print("Number of records after cleaning:", len(carbon_access_merged))

Sanity check: are there any rows where the merged dataframes do not align, and do coherent (positive) values exist?

In [None]:
# Check for rows where the coordinates of the two merged sources do not match.
# A row is misaligned if EITHER its longitude OR its latitude disagrees, so the
# two tests are combined with `|`; with `&`, a row that differs in only one
# coordinate would go unreported.
# Coordinates are compared after truncation to whole numbers via astype(int).
long_mismatch = (carbon_access_merged['long_carbon'].astype(int) !=
                 carbon_access_merged['long_mins'].astype(int))
lat_mismatch = (carbon_access_merged['lat_carbon'].astype(int) !=
                carbon_access_merged['lat_mins'].astype(int))
diff = carbon_access_merged.loc[long_mismatch | lat_mismatch]
# Number of misaligned rows; 0 means the merge lined up everywhere.
print(len(diff))

# Check that rows with strictly positive carbon and value exist
real_vals = carbon_access_merged.loc[
    (carbon_access_merged['carbon'] > 0) &
    (carbon_access_merged['value'] > 0)]
real_vals.head()

### Looking for correlations

In [None]:
# Show the merged carbon/access frame as this cell's output, for a visual
# check before the correlation cells below.
carbon_access_merged

Is the correlation between covariates better than random?

In [None]:
# Is the correlation better than random?
# Baseline: correlate each covariate with a column of uniform random noise.
# A dedicated, seeded RandomState keeps the baseline (and the printed numbers)
# reproducible across runs without touching numpy's global random state.
baseline_rng = np.random.RandomState(42)
carbon_access_merged['rand'] = baseline_rng.random_sample(carbon_access_merged.shape[0])
column_corr = carbon_access_merged['carbon'].corr(carbon_access_merged['value'])
random_corr_carbon = carbon_access_merged['carbon'].corr(carbon_access_merged['rand'])
random_corr_value = carbon_access_merged['value'].corr(carbon_access_merged['rand'])
# This is the access frame: 'value' is the column plotted as "Minutes to
# market" further down, so the labels name that covariate.
print("Correlation between carbon and minutes to market:", column_corr, 
      "\n Correlation between carbon and random:", random_corr_carbon, 
      "\n Correlation between minutes to market and random:", random_corr_value)

In [None]:
# Print correlation matrix
import seaborn as sns

# Pairwise correlations between the frame's columns, drawn as a heatmap with
# the column names as tick labels on both axes.
corr = carbon_access_merged.corr()
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

## Plotting covariate relationships

In [None]:
# Scatter of log-transformed carbon against 'value' (minutes to market).
# The figure size has to be given when the figure is created; assigning
# `plt.figsize` afterwards only sets an unused attribute on the pyplot module.
fig, ax = plt.subplots(figsize=(20, 20))
ax.scatter(np.log(carbon_access_merged['carbon']),
           carbon_access_merged['value'],
           s=1)
ax.set_title("Carbon and minutes to market")
ax.set_ylabel("Minutes to market")
# The x values are natural-log transformed, so the label says so.
ax.set_xlabel("log(ABG biomass)")
plt.show()

In [None]:
import seaborn as sns
# NOTE(review): print_start_time is defined outside this section; presumably it
# logs the message with a timestamp - confirm.
print_start_time("Plotting KDE...")
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
g = sns.jointplot(x="carbon", y="value", data=carbon_access_merged, kind="kde", space=0, color="g")