In [257]:
# Imports
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [258]:
# reading in final dataframe
df = pd.read_csv('../data/cleaned_data/kahuna.csv')

-------------------------

## KMeans Clustering Preprocessing

In [3]:
X = df.drop(columns=['Entity', 'Year'])
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [4]:
km_2 = KMeans(n_clusters=2, random_state=42)
km_2.fit(X_sc)

KMeans(n_clusters=2, random_state=42)

In [5]:
df['clusterx2'] = km_2.labels_

In [6]:
km_4 = KMeans(n_clusters=4, random_state=42)
km_4.fit(X_sc)

KMeans(n_clusters=4, random_state=42)

In [7]:
df['clusterx4'] = km_4.labels_

In [8]:
df.to_csv('../data/cleaned_data/kahuna_clusters.csv')

-------------------------

### Modeling Classification (Entity, Year)

In [9]:
# drop Entity and Year column
X = df.drop(columns=['Entity', 'Year'])

# set y = entity and label encode
y = df['Entity']
le = LabelEncoder()
y = le.fit_transform(y)

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=24)

In [10]:
# scale data
ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

#### Logistic Regression

In [11]:
# model for entity
logreg = LogisticRegression(random_state=42, max_iter=500)

logreg.fit(X_train_sc, y_train)
logiregr = ['logreg', logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)]
class_list.append(logiregr)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.969545957918051, 0.9601990049751243)

In [12]:
# model for year
X = df.drop(columns=['Entity', 'Year'])
y = df['Year']
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

logreg = LogisticRegression(random_state=42, max_iter=500)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.09191583610188261, 0.05638474295190713)

### RFC

In [13]:
# model for entity 
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
ranfor = ['rfc', rfc.score(X_train, y_train), rfc.score(X_test, y_test)]
class_list.append(ranfor)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 0.001658374792703151)

In [14]:
# model for year
y = df['Year']
le = LabelEncoder()
y = le.fit_transform(y)

rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 0.001658374792703151)

In [251]:
class_table = pd.DataFrame(class_list, columns=['Model', 'Train', 'Test'])

In [254]:
class_table

Unnamed: 0,Model,Train,Test
0,logreg,0.976744,0.970149
1,rfc,1.0,0.998342


### Linear Regression for all features

In [15]:
# linear model for year
X = df.drop(columns=['Entity', 'Year'])
y = df['Year']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

lr = LinearRegression()

lr.fit(X_train_sc, y_train)
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

(0.15427894203502657, 0.16626836037207637)

In [16]:
lr_list = []
for each in df.drop(columns=['Entity', 'Year']).columns:
    X = df.drop(columns=['Entity', 'Year', each])
    y = df[each]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    # scale data
    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lr = LinearRegression()
    
    lr.fit(X_train_sc, y_train)
    n = [each, lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)]
    lr_list.append(n)
    print(f'lr of {each} = {lr.score(X_train_sc, y_train)}, {lr.score(X_test_sc, y_test)}')

lr of Deaths % = 0.8478493248767202, 0.8438292736590294
lr of CO2 Emissions = 0.9968284002763721, 0.9942443862826758
lr of CO2 Emissions Per Cap = 0.7523298444827954, 0.7455938765298069
lr of Food Emissions = 0.9399067613574666, 0.9326619048382266
lr of Total GHG = 0.9941656129588595, 0.9954734013354128
lr of Consumption of Ozone = 0.6037682556220387, 0.5774918543609984
lr of Shared CO2 Emissions = 0.9893337023827512, 0.9908065181656995
lr of Transport = 0.9860563651112924, 0.9665303986983664
lr of Death_rate_ambient_ozone_pol = 0.5772885812808108, 0.6055922170966381
lr of Death_rate_household_air_pol = 0.9233304643692063, 0.930303557511951
lr of Death_rate_all_causes = 0.7369886517958553, 0.7121029496611306
lr of Death_actual_ozone = 1.0, 1.0
lr of Death_actual_particulate = 1.0, 1.0
lr of Death_actual_household = 0.9715809090521157, 0.9780723294744573
lr of Death_under5 = 1.0, 1.0
lr of Death_5-14 = 1.0, 1.0
lr of Death_50-69 = 1.0, 1.0
lr of Death_70+ = 1.0, 1.0
lr of Death_15-49 = 

In [17]:
lr_table = pd.DataFrame(lr_list, columns=['Predictor', 'Train', 'Test'])

In [18]:
lr_table.T.drop(columns=[11, 12, 14, 15, 16, 17, 18]).T.sort_values(by='Test', ascending=False)

Unnamed: 0,Predictor,Train,Test
4,Total GHG,0.994166,0.995473
1,CO2 Emissions,0.996828,0.994244
6,Shared CO2 Emissions,0.989334,0.990807
24,clusterx2,0.979883,0.981801
13,Death_actual_household,0.971581,0.978072
7,Transport,0.986056,0.96653
3,Food Emissions,0.939907,0.932662
9,Death_rate_household_air_pol,0.92333,0.930304
0,Deaths %,0.847849,0.843829
20,Child Mortality,0.829716,0.83485


In [19]:
cancer = pd.read_csv('../data/cleaned_data/cancer-incidence.csv')

In [20]:
cancer.rename(columns={'Incidence - Neoplasms - Sex: Both - Age: Age-standardized (Rate)' : 'Cancer_Incidence'}, inplace=True)

In [21]:
cancer = cancer.drop(columns='Code')

In [22]:
cancer = cancer[cancer['Year']<=2014]

In [23]:
df = df.merge(cancer, on=['Entity', 'Year'])

In [24]:
df.to_csv('../data/cleaned_data/kahuna_clusters_cancer')

In [207]:
for each in kahuna.drop(columns=['Entity', 'Year', 'clusterx2', 'clusterx4', 'Death_rate_ambient_ozone_pol']).columns:
    X = kahuna.drop(columns=['Entity', 'Year', 'clusterx2', 'clusterx4', each])
    y = kahuna[each]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    # scale data
    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lr = LinearRegression()
    
    lr_each.fit(X_train_sc, y_train)
    print(f'lr of {each} = {lr_each.score(X_train_sc, y_train)}, {lr_each.score(X_test_sc, y_test)}')

lr of Deaths % = 0.8716562191520321, 0.8567322448758011
lr of CO2 Emissions = 0.9968081264652657, 0.9945468498821548
lr of CO2 Emissions Per Cap = 0.754423805500311, 0.7469525334750731
lr of Food Emissions = 0.9407673782125073, 0.9316752211978517
lr of Total GHG = 0.9942444251942821, 0.9955435649402853
lr of Consumption of Ozone = 0.6124460486741606, 0.5395352564406828
lr of Shared CO2 Emissions = 0.9891531003527557, 0.9906501377849518
lr of Transport = 0.9890096438713155, 0.978879303040661
lr of Death_rate_household_air_pol = 0.9291674192039528, 0.9337958437122305
lr of Death_rate_all_causes = 0.7377652887033654, 0.709095667782081
lr of Death_actual_ozone = 1.0, 1.0
lr of Death_actual_particulate = 1.0, 1.0
lr of Death_actual_household = 0.969916848966614, 0.9762501015911581
lr of Death_under5 = 1.0, 1.0
lr of Death_5-14 = 1.0, 1.0
lr of Death_50-69 = 1.0, 1.0
lr of Death_70+ = 1.0, 1.0
lr of Death_15-49 = 1.0, 1.0
lr of Urban% = 0.7437850075570294, 0.729705080735542
lr of Child Morta

In [255]:
kahuna

Unnamed: 0,Entity,Year,Deaths %,CO2 Emissions,CO2 Emissions Per Cap,Food Emissions,Total GHG,Consumption of Ozone,Shared CO2 Emissions,Transport,...,Death_70+,Death_15-49,Urban%,Child Mortality,Population,GDP,Forest area,clusterx2,clusterx4,Cancer_Incidence
0,Albania,1991,17.613636,4228256,1.2889,4011691.0,9030000.0,0.00,0.02,500000.0,...,512.126572,60.714220,36.700,3.93,119.225912,3496.580246,786850.0,0,2,190.046659
1,Albania,1995,16.789055,2066496,0.6638,4861321.1,7480000.0,43.40,0.01,600000.0,...,469.406670,49.692924,38.911,3.39,116.342482,4471.871070,779050.0,0,2,183.178817
2,Albania,1996,16.588451,1993216,0.6432,4569702.9,7280000.0,43.20,0.01,600000.0,...,491.357876,52.069689,39.473,3.26,115.621642,4909.228105,777100.0,0,2,187.578978
3,Albania,1997,15.081361,1531552,0.4941,4431656.7,6510000.0,45.20,0.01,500000.0,...,521.197486,57.988035,40.035,3.13,114.900766,4400.577827,775150.0,0,2,191.441640
4,Albania,1998,15.965796,1744064,0.5607,4344061.6,6810000.0,49.60,0.01,700000.0,...,547.315668,64.973478,40.601,2.99,114.179927,4819.387534,773200.0,0,2,195.033589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2404,Zimbabwe,2010,9.450169,7878316,0.6205,55244889.2,66540000.0,29.30,0.02,1300000.0,...,684.000846,253.211851,33.196,8.76,32.823389,2458.220626,17905280.0,0,2,224.459639
2405,Zimbabwe,2011,10.053401,9743983,0.7557,53985614.6,67420000.0,22.22,0.03,2300000.0,...,687.415400,252.443341,33.015,8.14,33.331583,2764.338916,17859210.0,0,2,216.792521
2406,Zimbabwe,2012,10.455091,7882889,0.6011,54211027.7,67630000.0,16.20,0.02,2600000.0,...,711.735517,258.678699,32.834,7.38,33.902414,3170.726522,17813140.0,0,2,210.917010
2407,Zimbabwe,2013,10.691584,11836254,0.8866,54321218.5,67550000.0,15.76,0.03,2900000.0,...,740.153224,266.910947,32.654,6.78,34.510477,3176.829330,17767070.0,0,2,206.449353


In [256]:
kahuna.columns

Index(['Entity', 'Year', 'Deaths %', 'CO2 Emissions', 'CO2 Emissions Per Cap',
       'Food Emissions', 'Total GHG', 'Consumption of Ozone',
       'Shared CO2 Emissions', 'Transport', 'Death_rate_ambient_ozone_pol',
       'Death_rate_household_air_pol', 'Death_rate_all_causes',
       'Death_actual_ozone', 'Death_actual_particulate',
       'Death_actual_household', 'Death_under5', 'Death_5-14', 'Death_50-69',
       'Death_70+', 'Death_15-49', 'Urban%', 'Child Mortality', 'Population',
       'GDP', 'Forest area', 'clusterx2', 'clusterx4', 'Cancer_Incidence'],
      dtype='object')