In [1]:
# Imports
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# reading in final dataframe
df = pd.read_csv('../data/cleaned_data/kahuna.csv')

-------------------------

## KMeans Clustering Preprocessing

In [3]:
X = df.drop(columns=['Entity', 'Year'])
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [4]:
km_2 = KMeans(n_clusters=2, random_state=42)
km_2.fit(X_sc)

KMeans(n_clusters=2, random_state=42)

In [5]:
df['clusterx2'] = km_2.labels_

In [6]:
km_4 = KMeans(n_clusters=4, random_state=42)
km_4.fit(X_sc)

KMeans(n_clusters=4, random_state=42)

In [7]:
df['clusterx4'] = km_4.labels_

In [8]:
df.to_csv('../data/cleaned_data/kahuna_clusters.csv')

-------------------------

### Modeling Classification (Entity, Year)

In [9]:
# drop Entity and Year column
X = df.drop(columns=['Entity', 'Year'])

# set y = entity and label encode
y = df['Entity']
le = LabelEncoder()
y = le.fit_transform(y)

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=24)

In [10]:
# scale data
ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

#### Logistic Regression

In [11]:
# model for entity
logreg = LogisticRegression(random_state=42, max_iter=500)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.969545957918051, 0.9601990049751243)

In [12]:
# model for year
X = df.drop(columns=['Entity', 'Year'])
y = df['Year']
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

logreg = LogisticRegression(random_state=42, max_iter=500)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.09191583610188261, 0.05638474295190713)

### RFC

In [13]:
# model for entity 
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 0.001658374792703151)

In [14]:
# model for year
y = df['Year']
le = LabelEncoder()
y = le.fit_transform(y)

rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 0.001658374792703151)

### Linear Regression for all features

In [15]:
# linear model for year
X = df.drop(columns=['Entity', 'Year'])
y = df['Year']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

lr = LinearRegression()

lr.fit(X_train_sc, y_train)
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

(0.15427894203502657, 0.16626836037207637)

In [16]:
lr_list = []
for each in df.drop(columns=['Entity', 'Year']).columns:
    X = df.drop(columns=['Entity', 'Year', each])
    y = df[each]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    # scale data
    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lr = LinearRegression()
    
    lr.fit(X_train_sc, y_train)
    n = [each, lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)]
    lr_list.append(n)
    print(f'lr of {each} = {lr.score(X_train_sc, y_train)}, {lr.score(X_test_sc, y_test)}')

lr of Deaths % = 0.8478493248767202, 0.8438292736590294
lr of CO2 Emissions = 0.9968284002763721, 0.9942443862826758
lr of CO2 Emissions Per Cap = 0.7523298444827954, 0.7455938765298069
lr of Food Emissions = 0.9399067613574666, 0.9326619048382266
lr of Total GHG = 0.9941656129588595, 0.9954734013354128
lr of Consumption of Ozone = 0.6037682556220387, 0.5774918543609984
lr of Shared CO2 Emissions = 0.9893337023827512, 0.9908065181656995
lr of Transport = 0.9860563651112924, 0.9665303986983664
lr of Death_rate_ambient_ozone_pol = 0.5772885812808108, 0.6055922170966381
lr of Death_rate_household_air_pol = 0.9233304643692063, 0.930303557511951
lr of Death_rate_all_causes = 0.7369886517958553, 0.7121029496611306
lr of Death_actual_ozone = 1.0, 1.0
lr of Death_actual_particulate = 1.0, 1.0
lr of Death_actual_household = 0.9715809090521157, 0.9780723294744573
lr of Death_under5 = 1.0, 1.0
lr of Death_5-14 = 1.0, 1.0
lr of Death_50-69 = 1.0, 1.0
lr of Death_70+ = 1.0, 1.0
lr of Death_15-49 = 

In [17]:
lr_table = pd.DataFrame(lr_list, columns=['Predictor', 'Train', 'Test'])

In [18]:
lr_table.T.drop(columns=[11, 12, 14, 15, 16, 17, 18]).T.sort_values(by='Test', ascending=False)

Unnamed: 0,Predictor,Train,Test
4,Total GHG,0.994166,0.995473
1,CO2 Emissions,0.996828,0.994244
6,Shared CO2 Emissions,0.989334,0.990807
24,clusterx2,0.979883,0.981801
13,Death_actual_household,0.971581,0.978072
7,Transport,0.986056,0.96653
3,Food Emissions,0.939907,0.932662
9,Death_rate_household_air_pol,0.92333,0.930304
0,Deaths %,0.847849,0.843829
20,Child Mortality,0.829716,0.83485


In [19]:
cancer = pd.read_csv('../data/cleaned_data/cancer-incidence.csv')

In [20]:
cancer.rename(columns={'Incidence - Neoplasms - Sex: Both - Age: Age-standardized (Rate)' : 'Cancer_Incidence'}, inplace=True)

In [21]:
cancer = cancer.drop(columns='Code')

In [22]:
cancer = cancer[cancer['Year']<=2014]

In [23]:
df = df.merge(cancer, on=['Entity', 'Year'])

In [24]:
df.to_csv('../data/cleaned_data/kahuna_clusters_cancer')