### Imports

In [4]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [28]:
# read in dataframe
df = pd.read_csv('../data/cleaned_data/kahuna.csv')

### KMeans Preprocessing

In [29]:
# set X and y
X = df.drop(columns=['Entity', 'Year'])
y = df['Entity']

# scale X
ss = StandardScaler()
X_sc = ss.fit_transform(X)

In [30]:
# adding k=3 cluster column
km = KMeans(n_clusters=3)
km.fit(X)
kahuna['clusterx3'] = km.labels_

In [31]:
# adding k=10 cluster column
km = KMeans(n_clusters=10)
km.fit(X)
kahuna['clusterx10'] = km.labels_

### Classification (Entity)

In [32]:
X = df.drop(columns=['Entity', 'Year'])
y = df['Entity']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

#### RFC

In [33]:
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train_sc, y_train)
rfc.score(X_train_sc, y_train), rfc.score(X_test_sc, y_test)

(1.0, 1.0)

#### Logistic Regression

In [34]:
logreg = LogisticRegression(random_state=42, max_iter=300)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.9778516057585825, 0.9701492537313433)

**Note**: Models do an excellent job of classifying each country--country has a distinct enought history

### Classification (Year)

In [35]:
X = df.drop(columns=['Entity', 'Year'])
y = df['Year']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

#### RFC

In [36]:
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train_sc, y_train)
rfc.score(X_train_sc, y_train), rfc.score(X_test_sc, y_test)

(1.0, 0.003316749585406302)

#### Logistic Regression

In [37]:
logreg = LogisticRegression(random_state=42, max_iter=300)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.09080841638981174, 0.03482587064676617)

### Linear Regression

In [38]:
X = df.drop(columns=['Entity', 'Year'])
y = df['Year']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [39]:
lr = LinearRegression()

lr.fit(X_train_sc, y_train)
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

(0.15193533628262434, 0.16291335497072834)

**Note**: Years are not distinct classified or regressed

#### All features

In [65]:
# model for each feature
lr_list = []
for each in kahuna.drop(columns=['Entity', 'Year', 'clusterx3', 'clusterx10']).columns:
    X = kahuna.drop(columns=['Entity', 'Year', each])
    y = kahuna[each]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    
    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lr = LinearRegression()
    
    lr.fit(X_train_sc, y_train)
    n = [each, lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)]
    lr_list.append(n)

# table for models    
lr_table = pd.DataFrame(lr_list, columns=['Predictor', 'Train', 'Test'])

In [68]:
lr_table.sort_values(by='Test', ascending=False)

Unnamed: 0,Predictor,Train,Test
12,Death_actual_particulate,1.0,1.0
14,Death_under5,1.0,1.0
18,Death_15-49,1.0,1.0
17,Death_70+,1.0,1.0
16,Death_50-69,1.0,1.0
15,Death_5-14,1.0,1.0
11,Death_actual_ozone,1.0,1.0
4,Total GHG,0.99411,0.995465
1,CO2 Emissions,0.996874,0.994284
6,Shared CO2 Emissions,0.989207,0.990835


In [74]:
# read in cancer dataframe
cancer = pd.read_csv('../data/dirty_data/cancer-incidence.csv')

# quick clean of cancer dataframe
cancer = cancer.drop(columns=['Code'])
cancer = cancer[cancer['Year']<=2014]
cancer.rename(columns={'Incidence - Neoplasms - Sex: Both - Age: Age-standardized (Rate)': 'Cancer_incidence'}, inplace=True)

# merge cancer and kahuna
kahuna_cancer = kahuna.merge(cancer, on=['Entity', 'Year'])

In [76]:
# Prediction for cancer incidence
X = kahuna_cancer.drop(columns=['Entity', 'Year', 'Cancer_incidence'])
y = kahuna_cancer['Cancer_incidence']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

lr = LinearRegression()

lr.fit(X_train_sc, y_train)
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

(0.6762242020431626, 0.6767854203724735)