In [31]:
# Imports
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# reading in final dataframe
kahuna = pd.read_csv('../data/added_data/kahuna.csv')

-------------------------

## KMeans Clustering Preprocessing

In [3]:
X = kahuna.drop(columns=['Entity', 'Year'])
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [4]:
km_3 = KMeans(n_clusters=3, random_state=42)
km_3.fit(X_sc)

KMeans(n_clusters=3, random_state=42)

In [5]:
kahuna['clusterx3'] = km_3.labels_

In [6]:
km_10 = KMeans(n_clusters=10, random_state=42)
km_10.fit(X_sc)

KMeans(n_clusters=10, random_state=42)

In [7]:
kahuna['clusterx10'] = km_10.labels_

In [137]:
kahuna.to_csv('../data/new_cleaned/kahuna_clusters.csv')

-------------------------

### Modeling Classification (Entity, Year)

In [64]:
features_no_clust = kahuna.drop(columns=['clusterx3', 'clusterx10'])
features_clustx3 = kahuna.drop(columns=['clusterx10'])
features_clustx10 =  kahuna.drop(columns=['clusterx3'])
features_all = kahuna
features_list = [features_no_clust, features_clustx3, features_clustx10, features_all]

In [81]:
# drop Entity and Year column
X = kahuna.drop(columns=['Entity', 'Year'])

# set y = entity and label encode
y = kahuna['Entity']
le = LabelEncoder()
y = le.fit_transform(y)

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=24)

In [85]:
# scale data
ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

#### Logistic Regression

In [83]:
# model for entity
logreg = LogisticRegression(random_state=42, max_iter=500)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.9706533776301218, 0.9502487562189055)

In [86]:
# model for year
X = kahuna.drop(columns=['Entity', 'Year'])
y = kahuna['Year']
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

logreg = LogisticRegression(random_state=42, max_iter=500)

logreg.fit(X_train_sc, y_train)
logreg.score(X_train_sc, y_train), logreg.score(X_test_sc, y_test)

(0.09966777408637874, 0.05472636815920398)

#### RFC

In [84]:
# model for entity 
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 0.9983416252072969)

In [87]:
# model for year
y = kahuna['Year']
le = LabelEncoder()
y = le.fit_transform(y)

rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 0.004975124378109453)

### Linear Regression for all features

In [133]:
# linear model for year
X = kahuna.drop(columns=['Entity', 'Year'])
y = kahuna['Year']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

lr = LinearRegression()

lr.fit(X_train_sc, y_train)
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

(0.1536240340629449, 0.16799949610069154)

In [123]:
lr_table = pd.DataFrame(columns=['Predictor', 'Train', 'Test'])
for each in kahuna.drop(columns=['Entity', 'Year']).columns:
    X = kahuna.drop(columns=['Entity', 'Year', each])
    y = kahuna[each]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    # scale data
    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lr = LinearRegression()
    
    lr.fit(X_train_sc, y_train)
    n = [{'Predictor': each, 'Train': lr.score(X_train_sc, y_train) , 'Test': lr.score(X_test_sc, y_test)}]
    lr_table.append(n)
    print(f'lr of {each} = {lr.score(X_train_sc, y_train)}, {lr.score(X_test_sc, y_test)}')

lr of Deaths % = 0.8523962833549201, 0.8521541354463075
lr of CO2 Emissions = 0.9969166327693822, 0.9946994888629878
lr of CO2 Emissions Per Cap = 0.7600606721522465, 0.7460238003898974
lr of Food Emissions = 0.9411246514398702, 0.934143555325361
lr of Total GHG = 0.994218779393425, 0.995570559535171
lr of Consumption of Ozone = 0.6204672315432372, 0.5273364752747816
lr of Shared CO2 Emissions = 0.989121368588543, 0.9907535757339402
lr of Transport = 0.9914862787391939, 0.9843195435630134
lr of Death_rate_ambient_ozone_pol = 0.5846909430655097, 0.6136773967985694
lr of Death_rate_household_air_pol = 0.9265187773591518, 0.926602603755438
lr of Death_rate_all_causes = 0.739499900834181, 0.7243395646254711
lr of Death_actual_ozone = 1.0, 1.0
lr of Death_actual_particulate = 1.0, 1.0
lr of Death_actual_household = 0.9693547615755085, 0.974721157175885
lr of Death_under5 = 1.0, 1.0
lr of Death_5-14 = 1.0, 1.0
lr of Death_50-69 = 1.0, 1.0
lr of Death_70+ = 1.0, 1.0
lr of Death_15-49 = 1.0, 1

In [130]:
lr_table

Unnamed: 0,Predictor,Train,Test


In [125]:
lr_table

Unnamed: 0,Predictor,Train,Test


In [126]:
lr_table.append([{'Predictor': 'something', 'Train': 1 , 'Test': 2}], ignore_index=True)

Unnamed: 0,Predictor,Train,Test
0,something,1,2


In [96]:
for each in kahuna.drop(columns=['Entity', 'Year', 'clusterx3', 'clusterx10']).columns:
    X = kahuna.drop(columns=['Entity', 'Year', 'clusterx3', 'clusterx10', each])
    y = kahuna[each]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    # scale data
    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lr = LinearRegression()
    
    lr_each.fit(X_train_sc, y_train)
    print(f'lr of {each} = {lr_each.score(X_train_sc, y_train)}, {lr_each.score(X_test_sc, y_test)}')

lr of Deaths % = 0.8392082210294383, 0.842218306409522
lr of CO2 Emissions = 0.9967503027134128, 0.994255329433339
lr of CO2 Emissions Per Cap = 0.7515585164761336, 0.7425661262645378
lr of Food Emissions = 0.939873177951413, 0.9305604840067169
lr of Total GHG = 0.9940766039692084, 0.9954800997849924
lr of Consumption of Ozone = 0.6035635842570424, 0.5746695755341524
lr of Shared CO2 Emissions = 0.9891215591542588, 0.9907569742619967
lr of Transport = 0.9855188011173645, 0.9662653119607584
lr of Death_rate_ambient_ozone_pol = 0.5528228422709381, 0.5929409611578019
lr of Death_rate_household_air_pol = 0.9121877572968468, 0.9177223566220489
lr of Death_rate_all_causes = 0.728804171661086, 0.7143583896180135
lr of Death_actual_ozone = 1.0, 1.0
lr of Death_actual_particulate = 1.0, 1.0
lr of Death_actual_household = 0.9690915780534284, 0.9741851718418292
lr of Death_under5 = 1.0, 1.0
lr of Death_5-14 = 1.0, 1.0
lr of Death_50-69 = 1.0, 1.0
lr of Death_70+ = 1.0, 1.0
lr of Death_15-49 = 1.0