### Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [2]:
# Load the source CSV into a DataFrame (relative path).
data_path = '../../data/added_data/kahuna.csv'
df = pd.read_csv(data_path)

In [3]:
# Feature matrix without the 'Entity' column
X = df.drop(columns=['Entity'])

# Alternative matrix that keeps 'Entity' as dummy columns (first level dropped)
X_dum = pd.get_dummies(data=df, columns=['Entity'], drop_first=True)

# Standardize X; the fitted scaler stays available as `ss`
ss = StandardScaler().fit(X)
X_sc = ss.transform(X)



### DBSCAN

In [4]:
# Fit DBSCAN with its default arguments on the standardized features.
# `fit` hands back the estimator itself, so `X_dbs` and `dbs` name the same fitted object.
X_dbs = dbs = DBSCAN().fit(X_sc)

In [5]:
# Silhouette score of the default-argument DBSCAN labels on the scaled data.
silhouette_score(X_sc, labels=dbs.labels_)

0.04930086807658926

In [6]:
def find_best_silhouette(df, eps_values=None, min_samples_values=None):
    """Grid-search DBSCAN's ``eps`` and ``min_samples`` by silhouette score.

    The input is standardized with a fresh ``StandardScaler``. DBSCAN is then
    fit once per (eps, min_samples) pair, and the pair with the highest
    silhouette score is reported with ``print``.

    Parameters
    ----------
    df : DataFrame or 2-D array of numeric features
        Unscaled data; scaling happens inside this function.
    eps_values : iterable of float, optional
        Candidate ``eps`` values. Defaults to ``np.linspace(.2, 5, 30)``.
    min_samples_values : iterable of int, optional
        Candidate ``min_samples`` values. Defaults to ``range(2, 10)``.

    Returns
    -------
    None
        The best score and parameters are printed, not returned.
    """
    # Materialize caller-supplied grids so the inner one can be re-iterated per eps.
    eps_grid = np.linspace(.2, 5, 30) if eps_values is None else list(eps_values)
    min_samples_grid = range(2, 10) if min_samples_values is None else list(min_samples_values)

    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    max_score = float('-inf')
    best_eps = None
    best_minsamples = None

    for epsilon in eps_grid:
        for minsamples in min_samples_grid:
            dbscan = DBSCAN(eps=epsilon, min_samples=minsamples, n_jobs=-1)
            dbscan.fit(df_scaled)
            labels = dbscan.labels_

            # silhouette_score needs between 2 and n_samples - 1 distinct labels;
            # skip fits outside that range instead of letting it raise.
            # The labels are scored as-is, so any -1 entries count as one more group.
            n_labels = len(set(labels))
            if not 1 < n_labels < len(labels):
                continue

            score = silhouette_score(df_scaled, labels)
            if score > max_score:
                max_score = score
                best_eps = epsilon
                best_minsamples = minsamples

    # Without this guard the prints below would hit unbound names when
    # no grid point produced a scorable labeling.
    if best_eps is None:
        print('No eps/min_samples combination produced a labeling that can be scored')
        return

    print(f'Best silhouette score was {max_score}')
    print(f'Best eps was {best_eps}')
    print(f'Best min_samples was {best_minsamples}')
    return

# h/t Eric 

In [7]:
find_best_silhouette(X)

Best silhouette score was 0.6102005009632955
Best eps was 4.83448275862069
Best min_samples was 7


In [8]:
# Attach the labels from `dbs` (the DBSCAN() fitted with default arguments above),
# not from the eps/min_samples reported by find_best_silhouette.
# NOTE(review): this adds a column to X in place, so any later cell that passes X on
# (e.g. find_best_silhouette(X)) also sees 'clusters' as a feature — confirm that is intended.
X['clusters'] = dbs.labels_

### DBS w/Entity

In [9]:
# NOTE(review): same code and same input (X_sc) as the first DBSCAN cell. The section
# heading mentions Entity, but X_dum (the frame with Entity dummies) is not used here — confirm.
dbs = DBSCAN()
X_dbs = dbs.fit(X_sc) 

In [10]:
# Same inputs as the earlier silhouette cell (X_sc and a default DBSCAN fit).
silhouette_score(X_sc, X_dbs.labels_)

0.04930086807658926

In [11]:
# NOTE(review): this notebook defines find_best_silhouette twice; when the cells run in
# order, this later definition replaces the earlier one.
def find_best_silhouette(df, eps_values=None, min_samples_values=None):
    """Grid-search DBSCAN's ``eps`` and ``min_samples`` by silhouette score.

    The input is standardized with a fresh ``StandardScaler``. DBSCAN is then
    fit once per (eps, min_samples) pair, and the pair with the highest
    silhouette score is reported with ``print``.

    Parameters
    ----------
    df : DataFrame or 2-D array of numeric features
        Unscaled data; scaling happens inside this function.
    eps_values : iterable of float, optional
        Candidate ``eps`` values. Defaults to ``np.linspace(.2, 5, 30)``.
    min_samples_values : iterable of int, optional
        Candidate ``min_samples`` values. Defaults to ``range(2, 10)``.

    Returns
    -------
    None
        The best score and parameters are printed, not returned.
    """
    # Materialize caller-supplied grids so the inner one can be re-iterated per eps.
    eps_grid = np.linspace(.2, 5, 30) if eps_values is None else list(eps_values)
    min_samples_grid = range(2, 10) if min_samples_values is None else list(min_samples_values)

    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    max_score = float('-inf')
    best_eps = None
    best_minsamples = None

    for epsilon in eps_grid:
        for minsamples in min_samples_grid:
            dbscan = DBSCAN(eps=epsilon, min_samples=minsamples, n_jobs=-1)
            dbscan.fit(df_scaled)
            labels = dbscan.labels_

            # silhouette_score needs between 2 and n_samples - 1 distinct labels;
            # skip fits outside that range instead of letting it raise.
            # The labels are scored as-is, so any -1 entries count as one more group.
            n_labels = len(set(labels))
            if not 1 < n_labels < len(labels):
                continue

            score = silhouette_score(df_scaled, labels)
            if score > max_score:
                max_score = score
                best_eps = epsilon
                best_minsamples = minsamples

    # Without this guard the prints below would hit unbound names when
    # no grid point produced a scorable labeling.
    if best_eps is None:
        print('No eps/min_samples combination produced a labeling that can be scored')
        return

    print(f'Best silhouette score was {max_score}')
    print(f'Best eps was {best_eps}')
    print(f'Best min_samples was {best_minsamples}')
    return

# h/t Eric 

In [12]:
# NOTE(review): X now carries the 'clusters' column assigned a few cells above, so the search
# standardizes and clusters on those labels as well; that may be why this score differs from
# the first run. The section heading suggests X_dum was meant here — confirm.
find_best_silhouette(X)

Best silhouette score was 0.592541144352764
Best eps was 4.83448275862069
Best min_samples was 7


In [13]:
# Re-assign the labels from the DBSCAN() refit above (default arguments, fit on X_sc).
X['clusters'] = dbs.labels_

In [14]:
X.shape

(2409, 26)

In [15]:
X['clusters'].value_counts()

-1     449
 29     73
 46     59
 49     49
 0      40
      ... 
 36      5
 43      5
 41      5
 16      5
 61      5
Name: clusters, Length: 125, dtype: int64

In [16]:
# Number of distinct labels in the 'clusters' column (the -1 label is counted too).
X['clusters'].nunique()

125

In [17]:
X

Unnamed: 0,Year,Deaths %,CO2 Emissions,CO2 Emissions Per Cap,Food Emissions,Total GHG,Consumption of Ozone,Shared CO2 Emissions,Transport,Death_rate_ambient_ozone_pol,...,Death_5-14,Death_50-69,Death_70+,Death_15-49,Urban%,Child Mortality,Population,GDP,Forest area,clusters
0,1991,17.613636,4228256,1.2889,4011691.0,9030000.0,0.00,0.02,500000.0,4.360865,...,5.975427,271.661092,512.126572,60.714220,36.700,3.93,119.225912,3496.580246,786850.0,-1
1,1995,16.789055,2066496,0.6638,4861321.1,7480000.0,43.40,0.01,600000.0,3.420271,...,4.580403,240.740486,469.406670,49.692924,38.911,3.39,116.342482,4471.871070,779050.0,0
2,1996,16.588451,1993216,0.6432,4569702.9,7280000.0,43.20,0.01,600000.0,3.079414,...,4.373679,255.560407,491.357876,52.069689,39.473,3.26,115.621642,4909.228105,777100.0,0
3,1997,15.081361,1531552,0.4941,4431656.7,6510000.0,45.20,0.01,500000.0,3.023884,...,4.414006,269.448518,521.197486,57.988035,40.035,3.13,114.900766,4400.577827,775150.0,0
4,1998,15.965796,1744064,0.5607,4344061.6,6810000.0,49.60,0.01,700000.0,2.600204,...,4.493629,280.472022,547.315668,64.973478,40.601,2.99,114.179927,4819.387534,773200.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2404,2010,9.450169,7878316,0.6205,55244889.2,66540000.0,29.30,0.02,1300000.0,1.052172,...,13.582136,735.443275,684.000846,253.211851,33.196,8.76,32.823389,2458.220626,17905280.0,123
2405,2011,10.053401,9743983,0.7557,53985614.6,67420000.0,22.22,0.03,2300000.0,1.224476,...,15.022895,750.211216,687.415400,252.443341,33.015,8.14,33.331583,2764.338916,17859210.0,123
2406,2012,10.455091,7882889,0.6011,54211027.7,67630000.0,16.20,0.02,2600000.0,1.425288,...,16.043625,783.503715,711.735517,258.678699,32.834,7.38,33.902414,3170.726522,17813140.0,123
2407,2013,10.691584,11836254,0.8866,54321218.5,67550000.0,15.76,0.03,2900000.0,1.491373,...,15.101173,824.426468,740.153224,266.910947,32.654,6.78,34.510477,3176.829330,17767070.0,123


In [18]:
# Per-label mean of every column, transposed so the labels run across the columns.
cluster_means = X.groupby('clusters').mean()
cluster_means.T

clusters,-1,0,1,2,3,4,5,6,7,8,...,114,115,116,117,118,119,120,121,122,123
Year,2001.653,2004.0,2002.957,2002.0,2004.5,2000.0,1992.0,2004.5,2001.0,2003.5,...,2001.5,2003.5,2005.0,1996.0,2002.5,2004.0,2004.0,1995.5,2011.0,2003.0
Deaths %,12.59396,13.49183,11.00485,4.78448,12.99573,1.628815,6.24737,12.29549,20.64377,9.207737,...,10.1819,10.92785,8.488323,13.77986,3.115833,15.44146,15.50159,9.900545,11.02352,7.997639
CO2 Emissions,814017100.0,4547692.0,101880800.0,151925100.0,4145868.0,348450200.0,61961990.0,32052830.0,31174100.0,60631830.0,...,241889600.0,45935310.0,309247700.0,73183530.0,5917661.0,115893700.0,88231480.0,2177756.0,3634941.0,11754580.0
CO2 Emissions Per Cap,10.04409,1.227175,3.117365,4.008026,1.39251,18.09986,7.91222,3.755056,0.2353474,6.252959,...,3.685037,9.604032,6.547912,28.80786,1.78845,4.447395,1.034667,0.23845,0.2559429,0.9842667
Food Emissions,420151900.0,5271523.0,38171940.0,331715200.0,2684291.0,265159300.0,22515670.0,13477700.0,149545600.0,55718870.0,...,89074000.0,18894800.0,89081290.0,9544114.0,32483110.0,47365050.0,123372600.0,407499800.0,428711600.0,54218420.0
Total GHG,1123673000.0,9846500.0,140027000.0,404022600.0,7356500.0,583004300.0,64146000.0,63031670.0,140496300.0,70563180.0,...,257660000.0,81950450.0,393755900.0,95128570.0,15386670.0,215648400.0,154099000.0,445511700.0,448101400.0,67302860.0
Consumption of Ozone,9414.056,34.8575,1124.284,3696.615,84.458,1893.329,1450.66,92.35056,417.8105,197.8173,...,2242.615,28.56909,433.1282,754.3,147.2892,47.40316,420.4605,46.29167,7.06,444.2895
Shared CO2 Emissions,2.939621,0.01475,0.3604348,0.5473913,0.0125,1.317143,0.272,0.1116667,0.1142105,0.2177273,...,0.8641667,0.1577273,1.072941,0.3085714,0.02125,0.4115789,0.29,0.01,0.01,0.04333333
Transport,141163300.0,1760000.0,23856520.0,39895650.0,750000.0,73761900.0,14860000.0,3972222.0,4031579.0,8595455.0,...,37558330.0,7345455.0,32205880.0,13071430.0,2529167.0,9089474.0,18223810.0,766666.7,785714.3,1823810.0
Death_rate_ambient_ozone_pol,4.951276,1.515195,2.677674,0.7548685,3.431414,0.1966187,1.394745,1.86525,10.26687,1.541341,...,4.243424,1.827053,2.027388,6.599798,0.5055387,3.417824,1.8225,1.630838,2.147589,0.9077106


In [19]:
# Share of rows labelled -1, computed from the data instead of hard-coded counts
# so it stays correct if the clustering or the dataset changes.
float((X['clusters'] == -1).mean())

0.18638439186384392

In [93]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

In [101]:
df.columns

Index(['Entity', 'Year', 'Deaths %', 'CO2 Emissions', 'CO2 Emissions Per Cap',
       'Food Emissions', 'Total GHG', 'Consumption of Ozone',
       'Shared CO2 Emissions', 'Transport', 'Death_rate_ambient_ozone_pol',
       'Death_rate_household_air_pol', 'Death_rate_all_causes',
       'Death_actual_ozone', 'Death_actual_particulate',
       'Death_actual_household', 'Death_under5', 'Death_5-14', 'Death_50-69',
       'Death_70+', 'Death_15-49', 'Urban%', 'Child Mortality', 'Population',
       'GDP', 'Forest area'],
      dtype='object')

In [90]:
# Target: the 'Entity' value of each row
y = df['Entity']

# Features: every column except the target, 'Year' and 'Forest area'
dropped_cols = ['Year', 'Forest area', 'Entity']
X = df.drop(columns=dropped_cols)

In [82]:
from sklearn.preprocessing import LabelEncoder

In [83]:
# Fit a label encoder on the target values in `y`.
le = LabelEncoder()
le.fit(y)

LabelEncoder()

In [84]:
# Encode the entity names with the fitted encoder. Transforming df['Entity'] rather than `y`
# itself keeps the cell re-runnable: feeding an already-encoded `y` back into `le` would fail
# because those codes are not among the labels the encoder was fit on.
y = le.transform(df['Entity'])

In [91]:
# Split features and target into train/test parts with a fixed seed.
split_parts = train_test_split(X, y, random_state=24)
X_train, X_test, y_train, y_test = split_parts

In [86]:
# Fit the scaler on the training split only, then apply it to both splits.
# NOTE(review): the execution counters show this cell (In [86]) ran before the cells that
# build X (In [90]) and split it (In [91]); rerun top to bottom before trusting the scores.
ss = StandardScaler().fit(X_train)

X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [87]:
# Linear-regression baseline on the standardized features; shows (train, test) scores.
# NOTE(review): y holds the LabelEncoder output for 'Entity', so this regresses on label
# codes rather than a measured quantity — confirm this baseline is intended.
lr = LinearRegression().fit(X_train_sc, y_train)

lr_train_score = lr.score(X_train_sc, y_train)
lr_test_score = lr.score(X_test_sc, y_test)
lr_train_score, lr_test_score

(0.16155425478382035, 0.14744168937799929)

In [88]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 

In [92]:
# Random forest on the unscaled features; shows (train, test) scores.
# The seed (24, as in the split and the logistic regression) makes the fit reproducible.
rfc = RandomForestClassifier(random_state=24)

rfc.fit(X_train, y_train)
rfc.score(X_train, y_train), rfc.score(X_test, y_test)

(1.0, 1.0)

In [95]:
# Logistic regression on the standardized features; shows (train, test) scores.
logreg = LogisticRegression(max_iter=1000, random_state=24)
logreg.fit(X_train_sc, y_train)

logreg_train_score = logreg.score(X_train_sc, y_train)
logreg_test_score = logreg.score(X_test_sc, y_test)
logreg_train_score, logreg_test_score

(0.96843853820598, 0.9485903814262023)

In [97]:
import pickle
# Persist the fitted logistic-regression model to disk.
# NOTE(review): `logreg` was fit on standardized features, but the fitted scaler `ss`
# is not saved here — confirm whoever loads the model can reproduce the scaling.
model_path = '../../models/logreg.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(logreg, model_file)