In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Load the per-gender health indicators and discard the index column that was
# written out to the CSV as 'Unnamed: 0'.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is made relative or configurable.
CSV_PATH = "/Users/jlh/global-health-map/data/separated_gender.csv"
df = pd.read_csv(CSV_PATH).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Location,Period,Gender,Life_Expectancy,Healthy_Life_Expectancy,%Mrt30_70_crdo_cncr_dbts_rsprtry,Suicide100000,Ttl_alc_percapita_15+,Posioning100000,Smoking15+
0,Albania,2015,Male,76.1,67.99,20.9,7.6,10.92,0.48,51.4
1,Albania,2015,Female,79.69,70.12,13.7,4.4,2.53,0.37,8.3
2,Albania,2010,Male,74.23,66.34,22.3,9.5,12.47,0.59,53.2
3,Albania,2010,Female,78.31,68.85,14.6,6.1,2.98,0.44,9.1
4,Albania,2000,Male,70.65,63.15,23.1,7.4,10.6,1.29,57.4


In [30]:
# The target is the Gender column; the features are every column that is left
# after removing the identifier columns and the target itself.
non_feature_columns = ['Location', 'Period', 'Gender']
y = df["Gender"]
X = df.drop(columns=non_feature_columns)
print(X.shape, y.shape)

(860, 7) (860,)


In [31]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [32]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
# The encoder is fitted exactly once, on the training labels. fit() replaces
# whatever classes were learned before, so fitting a second time on y_test
# would make the label -> integer mapping depend on the test split.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
# transform() raises ValueError if y_test holds a label never seen in training.
encoded_y_test = label_encoder.transform(y_test)

In [33]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
# Turn each integer-coded label vector into a one-hot matrix (one column per
# class), applying the same conversion to the train and test labels.
hot_y_train, hot_y_test = (
    to_categorical(encoded_labels)
    for encoded_labels in (encoded_y_train, encoded_y_test)
)
hot_y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [34]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same transformation to both splits so no test information leaks into it.
# NOTE(review): the model cells visible in this notebook fit on the unscaled
# X_train / X_test, so these scaled arrays appear unused - confirm.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [35]:
# Baseline model: a 150-tree random forest predicting gender from the health
# indicators. random_state is pinned so the score below and the feature
# importances derived from this model are reproducible on a fresh run.
# NOTE(review): the targets are the one-hot matrices, so scikit-learn treats
# this as a two-output problem and score() reports exact-row-match accuracy;
# the 1-D encoded_y_train / encoded_y_test would be the simpler target.
rf = RandomForestClassifier(n_estimators=150, random_state=42)
rf.fit(X_train, hot_y_train)  # fit() returns the estimator itself; no rebinding needed
rf.score(X_test, hot_y_test)

0.9627906976744186

In [36]:
# Column labels of the feature matrix X, kept so the fitted forest's
# importances can be paired with readable names.
feature_names = X.columns

In [37]:
# Pair every importance score with its column name and list the pairs from the
# most to the least important feature.
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.2901248060091213, 'Smoking15+'),
 (0.26051598199584186, 'Ttl_alc_percapita_15+'),
 (0.12761130343458044, 'Suicide100000'),
 (0.10914567967546203, 'Life_Expectancy'),
 (0.0846026196594382, '%Mrt30_70_crdo_cncr_dbts_rsprtry'),
 (0.06414910394545865, 'Healthy_Life_Expectancy'),
 (0.0638505052800975, 'Posioning100000')]

In [38]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search over forest size and the number of features tried at
# each split, scored with 5-fold cross-validation on the training split.
# 'auto' is intentionally not listed for max_features: for classifiers it was
# only an alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in
# 1.3, where it makes every fit that uses it fail.
param_grid = {
    'n_estimators': [75, 100, 30, 150],
    'max_features': ['sqrt', 'log2']
}
# GridSearchCV clones `rf` for every candidate, so the already-fitted baseline
# model is left untouched.
CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
CV_rf.fit(X_train, hot_y_train)
print(CV_rf.best_params_)
print(CV_rf.best_score_)

{'max_features': 'sqrt', 'n_estimators': 150}
0.9348837209302326


In [39]:
# Predict the held-out test rows with the grid-search object, which was built
# with the default refit setting and so predicts with its best estimator.
predictions = CV_rf.predict(X_test)

In [40]:
# Accuracy of the tuned model on the held-out test split, measured against the
# one-hot encoded test labels.
CV_rf.score(X_test, hot_y_test)

0.9674418604651163