In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline 
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve 

In [42]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
FEATURES = ['province','age','education', 'if_urban',
                 'wealth_index_code', 'if_own_house', 'if_own_land',
                 'if_employment', 'if_employment_current','employment_pay_method',
                 'partner_edu', 
                 'num_household', 'num_child','sex_head_household', 'sexual_activity', 'ideal_num_child', 'partner_ideal_child', 'money_decide_person']

TARGET_LST = ['if_emo_vio', 'if_phy_vio', 'if_sex_vio', 'if_vio', 'num_vio']

In [3]:
FILES = ['cleaned_data/cambodia_2014_cleaned.csv', 
         'cleaned_data/Maldives_2016_cleaned.csv', 
         'cleaned_data/Nepal_2016_cleaned.csv',
         'cleaned_data/Pakistan_2017_cleaned.csv',
         'cleaned_data/Philippines_2017_cleaned.csv']

# Pipeline

In [39]:
def split_data(features, target):
#     df.dropna(subset=[target_col],inplace=True)
#     df = fill_categorical_na_vals(df)
#     features = df[features_col]
#     target = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size=0.20, 
                                                    random_state=505)
    return X_train, X_test, y_train, y_test

In [None]:
def preprocess_data(df, features_col, target_col, categorical):
    df.dropna(subset=[target_col],inplace=True)
    df = fill_categorical_na_vals(df)
    features = df[features_col]
    pd.get_dummies(df, columns=col_lst)
    target = df[target_col]
    return features, target

In [44]:
def train_decision_tree(X_train, X_test, y_train, y_test):
    params = {'criterion': ['gini'],
                'max_depth': [3,5,7,9],
                'min_samples_split': [2,5,10]}
    grid_model = GridSearchCV(estimator=DecisionTreeClassifier(random_state=505), 
                              param_grid=params, 
                              cv=10,
                              return_train_score=True,
                              scoring=['f1', 'accuracy','precision','recall','roc_auc'],
                              refit='f1')

    grid_model.fit(X_train, y_train)

    grid_result = pd.DataFrame(grid_model.cv_results_)
    grid_result[['params','mean_train_f1','mean_train_accuracy', 'mean_train_precision','mean_train_recall','mean_train_roc_auc']]
    grid_result.sort_values(by=['mean_train_f1'], ascending=False)
    pd.set_option('max_colwidth',500)
#     print('Best model params: ', grid_result.loc[grid_result['mean_train_accuracy'] == max(grid_result['mean_train_accuracy'])]['params'])

In [6]:
def train_random_forest(X_train, X_test, y_train, y_test):
    params = {'n_estimators':[100, 1000],
              'criterion': ['gini', 'entropy'],
              'max_depth': [3,5,7,9],
              'min_samples_split': [2,5,10]}
    grid_model = GridSearchCV(estimator=RandomForestClassifier(random_state=505), 
                              param_grid=params, 
                              cv=10,
                              return_train_score=True,
                              scoring=['f1', 'accuracy','precision','recall','roc_auc'],
                              refit='f1')

    grid_model.fit(X_train, y_train)

    grid_result = pd.DataFrame(grid_model.cv_results_)
    grid_result[['params','mean_train_f1','mean_train_accuracy', 'mean_train_precision','mean_train_recall','mean_train_roc_auc']]
    grid_result.sort_values(by=['mean_train_f1'], ascending=False)
    pd.set_option('max_colwidth',500)

In [7]:
def evaluate_test(model, X_test, y_test):
    y_pred = model.predict(X_test)
    plot_precision_recall_curve(model, X_test, y_test)
    results_dict = {}
    results_dict['f1'] = metrics.f1_score(y_test, y_pred)
    results_dict['accuracy'] = metrics.accuracy_score(y_test, y_pred)
    results_dict['precision'] = metrics.precision_score(y_test, y_pred)
    results_dict['recall'] = metrics.recall_score(y_test, y_pred)
    results_dict['roc_auc'] = metrics.roc_auc_score(y_test, y_pred)
    plot_precision_recall_curve(model,X_test,y_test)
    return results_dict

In [8]:
def plot_importances(model, n=5, title=''):
    '''
    Compute the relative importance of selected features in
    the model
    
    Inputs:
    - model
    - n (int): top n features, opt
    - title (str)
    '''
    importances = model.feature_importances_
    np_features = np.array(features)
    sorted_idx = np.argsort(importances)[len(np_features)-n:]
    padding = np.arange(len(sorted_idx)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, np_features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title(title)
    pl.show()

In [9]:
def read_data(csv):
    return pd.read_csv(csv)

In [23]:
def fill_categorical_na_vals(df):
    '''
    Find colums and rows with missing values. Print rows, returns list of
    columns.
    '''
    null_columns= df.columns[df.isnull().any()]
    df = df.fillna("don't know")
#     print(df[df.isnull().any(axis=1)][null_columns])
#     print(null_columns)
    return df

# Cambodia

In [40]:
df = read_data('cleaned_data/cambodia_2014_cleaned.csv')
# na_vals(df)
# df.dropna(subset=['if_emo_vio'],inplace=True)
# na_vals(df)


In [45]:
for target in TARGET_LST:
    print("\n Target: ", target)
    X_train, X_test, y_train, y_test = split_data(df, FEATURES, target)
    print (df[target].unique())
    train_decision_tree(X_train, X_test, y_train, y_test)


 Target:  if_emo_vio
[0. 1.]


ValueError: could not convert string to float: 'battambang & pailin'

In [None]:
def plot_importances(df, features, label, n=10, title=''):
    '''
    Build a random forest classifier to
    compute the relative importance of selected features in
    predicting the label.
    
    Inputs:
    - df (pd.DataFrame)
    - features (lst of str)
    - label (str)
    - n (int): top n features, opt
    - title (str)
    '''
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(df[features], df[label])
    importances = clf.feature_importances_
    np_features = np.array(features)
    sorted_idx = np.argsort(importances)[len(np_features)-n:]
    padding = np.arange(len(sorted_idx)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, np_features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title(title)
    pl.show()

[Reference: Decision Tree Ensembles- Bagging and Boosting](https://towardsdatascience.com/decision-tree-ensembles-bagging-and-boosting-266a8ba60fd9)