In [47]:
import pandas as pd
import numpy as np
from Functions import Cleaning_Functions
from sklearn import model_selection, linear_model, neighbors, preprocessing, metrics, ensemble

fun = Cleaning_Functions()

clean = pd.read_csv("../data/clean.csv")
clean  = clean.drop("continent", axis =1)

In [48]:
def standarize_data(df):
        """
        Input: a dataset
        action: returns numeric column values scaled by mean and standard deviation
        """
        numeric_data = df.select_dtypes(include=['float64', 'int64'])
        for i in numeric_data.columns:
            df[i] = (df[i] - df[i].mean())/df[i].std()
        return df

In [49]:
market_data = fun.delete_id_columns(clean) #1
market_data, pred_market = fun.drop_response_rows_with_NAs(market_data, "Market_Orientation", "PPI_Likelihood") #2
market_data = fun.replace_NAN_with_na(market_data) #3
market_data = fun.entry_to_lowercase(market_data) #4
market_data = fun.remove_underscores_spaces(market_data) #5
market_data = fun.convert_to_categorical(market_data) #6
market_data = fun.impute_data(market_data)
market_data = standarize_data(market_data)


In [50]:
#Ana's fuc
def get_dummyXs_y(df, y_var):
    
    y = df[y_var]
    X  = df.drop(y_var, axis = 1)
    X_cat = X.select_dtypes(include = ["category", "O"])
    X_num = X.select_dtypes(include=['float64', 'int64'])
    
    X_cat_dummy = pd.get_dummies(X_cat)
    newX = pd.concat([X_num, X_cat_dummy], axis = 1)
    
    return newX, y

In [51]:
X, y = get_dummyXs_y(market_data, "Market_Orientation")
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X,y, test_size = 0.3, random_state = 50)


In [52]:
def fit_predict(clf, X_tr, X_te, y_tr, y_te):
    clf.fit(X_tr,y_tr)
    pred = clf.predict(X_te)
    mse = metrics.mean_squared_error(y_te, pred)
    
    return "MSE: {} ".format(mse)



def tune_parameters(X_train, y_train, clf, param_dict, cv=5):
    
   
    
    best_model = model_selection.GridSearchCV(clf, param_dict, cv=cv, scoring = "neg_mean_squared_error", n_jobs =-1, verbose=3)
    
    best_model.fit(X_train, y_train)
    
    print("Best Parameters: {} \n Training MSE: {} \n Parameter Index: {}".format(best_model.best_params_,best_model.best_score_,best_model.best_index_) ) # best is alpha = 0


    #uses gridsearch, prints best parameters, best model, its MSE on the training set
    #returns classifer
    
    return clf

test_mse_market = []

Market Orientation
=======

## Random Forest Model

In [53]:
forest_model = ensemble.RandomForestRegressor()
fit_predict(forest_model, X_tr, X_te, y_tr, y_te)

parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20, 50], 'max_features':["auto", "sqrt", "log2"]}

best_forest = tune_parameters( X_tr, y_tr,forest_model, parameters)

forest_pred = best_forest.predict(X_te)
forest_test_mse_market = metrics.mean_squared_error(y_te, forest_pred)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  3.9min finished


Best Parameters: {'max_depth': 50, 'max_features': 'auto', 'n_estimators': 250} 
 Training MSE: -0.014588005751682892 
 Parameter Index: 27


In [54]:
test_mse.append("Random Forrest Test MSE:{}".format(forest_test_mse))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, forest_pred)))

NameError: name 'test_mse' is not defined

In [None]:

forest_importances = pd.Series(best_forest.feature_importances_, index=X.columns).sort_values(ascending=False)
forest_importances


## XG Boosting Model

In [None]:
XG_model = ensemble.GradientBoostingRegressor()
fit_predict(XG_model, X_tr, X_te, y_tr, y_te)


parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20,50]}

best_XG = tune_parameters( X_tr, y_tr,XG_model, parameters)

XG_pred = best_XG.predict(X_te)
XG_test_mse = metrics.mean_squared_error(y_te, XG_pred)


test_mse_market.append("XGBoost Test MSE:{}".format(XG_test_mse))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, XG_pred)))

In [None]:

XG_importances = pd.Series(best_XG.feature_importances_, index=X.columns).sort_values(ascending=False)
XG_importances


PPI_Likelihood
=====

In [None]:
PPI_data = fun.delete_id_columns(clean) #1
PPI_data, pred_PPI = fun.drop_response_rows_with_NAs(PPI_data, "PPI_Likelihood", "Market_Orientation") #2
PPI_data = fun.replace_NAN_with_na(PPI_data) #3
PPI_data = fun.entry_to_lowercase(PPI_data) #4
PPI_data = fun.remove_underscores_spaces(PPI_data) #5
PPI_data = fun.convert_to_categorical(PPI_data) #6
PPI_data = fun.impute_data(PPI_data)
PPI_data = standarize_data(PPI_data)


X, y = get_dummyXs_y(PPI_data, "PPI_Likelihood")
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X,y, test_size = 0.3, random_state = 50)

test_mse_ppi = []

In [None]:
forest_model = ensemble.RandomForestRegressor()
fit_predict(forest_model, X_tr, X_te, y_tr, y_te)

parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20, 50]}

best_forest = tune_parameters( X_tr, y_tr,forest_model, parameters)

forest_pred = best_forest.predict(X_te)
forest_test_mse_ppi = metrics.mean_squared_error(y_te, forest_pred)

test_mse_ppi.append("Random Forrest Test MSE:{}".format(forest_test_mse_ppi))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, forest_pred)))

In [None]:

forest_importances = pd.Series(best_forest.feature_importances_, index=X.columns).sort_values(ascending=False)
forest_importances


In [None]:
XG_model = ensemble.GradientBoostingRegressor()
fit_predict(XG_model, X_tr, X_te, y_tr, y_te)


parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 20)), 'max_depth': [10,20,50]}

best_XG = tune_parameters( X_tr, y_tr,XG_model, parameters)

XG_pred = best_XG.predict(X_te)
XG_test_mse = metrics.mean_squared_error(y_te, XG_pred)

test_mse_ppi.append("XGBoost Test MSE:{}".format(XG_test_mse))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, XG_pred)))

In [None]:
XG_importances = pd.Series(best_XG.feature_importances_, index=X.columns).sort_values(ascending=False)
XG_importances


## Country Specific PPI Likeilhood

In [None]:
#produces a dictionary of country specific dataframes
country_dict={}
for country in PPI_data["Country"].values.unique():
    new_df = PPI_data[PPI_data["Country"].values  == country]
    country_dict[country] = new_df

In [None]:
#wrapper func
def country_model(country, y, clf, parameter_dict):
    """
    INPUT
    country: str, country name as appears in dataframe
    y: str, column name of response
    clf: scikitlearn clf, the scikit learn model to train 
    parameter_dict: dict, dictionary of model parameters
    
    OUTPUT
    country: str, country name as appears in dataframe
    clf: trained best model
    mse: test mse for this model
    index: the list of dummy varaible columns for that country
    """
    X,y = get_dummyXs_y(country_dict[country], y)
    X_tr,X_te,y_tr,y_te = model_selection.train_test_split(X,y, test_size=0.3, random_state=50)
    
    index = X.columns
    fit_predict(clf, X_tr,X_te,y_tr,y_te)
    
    best_clf = tune_parameters(X_tr, y_tr, clf, parameter_dict)
    
    best_pred = best_clf.predict(X_te)
    mse = metrics.mean_squared_error(y_te, best_pred)
    print(best_clf.feature_importances_.sort())
    print("\n \n {} \n Test MSE: {}".format(country, mse))
    
    return country, best_clf.feature_importances_, mse, index

In [None]:
country_dict["ghana"].shape

In [None]:
forest_model = ensemble.RandomForestRegressor()
parameters= {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20, 50]}

#
#name, ghana_forrest, ghana_mse, index = country_model(country_dict["ghana"], "PPI_Likelihood",forest_model,parameters)
#for key in country_dict:


country_results= {}
for country in PPI_data["Country"].values.unique():
    country_results[country] =country_model(country,
                          "PPI_Likelihood",
                          forest_model,
                          parameters)

In [None]:
country_feature = {}
for country in country_results:
    country_feature[country] = pd.Series(country_results[country][1], 
                                      index=country_results[country][3].values).sort_values(ascending=False)



In [None]:
country_feature[1:10]

In [None]:
def plot_country_feature(country, ax1):
    country_feature[country][1:10].plot(kind="bar", 
                                   title = "Most Important Features: {}".format(country), 
                                   ylabel = "Importance Metric",
                                   xlabel = "Features", ax=ax1)
    
        


In [None]:
for country in country_feature:
    print(country)

In [None]:
import matplotlib.pyplot as plt
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5), 
                             sharex=False)
plot_country_feature("tanzania", ax1=ax1)
plot_country_feature("guatemala", ax1=ax2)
plot_country_feature("honduras", ax1=ax3)
plot_country_feature("elsalvador", ax1=ax4)

f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5), 
                             sharex=False)
plot_country_feature("mali", ax1=ax1)
plot_country_feature("burkinafaso", ax1=ax2)
plot_country_feature("malawi", ax1=ax3)
plot_country_feature("ethiopia", ax1=ax4)

f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5), 
                             sharex=False)
plot_country_feature("india", ax1=ax1)
plot_country_feature("cambodia", ax1=ax2)
plot_country_feature("vietnam", ax1=ax3)
plot_country_feature("kenya", ax1=ax4)

f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5), 
                             sharex=False)
plot_country_feature("zambia", ax1=ax1)
plot_country_feature("ghana", ax1=ax2)
plot_country_feature("uganda", ax1=ax3)
plot_country_feature("peru", ax1=ax4)

# By Continent 

In [None]:
clean = pd.read_csv("../data/clean.csv")
clean  = clean.drop("Country", axis =1)

PPI_data = fun.delete_id_columns(clean) #1
PPI_data, pred_PPI = fun.drop_response_rows_with_NAs(PPI_data, "PPI_Likelihood", "Market_Orientation") #2
PPI_data = fun.replace_NAN_with_na(PPI_data) #3
PPI_data = fun.entry_to_lowercase(PPI_data) #4
PPI_data = fun.remove_underscores_spaces(PPI_data) #5
PPI_data = fun.convert_to_categorical(PPI_data) #6
PPI_data = fun.impute_data(PPI_data)
PPI_data = standarize_data(PPI_data)


X, y = get_dummyXs_y(PPI_data, "PPI_Likelihood")
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X,y, test_size = 0.3, random_state = 50)

test_mse_ppi = []


In [None]:
#produces a dictionary of country specific dataframes
continent_dict={}
for continent in PPI_data["continent"].values.unique():
    new_df = PPI_data[PPI_data["continent"].values  == continent]
    continent_dict[continent] = new_df

In [None]:
def continent_model(continent, y, clf, parameter_dict):
    """
    INPUT
    continent: str, continent name as appears in dataframe
    y: str, column name of response
    clf: scikitlearn clf, the scikit learn model to train 
    parameter_dict: dict, dictionary of model parameters
    
    OUTPUT
    continent: str, continent name as appears in dataframe
    clf: trained best model
    mse: test mse for this model
    index: the list of dummy varaible columns for that country
    """
    X,y = get_dummyXs_y(continent_dict[continent], y)
    X_tr,X_te,y_tr,y_te = model_selection.train_test_split(X,y, test_size=0.3, random_state=50)
    
    index = X.columns
    fit_predict(clf, X_tr,X_te,y_tr,y_te)
    
    best_clf = tune_parameters(X_tr, y_tr, clf, parameter_dict)
    
    best_pred = best_clf.predict(X_te)
    mse = metrics.mean_squared_error(y_te, best_pred)
    print(best_clf.feature_importances_.sort())
    print("\n \n {} \n Test MSE: {}".format(continent, mse))
    
    return continent, best_clf.feature_importances_, mse, index

In [None]:
forest_model = ensemble.RandomForestRegressor()
parameters= {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20, 50]}

#
#name, ghana_forrest, ghana_mse, index = country_model(country_dict["ghana"], "PPI_Likelihood",forest_model,parameters)
#for key in country_dict:


continent_results= {}
for continent in PPI_data["continent"].values.unique():
    continent_results[continent] =continent_model(continent,
                          "PPI_Likelihood",
                          forest_model,
                          parameters)

In [None]:
continent_feature = {}
for continent in continent_results:
    continent_feature[continent] = pd.Series(continent_results[continent][1], 
                                      index=continent_results[continent][3].values).sort_values(ascending=False)


In [None]:
def plot_continent_feature(continent, ax1):
    continent_feature[continent][1:10].plot(kind="bar", 
                                   title = "Most Important Features: {}".format(continent), 
                                   ylabel = "Importance Metric",
                                   xlabel = "Features",
                                            ax=ax1)
    
        


In [None]:
import matplotlib.pyplot as plt
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5), 
                             sharex=False)
plot_continent_feature("africa", ax1=ax1)
plot_continent_feature("centralamerica", ax1=ax2)
plot_continent_feature("asia", ax1=ax3)
plot_continent_feature("southamerica", ax1=ax4)

