In [2]:
import pandas as pd
import numpy as np
from Functions import Cleaning_Functions
from sklearn import model_selection, linear_model, neighbors, preprocessing, metrics, ensemble

fun = Cleaning_Functions()

clean = pd.read_csv("../data/clean.csv")


In [62]:
def standarize_data(df):
        """
        Input: a dataset
        action: returns numeric column values scaled by mean and standard deviation
        """
        numeric_data = df.select_dtypes(include=['float64', 'int64'])
        for i in numeric_data.columns:
            df[i] = (df[i] - df[i].mean())/df[i].std()
        return df

In [63]:
market_data = fun.delete_id_columns(clean) #1
market_data, pred_market = fun.drop_response_rows_with_NAs(market_data, "Market_Orientation", "PPI_Likelihood") #2
market_data = fun.replace_NAN_with_na(market_data) #3
market_data = fun.entry_to_lowercase(market_data) #4
market_data = fun.remove_underscores_spaces(market_data) #5
market_data = fun.convert_to_categorical(market_data) #6
market_data = fun.impute_data(market_data)
market_data = standarize_data(market_data)


In [64]:
#Ana's fuc
def get_dummyXs_y(df, y_var):
    
    y = df[y_var]
    X  = df.drop(y_var, axis = 1)
    X_cat = X.select_dtypes(include = ["category", "O"])
    X_num = X.select_dtypes(include=['float64', 'int64'])
    
    X_cat_dummy = pd.get_dummies(X_cat)
    newX = pd.concat([X_num, X_cat_dummy], axis = 1)
    
    return newX, y

In [65]:
X, y = get_dummyXs_y(market_data, "Market_Orientation")
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X,y, test_size = 0.3, random_state = 50)


In [66]:
def fit_predict(clf, X_tr, X_te, y_tr, y_te):
    clf.fit(X_tr,y_tr)
    pred = clf.predict(X_te)
    mse = metrics.mean_squared_error(y_te, pred)
    
    return "MSE: {} ".format(mse)



def tune_parameters(X_train, y_train, clf, param_dict, cv=5):
    
   
    
    best_model = model_selection.GridSearchCV(clf, param_dict, cv=cv, scoring = "neg_mean_squared_error", n_jobs =-1, verbose=3)
    
    best_model.fit(X_train, y_train)
    
    print("Best Parameters: {} \n Training MSE: {} \n Parameter Index: {}".format(best_model.best_params_,best_model.best_score_,best_model.best_index_) ) # best is alpha = 0


    #uses gridsearch, prints best parameters, best model, its MSE on the training set
    #returns classifer
    
    return clf

test_mse_market = []

Market Orientation
=======

## Random Forest Model

In [67]:
forest_model = ensemble.RandomForestRegressor()
fit_predict(forest_model, X_tr, X_te, y_tr, y_te)

parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20, 50], 'max_features':["auto", "sqrt", "log2"]}

best_forest = tune_parameters( X_tr, y_tr,forest_model, parameters)

forest_pred = best_forest.predict(X_te)
forest_test_mse_market = metrics.mean_squared_error(y_te, forest_pred)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  4.7min finished


Best Parameters: {'max_depth': 50, 'max_features': 'auto', 'n_estimators': 200} 
 Training MSE: -0.010516094751742087 
 Parameter Index: 26


In [68]:
test_mse.append("Random Forrest Test MSE:{}".format(forest_test_mse))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, forest_pred)))

Test MSE: 0.004644118607085156


In [69]:

forest_importances = pd.Series(best_forest.feature_importances_, index=X.columns).sort_values(ascending=False)
forest_importances


value_livestock_prod_consumed_USD_PPP_pHH_Yr    0.427205
farm_income_USD_PPP_pHH_Yr                      0.303100
value_crop_consumed_USD_PPP_pHH_Yr              0.127582
total_income_USD_PPP_pHH_Yr                     0.126198
value_farm_produce_USD_PPP_pHH_Yr               0.004343
                                                  ...   
crop_name_1_broadbeans                          0.000000
crop_name_1_achiote                             0.000000
crop_name_1_watermelon                          0.000000
crop_name_1_zucchini                            0.000000
crop_name_1_chickpeas                           0.000000
Length: 159, dtype: float64

## XG Boosting Model

In [70]:
XG_model = ensemble.GradientBoostingRegressor()
fit_predict(XG_model, X_tr, X_te, y_tr, y_te)


parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20,50]}

best_XG = tune_parameters( X_tr, y_tr,XG_model, parameters)

XG_pred = best_XG.predict(X_te)
XG_test_mse = metrics.mean_squared_error(y_te, XG_pred)


test_mse_market.append("XGBoost Test MSE:{}".format(XG_test_mse))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, XG_pred)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.9min finished


Best Parameters: {'max_depth': 10, 'n_estimators': 250} 
 Training MSE: -0.012467946363262442 
 Parameter Index: 3
Test MSE: 0.021677865365120983


In [71]:

XG_importances = pd.Series(best_XG.feature_importances_, index=X.columns).sort_values(ascending=False)
XG_importances


value_livestock_prod_consumed_USD_PPP_pHH_Yr    0.449191
farm_income_USD_PPP_pHH_Yr                      0.322268
value_crop_consumed_USD_PPP_pHH_Yr              0.131396
total_income_USD_PPP_pHH_Yr                     0.068298
value_farm_produce_USD_PPP_pHH_Yr               0.018361
                                                  ...   
crop_name_1_yam                                 0.000000
crop_name_1_wheat                               0.000000
crop_name_1_watermelon                          0.000000
crop_name_1_tomato                              0.000000
crop_name_1_pigeonpea                           0.000000
Length: 159, dtype: float64

PPI_Likelihood
=====

In [72]:
PPI_data = fun.delete_id_columns(clean) #1
PPI_data, pred_PPI = fun.drop_response_rows_with_NAs(PPI_data, "PPI_Likelihood", "Market_Orientation") #2
PPI_data = fun.replace_NAN_with_na(PPI_data) #3
PPI_data = fun.entry_to_lowercase(PPI_data) #4
PPI_data = fun.remove_underscores_spaces(PPI_data) #5
PPI_data = fun.convert_to_categorical(PPI_data) #6
PPI_data = fun.impute_data(PPI_data)
PPI_data = standarize_data(PPI_data)


X, y = get_dummyXs_y(PPI_data, "PPI_Likelihood")
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X,y, test_size = 0.3, random_state = 50)

test_mse_ppi = []

In [73]:
forest_model = ensemble.RandomForestRegressor()
fit_predict(forest_model, X_tr, X_te, y_tr, y_te)

parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 50)), 'max_depth': [10,20, 50]}

best_forest = tune_parameters( X_tr, y_tr,forest_model, parameters)

forest_pred = best_forest.predict(X_te)
forest_test_mse_ppi = metrics.mean_squared_error(y_te, forest_pred)

test_mse_ppi.append("Random Forrest Test MSE:{}".format(forest_test_mse_ppi))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, forest_pred)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  6.8min finished


Best Parameters: {'max_depth': 20, 'n_estimators': 200} 
 Training MSE: -0.38538118741894906 
 Parameter Index: 6
Test MSE: 0.3931938260734791


In [74]:

forest_importances = pd.Series(best_forest.feature_importances_, index=X.columns).sort_values(ascending=False)
forest_importances


Country_mali                0.163262
Country_zambia              0.142931
LivestockHoldings           0.056198
continent_africa            0.052077
HHsizemembers               0.044168
                              ...   
crop_name_1_custardapple    0.000000
crop_name_1_coriander       0.000000
crop_name_1_peas            0.000000
crop_name_1_lemons          0.000000
crop_name_1_tobacco         0.000000
Length: 154, dtype: float64

In [75]:
XG_model = ensemble.GradientBoostingRegressor()
fit_predict(XG_model, X_tr, X_te, y_tr, y_te)


parameters = dict()
parameters = {'n_estimators':(np.arange(100, 300, 20)), 'max_depth': [10,20,50]}

best_XG = tune_parameters( X_tr, y_tr,XG_model, parameters)

XG_pred = best_XG.predict(X_te)
XG_test_mse = metrics.mean_squared_error(y_te, XG_pred)

test_mse_ppi.append("XGBoost Test MSE:{}".format(XG_test_mse))

print("Test MSE: {}".format(metrics.mean_squared_error(y_te, XG_pred)))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 16.4min finished


Best Parameters: {'max_depth': 10, 'n_estimators': 120} 
 Training MSE: -0.3860526893646002 
 Parameter Index: 1
Test MSE: 0.4070801718035054


In [76]:
XG_importances = pd.Series(best_XG.feature_importances_, index=X.columns).sort_values(ascending=False)
XG_importances


Country_mali                0.250837
Country_zambia              0.215757
continent_africa            0.073566
HHsizemembers               0.066270
Country_malawi              0.059080
                              ...   
crop_name_1_sugarcane       0.000000
crop_name_1_sunflower       0.000000
crop_name_1_sweetpotato     0.000000
crop_name_1_tea             0.000000
crop_name_1_passionfruit    0.000000
Length: 154, dtype: float64

## Country Specific PPI Likeilhood