In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from scipy import stats

import hazelbean as hb
L = hb.get_logger()

import seaborn as sns
import matplotlib 
%matplotlib inline

import matplotlib.pyplot as plt

In [117]:
import xgboost as xgb

# Load dataset

In [46]:
L.info('Loading data')
baseline_df = pd.read_csv('../IPBES project/intermediate/with_climate_zones/baseline_regression_data.csv')
L.info('Data loaded')

climate_zones_map = {1:'Af',2:'Am',3:'Aw',
                     5:'BWk',4:'BWh',7:'BSk',6:'BSh',
                     14:'Cfa',15:'Cfb',16:'Cfc',8:'Csa',
                     9:'Csb',10:'Csc',11:'Cwa',12:'Cwb',13:'Cwc',
                     25:'Dfa',26:'Dfb',27:'Dfc',28:'Dfd',17:'Dsa',18:'Dsb',19:'Dsc',
                     20:'Dsd',21:'Dwa',22:'Dwb',23:'Dwc',24:'Dwd',
                     30:'EF',29:'ET'}
baseline_df['climate_zone'] = baseline_df['climate_zones'].map(climate_zones_map)
baseline_df = baseline_df.drop(['Unnamed: 0','climate_zones'], axis=1)


Loading data
Data loaded


# 2 Save validation set

In [None]:
x = baseline_df.drop(['calories_per_cell'], axis=1)
y = baseline_df['calories_per_cell']

X, X_validation, Y, y_validation = train_test_split(x, y)

#### OR: Select a subset for faster computation (means the rest is a validation set!)

In [85]:
subset = baseline_df.sample(frac=0.01, replace=False, weights=None, random_state=None, axis=0)
df = subset



# 3 Data transformation 

### (a) Log

Problem: distribution super highly skewed left, stats test makes assumptions of normality.. :/


But first create a binary (has_ag) column for pixels where calories = 0 (because log doesn't deal with zeros)

In [90]:
dfLog.head()

Unnamed: 0,calories_per_cell,precip,temperature,gdp_2000,minutes_to_market,workability,toxicity,rooting_conditions,protected_areas,oxygen_availability,...,gdp_gecon,slope,altitude,c3_annual_calories,c3_perennial_calories,c4_annual_calories,c4_perennial_calories,nitrogen_fixer_calories,climate_zone,has_ag
245372,"(-4.905, 3.06]",364.0,0.0,3870.412333,1558.0,0.0,0.0,0.0,1.0,0.0,...,28973.035,89.2457,28.0,0.0,0.0,0.0,0.0,0.0,,0
1279363,"(-4.905, 3.06]",270.0,191.0,1595.766665,146.0,1.0,1.0,1.0,1.0,1.0,...,3371.3179,89.93965,375.0,0.0,0.0,0.0,0.0,0.0,,0
2165377,"(18.991, 26.956]",972.0,160.0,5398.717829,66.0,1.0,1.0,1.0,1.0,1.0,...,6124.1035,88.89099,79.0,14246975.0,0.0,20052628.0,0.0,8930732.0,EF,1
1512967,"(-4.905, 3.06]",35.0,276.0,12088.92318,1162.0,1.0,1.0,1.0,0.5,1.0,...,15752.66,89.38901,132.0,0.0,0.0,0.0,0.0,0.0,,0
996418,"(-4.905, 3.06]",109.0,44.0,768.882875,574.0,1.0,1.0,1.0,1.0,1.0,...,2281.1865,89.88089,1275.0,0.0,0.0,0.0,0.0,0.0,BWh,0


In [86]:
df.head()

Unnamed: 0,calories_per_cell,precip,temperature,gdp_2000,minutes_to_market,workability,toxicity,rooting_conditions,protected_areas,oxygen_availability,...,excess_salts,gdp_gecon,slope,altitude,c3_annual_calories,c3_perennial_calories,c4_annual_calories,c4_perennial_calories,nitrogen_fixer_calories,climate_zone
1231542,12533350000.0,428.0,122.0,2554.036439,492.0,0.5,1.0,0.5,1.0,1.0,...,1.0,5799.252,89.966896,1777.0,8092959.0,0.0,0.0,0.0,29539.332,
648637,0.0,488.0,0.0,33372.029454,715.0,1.0,1.0,1.0,1.0,0.75,...,1.0,28856.229,89.81865,623.0,0.0,0.0,0.0,0.0,0.0,Dfb
164900,0.0,103.0,0.0,33372.029454,5182.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,89.60349,107.0,0.0,0.0,0.0,0.0,0.0,Dfc
1380316,0.0,33.0,198.0,1595.766665,342.0,0.0,0.0,0.0,1.0,0.0,...,0.0,2009.4701,89.788994,149.0,0.0,0.0,0.0,0.0,0.0,
1184546,0.0,164.0,132.0,2554.036439,378.0,0.5,0.75,0.5,1.0,0.75,...,0.75,7152.732,89.97898,1606.0,0.0,0.0,0.0,0.0,0.0,Am


In [92]:
dfLog = df

dfLog['has_ag'] = pd.Series(len(dfLog['calories_per_cell']), index=dfLog.index)
dfLog['has_ag'] = 0 
dfLog.loc[dfLog['calories_per_cell']>0,'has_ag'] = 1

dfLog.loc[dfLog['has_ag']==1,'calories_per_cell'] = np.log(dfLog['calories_per_cell'])
 
## Why is the code above changing not only dfLog but also df itself ?! WTF??!
## Till I fugure this out - re-run subset box before each data transfo!




### (b) Binning

In [87]:
dfBin = df

#Make bin 0 for no ag cells
dfBin['has_ag'] = pd.Series(len(dfBin['calories_per_cell']), index=dfBin.index)
dfBin['has_ag'] = 0 
dfBin.loc[dfBin['calories_per_cell']>0,'has_ag'] = 1

# Make bins 1-5 for ag cells
dfBin[dfBin['has_ag']==1,'calories_per_cell'] = pd.cut(dfBin[dfBin['has_ag']==1,'calories_per_cell'],bins=5,labels=[1,2,3,4,5])

## Why ?!

TypeError: 'Series' objects are mutable, thus they cannot be hashed

### (c) Log + binning

In [93]:
dfLogBin = dfLog

dfLogBin['calories_per_cell'] = pd.cut(dfLogBin['calories_per_cell'], 5,labels=[1,2,3,4,5])

## Same Why is the code above changing not only dfLogBin but also dfLog itself ?! WTF??!
## Till I fugure this out - re-run subset box before each data transfo!


# 4 Train/test split

In [None]:
#included in models fctions

###  For reference : Train/Test Split
x = dataframe.drop(['calories_per_cell'], axis=1)
y = dataframe['calories_per_cell']
X_train, X_test, y_train, y_test = train_test_split(x, y)


# 5a Models: Linear Regression

In [125]:
def linreg(dataframe):
    ##Must make dummies for categorical variable climate_zone
    dataframe = pd.get_dummies(dataframe, columns=['climate_zone'])
    #Or just drop column if don't want dummies: x = x.drop(['climate_zone'], axis=1)

    x = dataframe.drop(['calories_per_cell'], axis=1)
    y = dataframe['calories_per_cell']

    ### Linear Regression
    lr = LinearRegression()


    ### Cross validation R2 score

    cv_scores = cross_val_score(lr, x, y, cv=10)

    print('Mean R2 score: ', np.mean(cv_scores))

### Without climate_zone dummies

In [40]:
linreg(dfBin)

Mean R2 score:  0.20252177899190257


In [83]:
linreg(dfLog)

Mean R2 score:  0.9856761717237082


In [94]:
linreg(dfLogBin)

Mean R2 score:  0.9765347103743824


### With climate_zone dummies


In [126]:
linreg(dfBin)

Mean R2 score:  0.9765885224042776


In [109]:
linreg(dfLog)

Mean R2 score:  0.9765885224042776


In [110]:
linreg(dfLogBin)

Mean R2 score:  0.9765885224042776


In [111]:
## Why ? Clearly I have a problem of variable assignments where dfLog, dfBin and dfLogBin are all the same df hum hum hum

# 5b Models: XGBoost

In [114]:
def xgboost_reg(dataframe):
    ##Must make dummies for categorical variable climate_zone
    #dataframe = pd.get_dummies(dataframe, columns=['climate_zone'])
    #Or just drop column if don't want dummies: x = x.drop(['climate_zone'], axis=1)

    x = dataframe.drop(['calories_per_cell'], axis=1)
    y = dataframe['calories_per_cell']

    ### XGBoost Regressor
    xgbreg = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

    ### Cross validation R2 score

    cv_scores = cross_val_score(xgbreg, x, y, cv=10)

    print('Mean R2 score: ', np.mean(cv_scores))

In [115]:
xgboost_reg(dfBin)

AttributeError: module 'xgboost' has no attribute 'XGBRegressor'

In [119]:
from xgboost import XGBRegressor

ImportError: cannot import name 'XGBRegressor'