In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
%matplotlib inline
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the semicolon-delimited amphibians data set. The first parsed row holds the
# real column names, so it is promoted to the header and removed from the data.
# NOTE(review): because the label row was parsed as data, the columns are presumably
# object (string) dtype until they are converted further down — confirm.
df = pd.read_csv('../input/amphibians-data-set/dataset.csv',delimiter=';')
new_header = df.iloc[0]  # row 0 carries the actual column labels
df = df[1:]  # keep only the observation rows; their index labels start at 1
df.columns = new_header

# EDA

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Report, column by column, how many entries are missing.
total_rows = df.shape[0]
for column_name, missing_count in df.isnull().sum().items():
    print(column_name, missing_count, 'of', total_rows, 'values are missing')

We have no missing values in our dataset.

Numerical variables: ID, SR, NR. <br>
Categorical variables: all the rest, including OR, which takes only a few discrete values and is label-encoded below. Some of them are ordinal.

In [3]:
numerical_variables = ['ID','SR','NR']
# Every remaining column is treated as categorical. A comprehension over df.columns
# is used instead of a set difference so the order follows the data frame and is the
# same on every run (iteration order of a set of strings changes between sessions,
# which reshuffled the plots and any column-order-dependent output).
categorical_variables = [col for col in df.columns if col not in numerical_variables]

In [None]:
# Frequency table for each of the three surroundings columns.
category2 = ["SUR1", "SUR2", "SUR3"]
for surroundings_col in category2:
    counts = df[surroundings_col].value_counts()
    print(f"{counts} \n")

In [4]:
ds = df.copy()  

In [None]:
from plotnine import options
options.figure_size = (6,2)
# One bar chart per categorical column, filled by motorway.
for col in categorical_variables:
    # Order the x-axis levels through a categorical dtype instead of sorting the
    # column in place: sorting one column independently of the others breaks row
    # alignment, so each bar would be filled with the 'Motorway' values of other rows.
    levels = sorted(ds[col].unique())
    plot_data = ds.assign(**{col: pd.Categorical(ds[col], categories=levels)})
    print(
        ggplot(plot_data)
        + aes(col, fill='Motorway')
        + geom_bar(width=0.3)
        + ggtitle('')
    )

In [None]:
ds.columns

In [5]:
# Presence/absence label columns, one per amphibian species.
species = ['Green frogs', 'Brown frogs', 'Common toad', 'Fire-bellied toad', 'Tree frog',
           'Common newt', 'Great crested newt']
# Target = number of species present at each site (row-wise sum of the 0/1 labels).
# The builtin `int` is used because the `np.int` alias was removed in NumPy 1.24
# and now raises AttributeError.
df['Species'] = df[species].astype(int).sum(axis=1)
# The individual label columns are no longer needed once the count exists.
df = df.drop(species, axis=1)

In [6]:
# Fully numeric copy of the frame (motorway encoded as A1 -> 1, S52 -> 2, identifier
# removed) used only to draw the annotated correlation heatmap.
data_copy = df.copy()
data_copy['Motorway'] = data_copy['Motorway'].replace({'A1': 1, 'S52': 2})
data_copy = data_copy.apply(pd.to_numeric).drop(columns=['ID'])

plt.figure(figsize=(16, 6))
correlations = data_copy.corr()
ax = sns.heatmap(correlations, vmin=-1, vmax=1, annot=True)

- Highly positively correlated features:<br>NR and SR, FR and UR, RR and BR.
- Highly negatively correlated features:<br>UR and VR, FR and VR. 
- Species are weakly correlated with VR, SUR1, SUR2, RR, BR.

In [None]:
df.head()

In [7]:
# Working copy with the motorway label encoded as an integer (A1 -> 1, S52 -> 2).
data = df.copy()
data['Motorway'] = data['Motorway'].replace({'A1': 1, 'S52': 2})

In [8]:
data = data.apply(pd.to_numeric) 

In [9]:
data = data.drop(['ID'],axis=1)

## Checking for outliers

In [10]:
def detect_outliers(df2,features):
    """Return the index labels of rows that are outliers in more than two features.

    A value counts as an outlier for a feature when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that feature.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame holding the columns to inspect.
    features : iterable
        Column names of ``df2`` to check.

    Returns
    -------
    list
        Index labels flagged in at least three of the given features, in order
        of first appearance.
    """
    hits_per_row = Counter()

    for col in features:
        values = df2[col]
        q1 = np.percentile(values, 25)
        q3 = np.percentile(values, 75)
        # Tukey fences: 1.5 times the interquartile range on either side.
        fence = (q3 - q1) * 1.5
        lower, upper = q1 - fence, q3 + fence

        flagged = df2[(values < lower) | (values > upper)].index
        hits_per_row.update(flagged)

    return [label for label, n_hits in hits_per_row.items() if n_hits > 2]

In [11]:
df.loc[detect_outliers(data,data.columns)]

In [12]:
# Remove, in place, the rows flagged as outliers in more than two columns of `data`.
df.drop(index=detect_outliers(data, data.columns), inplace=True)

## Labelling categorical data

In [13]:
df2 = df.copy() #Using df2 to label encoding

In [14]:
# Keep only the predictor categories: drop the per-species label columns and
# 'Motorway'. Filtering replaces list.remove() inside a bare try/except, which hid
# every kind of error, and makes the cell safe to re-run (a second
# .remove('Motorway') raised ValueError).
excluded = set(species) | {'Motorway'}
categorical_variables = [col for col in categorical_variables if col not in excluded]

In [15]:
df2[categorical_variables] = df[categorical_variables].astype('int')

In [16]:
# Human-readable labels for the integer codes of every categorical column.
# NOTE(review): 'remove' presumably marks codes to be discarded (the FR_remove and
# OR_remove dummy columns are dropped later in the notebook), and '>10000' looks like
# it may be a typo for the last distance band — confirm against the data dictionary
# before changing, because the labels become one-hot column names.
surroundings_labels = {1:'forest areas', 2:'meadows', 4:'gardens',
                       6:'industrial areas', 10:'river valleys', 7:'orchards', 9:'roads',
                       14:'agricultural'}
distance_labels = {0:'<50 m', 1:'50-100 m', 2:'100-200 m', 5:'200-500 m', 9:'500-1000 m',
                   10:'>10000'}
code_labels = {
    'TR': {1:'natural reservoirs', 2:'recently formed', 5:'technological',
           7:'garden', 11:'trenches', 12:'wet meadows', 14:'river valleys',
           15:'small watercourses'},
    'VR': {0:'no vegetation', 1:'patches at the edges', 2:'heavily overgrown',
           3:'some part devoid of vegetation', 4:'reservoirs completely overgrown'},
    'SUR1': surroundings_labels,
    'SUR2': surroundings_labels,
    'SUR3': surroundings_labels,
    'UR': {0:'unused', 1:'scenic', 3:'technological'},
    'FR': {0:'lack', 1:'intense fishing', 2:'breeding reservoirs',
           3:'remove', 4:'remove'},
    'OR': {25:'poor access', 50:'low access', 75:'medium access', 100:'large access',
           99:'remove', 80:'remove'},
    'RR': distance_labels,
    'BR': distance_labels,
    'MR': {0:'Clean', 1:'slightly littered', 2:'heavily littered'},
    'CR': {1:'Natural', 2:'Concrete'},
}

# Assign the result back instead of calling replace(..., inplace=True) on df2[col]:
# the in-place form acts on a temporary column object, which triggers a chained-
# assignment warning on pandas 2.x and leaves df2 unchanged under Copy-on-Write.
for column, labels in code_labels.items():
    df2[column] = df2[column].replace(labels)

In [None]:
df2.head()

In [17]:
# One-hot encode every labelled column and put the identifier, the two numeric
# predictors and the target back in front of the dummy columns.
passthrough_columns = ['ID', 'SR', 'NR', 'Species']
df3 = pd.get_dummies(df2.drop(columns=passthrough_columns))
frames = [df2[passthrough_columns], df3]
df_res = pd.concat(frames, axis=1)

In [18]:
df_res.head()

Remake plotnine visualization

In [None]:
#%%time
# Count bar chart, filled by species count, for every one-hot column from
# 'TR_garden' onwards, with the counts written on top of the bars.
binary_data = df_res.loc[:, 'TR_garden':]

# The enumerate index was never used and the commented-out alternatives inside the
# expression were dead code, so both are removed.
for col in binary_data.columns:
    print(
        ggplot(df_res, aes(col, fill='Species'))
        + geom_bar()
        + geom_text(
            aes(label=after_stat('count'), fill='Species'),
            stat='count',
            va='bottom',
        )
    )

In [None]:
df_res.shape

In [19]:
# Split the encoded frame into the columns to standardise and the one-hot columns.
numeric_columns = ["SR","NR","Species"]
numerical = df_res[numeric_columns]
categorical = df_res.drop(["SR","ID","NR","Species"],axis=1)

scaler = StandardScaler()
# fit_transform returns a bare ndarray. The frame is rebuilt with the ORIGINAL row
# index: a default 0..n-1 index does not match the labels of `categorical` (which
# start at 1 and have gaps after outlier removal), so the index-based inner join in
# the next cell paired scaled values with the wrong rows and dropped some of them.
numerical = pd.DataFrame(scaler.fit_transform(numerical),
                         index=numerical.index,
                         columns=numeric_columns)

In [20]:
df_new = pd.concat([df['ID'],numerical, categorical], axis=1, join='inner')

In [21]:
df_new.head()

In [22]:
binary_data=df_new.loc[:,'TR_garden':]

In [24]:
# Cast every categorical predictor plus the two numeric columns to integers, then
# build the modelling frame without the identifier.
integer_columns = categorical_variables + ['SR', 'NR']
df[integer_columns] = df[integer_columns].astype('int')
data = df.drop(columns=["ID"])

In [None]:
data.head()

In [26]:
plt.figure(figsize=(7,4))
# 'Motorway' is still a text column in `data`. Since pandas 2.0 DataFrame.corr() no
# longer skips non-numeric columns by default and raises instead, so the numeric
# columns are selected explicitly (this works on older pandas versions as well).
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(),cmap='magma',linecolor='white',annot=False,linewidths=1)

In [27]:
# Dummy columns excluded from the one-hot feature set.
unused_dummies = ["TR_garden", "Motorway_A1", "Motorway_S52", "FR_remove", "OR_remove"]
df_new = df_new.drop(columns=unused_dummies)

In [28]:
# Target and feature matrix taken from the one-hot encoded frame.
target_name = 'Species'
y = df_new[target_name]
x = df_new.drop(columns=[target_name])
X = pd.DataFrame(x)

# Modelling

## XGBoost with Label Encoding

- df - for Label encoding
- df_new - one-hot encoding

In [29]:
import xgboost as xgb
from xgboost import XGBRegressor

In [30]:
df.head()

In [31]:
# Target: number of species per site. Features: every label-encoded column except
# the identifier and the motorway name.
target_column = "Species"
y = df[target_column]
x = df.drop(columns=[target_column, "ID", "Motorway"])

In [32]:
x.head()

In [33]:
plt.figure(figsize=(7,5))
sns.distplot(y)
plt.show()

In [34]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [35]:
xgb = XGBRegressor().fit(x_train, y_train)

In [36]:
y_pred = xgb.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

RMSE = 1.8810549053444434

In [None]:
xgb

In [37]:
xgb_grid = {
    'colsample_bytree': [0.4, 0.5, 0.6, 0.9 ,1],
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.5]
}

In [None]:
# Exhaustive search over xgb_grid with 10-fold cross-validation on the training split.
# No `scoring` argument is passed, so candidates are ranked by the estimator's own
# default score rather than by RMSE.
# With the grid defined in the previous cell (5 * 4 * 5 * 3 = 300 combinations) this
# amounts to 3000 model fits, so expect the cell to be slow; n_jobs=-1 uses all cores.
# NOTE(review): the name `xgb` also shadows the `import xgboost as xgb` module alias.
xgb = XGBRegressor()
xgb_cv = GridSearchCV(xgb, 
                     param_grid = xgb_grid,
                     cv=10,
                     n_jobs = -1,
                     verbose = 2)
xgb_cv.fit(x_train, y_train)

In [38]:
xgb_cv.best_params_

In [39]:
# Refit on the training split with a fixed set of hyper-parameters.
# NOTE(review): these values were presumably copied from xgb_cv.best_params_ — confirm
# they still match after the grid search is re-run.
tuned_params = {
    'colsample_bytree': 0.4,
    'learning_rate': 0.01,
    'max_depth': 2,
    'n_estimators': 500,
}
xgb_tuned = XGBRegressor(**tuned_params).fit(x_train, y_train)

In [40]:
y_pred = xgb_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

RMSE = 1.5419139624348617

In [41]:
Importance=pd.DataFrame({"Importance":xgb_tuned.feature_importances_*100},
                       index = x_train.columns)

In [47]:
Importance

The type of water reservoir (TR) is the most significant feature. 

In [48]:
Importance.sort_values(by="Importance",
                      axis=0,
                      ascending=True).plot(kind="barh",color="green")
plt.xlabel("Importance level of values")

In [None]:
# No cell in this notebook imports statsmodels, so `sm` raised NameError on a fresh
# kernel; the import is placed here, next to its only use.
import statsmodels.api as sm

# Linear surrogate: regress the tuned model's test-set predictions on the test
# features and show the OLS summary table.
model = sm.OLS(xgb_tuned.predict(x_test), x_test)
model.fit().summary()