In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
%matplotlib inline
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [3]:
# The file is ';'-separated. Its first data row is used as the column names,
# so that row is promoted to the header and removed from the data.
df = pd.read_csv('../input/amphibians-data-set/dataset.csv', sep=';')
new_header = df.iloc[0]
df = df.iloc[1:]
df.columns = new_header

# EDA

In [3]:
df.head()

In [4]:
df.info()

In [5]:
df.describe()

In [6]:
# Report, for every column, how many of its values are missing.
total_rows = df.shape[0]
for feature, missing in df.isnull().sum().items():
    print(feature, missing, 'of', total_rows, 'values are missing')

We have no missing values in our dataset.

Numerical variables: ID, SR, NR. <br>
Categorical variables: all the rest, including OR, which is stored as a number but is treated as a category in the cells below. Some of them are ordinal.

In [91]:
numerical_variables = ['ID', 'SR', 'NR']
# Every other column is treated as categorical. The list is built by filtering
# df.columns so that it keeps the column order of the frame; a set difference
# would return the names in an arbitrary order, which changes the order of the
# plots that iterate over this list.
categorical_variables = [column for column in df.columns
                         if column not in numerical_variables]

In [8]:
# Print the frequency of every value in each of the three SUR columns.
category2 = ["SUR1", "SUR2", "SUR3"]
for column in category2:
    print(f"{df[column].value_counts()} \n")

In [5]:
# Independent copy of df, taken before the species columns are aggregated.
ds = df.copy()  

In [6]:
from plotnine import options

options.figure_size = (6, 2)

# One bar chart per categorical column, with the bars filled by 'Motorway'.
#
# The columns are deliberately NOT sorted in place: assigning
# `sorted(ds[i])` back to a single column reorders that column independently
# of all the others, so each value would be paired with the 'Motorway' of a
# different row and the fill colours would no longer describe real
# observations (and `ds` would be left corrupted for later cells).
for i in categorical_variables:
    print(
        ggplot(ds)
        + aes(i, fill='Motorway')
        + geom_bar(width=0.3)
        + ggtitle('')
    )

In [None]:
ds.columns

In [8]:
species = ['Green frogs', 'Brown frogs', 'Common toad', 'Fire-bellied toad',
           'Tree frog', 'Common newt', 'Great crested newt']
# Collapse the seven per-species columns into a single 'Species' column that
# holds their row-wise sum, then drop the individual columns.
# The built-in `int` is used instead of `np.int`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
df['Species'] = df[species].astype(int).sum(axis=1)
df = df.drop(species, axis=1)

In [9]:
# Numeric copy of df for the correlation matrix: the two motorway names become
# integer codes, every column is parsed as a number and ID is dropped.
motorway_codes = {'A1': 1, 'S52': 2}
data_copy = df.copy()
data_copy['Motorway'] = data_copy['Motorway'].replace(motorway_codes)
data_copy = data_copy.apply(pd.to_numeric).drop(columns=['ID'])

# Annotated heatmap of the pairwise correlations, colour scale fixed to [-1, 1].
plt.figure(figsize=(16, 6))
correlations = data_copy.corr()
ax = sns.heatmap(correlations, vmin=-1, vmax=1, annot=True)

- Highly positively correlated features:<br>NR and SR, FR and UR, RR and BR.
- Highly negatively correlated features:<br>UR and VR, FR and VR. 
- Species are weakly correlated with VR, SUR1, SUR2, RR, BR.

In [None]:
df.head()

In [10]:
# Copy of df in which the two motorway names are recoded as integers.
data = df.copy()
data['Motorway'] = data['Motorway'].replace({'A1': 1, 'S52': 2})

In [11]:
# Parse every column of `data` as a number, one column at a time.
data = data.apply(pd.to_numeric) 

In [12]:
# ID is left out of the outlier search that runs over all columns of `data`.
data = data.drop(columns=['ID'])

## Checking for outliers

In [13]:
def detect_outliers(df2, features, n=2, iqr_factor=1.5):
    """Return the index labels of rows that are outliers in several features.

    A value counts as an outlier for a feature when it lies more than
    ``iqr_factor`` times the inter-quartile range (IQR) below the first
    quartile or above the third quartile of that feature.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame whose columns listed in ``features`` are numeric.
    features : iterable
        Column labels of ``df2`` to inspect.
    n : int, default 2
        A row is reported when it is an outlier in MORE than ``n`` features.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to obtain the outlier step.

    Returns
    -------
    list
        Index labels of the reported rows, in order of first detection.
    """
    outlier_counts = Counter()

    for c in features:
        column = df2[c]
        # Series.quantile ignores missing values. A percentile computed over a
        # column containing NaN is itself NaN, which makes every comparison
        # below False and silently disables detection for that column.
        q1, q3 = column.quantile([0.25, 0.75])
        outlier_step = (q3 - q1) * iqr_factor

        is_outlier = (column < q1 - outlier_step) | (column > q3 + outlier_step)
        # Count, per index label, in how many features the row is an outlier.
        outlier_counts.update(df2.index[is_outlier.values])

    return [label for label, count in outlier_counts.items() if count > n]

In [35]:
df.loc[detect_outliers(data,data.columns)]

In [16]:
# Remove from df, in place, the rows that detect_outliers reports for `data`.
outlier_rows = detect_outliers(data, data.columns)
df.drop(index=outlier_rows, inplace=True)

## Labelling categorical data

In [62]:
# df2 is the copy whose category codes get replaced by text labels in the
# cells below.
df2 = df.copy()  # separate copy used for the label encoding

In [93]:
# Take the per-species columns and 'Motorway' out of the list of categorical
# columns. Filtering the list, instead of calling list.remove inside a bare
# `except`, cannot hide unrelated errors and gives the same result when the
# cell is run again. Slice assignment keeps the same list object.
not_categorical = set(species) | {'Motorway'}
categorical_variables[:] = [name for name in categorical_variables
                            if name not in not_categorical]

In [94]:
# Cast the categorical columns to int so that their values can match the
# integer keys of the label dictionaries applied in the next cell.
df2[categorical_variables] = df[categorical_variables].astype('int')

In [95]:
# Text label for every category code, keyed by column name.
# NOTE(review): '>10000' looks like a typo for a distance label such as
# '>1000 m'; it is left unchanged because the dummy-column names created later
# are derived from these strings -- confirm before renaming.
# NOTE(review): the 'remove' labels presumably mark codes whose rows were meant
# to be removed; no visible cell removes them -- confirm.
distance_labels = {0: '<50 m', 1: '50-100 m', 2: '100-200 m', 5: '200-500 m',
                   9: '500-1000 m', 10: '>10000'}
surroundings_labels = {1: 'forest areas', 2: 'meadows', 4: 'gardens',
                       6: 'industrial areas', 10: 'river valleys',
                       7: 'orchards', 9: 'roads', 14: 'agricultural'}
category_labels = {
    'TR': {1: 'natural reservoirs', 2: 'recently formed', 5: 'technological',
           7: 'garden', 11: 'trenches', 12: 'wet meadows', 14: 'river valleys',
           15: 'small watercourses'},
    'VR': {0: 'no vegetation', 1: 'patches at the edges', 2: 'heavily overgrown',
           3: 'some part devoid of vegetation',
           4: 'reservoirs completely overgrown'},
    'SUR1': surroundings_labels,
    'SUR2': surroundings_labels,
    'SUR3': surroundings_labels,
    'UR': {0: 'unused', 1: 'scenic', 3: 'technological'},
    'FR': {0: 'lack', 1: 'intense fishing', 2: 'breeding reservoirs',
           3: 'remove', 4: 'remove'},
    'OR': {25: 'poor access', 50: 'low access', 75: 'medium access',
           100: 'large access', 99: 'remove', 80: 'remove'},
    'RR': distance_labels,
    'BR': distance_labels,
    'MR': {0: 'Clean', 1: 'slightly littered', 2: 'heavily littered'},
    'CR': {1: 'Natural', 2: 'Concrete'},
}

# Assign the result back to the column. Calling
# `df2[column].replace(..., inplace=True)` modifies a temporary Series: pandas
# 2.x warns about this chained assignment and, with Copy-on-Write enabled,
# df2 is not updated at all.
for column, labels in category_labels.items():
    df2[column] = df2[column].replace(labels)

In [96]:
df2.head()

In [129]:
# ID, SR, NR and Species are kept as they are; the remaining columns go
# through pd.get_dummies, and both parts are joined side by side.
passthrough = ['ID', 'SR', 'NR', 'Species']
df3 = pd.get_dummies(df2.drop(columns=passthrough))
frames = [df2[passthrough], df3]
df_res = pd.concat(frames, axis=1)

In [130]:
df_res.head()

In [151]:
# Indicator columns: everything from 'TR_garden' through the last column.
binary_data = df_res.loc[:, 'TR_garden':]

# One bar chart per indicator column, with each bar's count written on top.
for col in binary_data.columns:
    count_labels = geom_text(
        aes(label=after_stat('count'), fill='Species'),
        stat='count',
        va='bottom',
    )
    print(ggplot(df_res, aes(col, fill='Species')) + geom_bar() + count_labels)

In [115]:
df_res.shape

In [107]:
numerical = df_res[["SR", "NR", "Species"]]
categorical = df_res.drop(["SR", "ID", "NR", "Species"], axis=1)

scaler = StandardScaler()
# fit_transform returns a bare array, so the index of df_res is passed back
# explicitly. Without it the new frame gets a fresh 0..n-1 index, and
# index-aligned operations such as pd.concat(..., axis=1) would pair the
# scaled values with the wrong rows and drop rows whose label has no match.
numerical = pd.DataFrame(
    scaler.fit_transform(numerical),
    index=numerical.index,
    columns=list(numerical.columns),
)

In [119]:
# Column-wise concatenation aligned on the index; join='inner' keeps only the
# index labels that are present in all three inputs.
df_new = pd.concat([df['ID'],numerical, categorical], axis=1, join='inner')

In [120]:
df_new.head()

In [125]:
# Indicator columns of df_new: everything from 'TR_garden' through the last
# column (relies on the column order of df_new).
binary_data=df_new.loc[:,'TR_garden':]

In [128]:
%%time
for i, col in enumerate(binary_data.columns):
    plt.figure(i,figsize=(6,4))
    sns.countplot(x=col, hue=df_new['Species'] ,data=df_new, palette="rainbow")
    plt.show()