# Penguins classification

#### Authors: Marková, Pěstová, Pronevich, Sokol



Importing all needed packages.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

# Single random seed reused by the data splits, the estimators and the
# hyper-parameter searches below so that results are reproducible.
seed = 234

## Data reading and understanding

In [None]:
# Prefer the local copy of the raw data; fall back to the copy hosted on
# GitHub when the local file cannot be opened (e.g. the notebook is run
# outside the repository layout).
path_to_data = "../data/raw.csv"

try:
    df_penguins = pd.read_csv(path_to_data)
except OSError:
    # Only I/O failures (missing or unreadable file) trigger the fallback.
    # Parse errors and interrupts are no longer silently swallowed.
    df_penguins = pd.read_csv('https://raw.githubusercontent.com/hmarkova/DataX_2023/main/data/raw.csv')


In [None]:
# Display the raw frame as loaded (rendered by the notebook as a table).
df_penguins

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df_penguins.describe(include="all") 

In [None]:
# Inspect the dtypes pandas inferred when reading the CSV.
df_penguins.dtypes

As can be seen, some columns have the wrong data types.

In [None]:
# Let pandas pick the best nullable dtype for every column first, then mark
# the discrete columns as categorical in a single astype call.
categorical_columns = ["species", "island", "sex", "year"]
df_penguins = df_penguins.convert_dtypes().astype(
    {column: "category" for column in categorical_columns}
)
df_penguins.dtypes

There are also some missing values and duplicated rows; all of these rows will be dropped from the dataset.

After dropping these rows, the new dataset has 377 rows.

In [None]:
# Report the data-quality problems before removing them.
n_duplicates = df_penguins.duplicated().sum()
missing_per_column = df_penguins.isna().sum()

print("Number of dupliated rows: ", n_duplicates)
print("\nNumber of missing values in each column: ")
print(missing_per_column)

# Remove exact duplicate rows first, then every row with a missing value.
df_penguins = df_penguins.drop_duplicates().dropna()

## Data visualization

Distribution of the target variable Species.

In [None]:
# Number of penguins per species, drawn as a simple bar chart.
species = df_penguins["species"].value_counts()
plt.bar(x=species.index, height=species.values, align="center")
plt.show()

Distribution of the other categorical variables (island and sex).

In [None]:
# Category counts for the two remaining categorical variables.
island = df_penguins["island"].value_counts()
sex = df_penguins["sex"].value_counts()

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# One bar chart per variable, side by side; the loop replaces two
# copy-pasted plotting stanzas.
for ax, counts, title in zip(axs, (island, sex), ("Island", "Sex")):
    ax.bar(counts.index, counts.values, align="center")
    ax.set_title(title)
    ax.set_ylabel("Count")

# Render the figure explicitly, as the other plotting cells do, so the cell
# does not echo the repr of the last matplotlib call.
plt.show()

Proportion of species on each island

In [None]:
# Count penguins for every (island, species) pair and pivot the species
# level into columns so each island becomes one stacked bar.
grouped = (
    df_penguins
    .groupby(["island", "species"])
    .size()
    .unstack(fill_value=0)
)
grouped.plot.bar(stacked=True)

Histograms and scatter plots of the continuous variables, all colored by species.

In [None]:
# Pairwise scatter plots with histograms on the diagonal, colored by species.
sns.pairplot(
    data=df_penguins,
    hue="species",
    diag_kind="hist",
    height=3,
)
plt.show()

We also made a correlation matrix. At first glance we can see a very strong positive correlation between the variables flipper_length_mm and body_mass_g.

In [None]:
# Pearson correlation is only defined for numeric columns. Select them
# explicitly instead of relying on DataFrame.corr() to skip the categorical
# columns, which newer pandas versions no longer do silently.
corr = df_penguins.select_dtypes(include="number").corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

Getting dummies

In [None]:
# One-hot encode the categorical predictors. The first level of each
# variable is dropped so the dummy columns are not perfectly collinear.
df_penguins = pd.get_dummies(
    data=df_penguins,
    columns=["island", "sex", "year"],
    drop_first=True,
)

Saving processed data

In [None]:
# Persist the cleaned, one-hot encoded frame (without the index column).
# NOTE(review): assumes the ../data directory already exists — confirm when
# the raw data was loaded from the remote fallback.
df_penguins.to_csv('../data/processed.csv', index=False)

## Splitting dataset into training (70%), validation (15%) and test (15%) set.

In [None]:
# Separate predictors from the target before splitting.
features = df_penguins.drop(columns='species')
target = df_penguins['species']

# First carve off 15% of the rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features,
    target,
    test_size=0.15,
    random_state=seed,
)
# ... then take 0.1765 of the remaining 85% (about 15% of all rows) as the
# validation set, leaving roughly 70% for training.
# NOTE(review): neither split is stratified on the species label — consider
# passing `stratify=` if class balance across the splits matters.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1765,
    random_state=seed,
)

Normalizing the continuous data. The normalization scale was fitted on the training data only to avoid data leakage.

In [None]:
# Continuous measurements that are rescaled to the [0, 1] range.
numeric_columns = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]

# Learn the min/max from the training split only, then apply that same
# transformation to every split.
norm = MinMaxScaler()
norm.fit(X_train[numeric_columns])
for split in (X_train, X_test, X_val):
    split[numeric_columns] = norm.transform(split[numeric_columns])


Saving interim data

In [None]:
# Stack the three scaled feature splits (train, test, validation — in that
# order) into one frame and persist it.
# pd.concat replaces DataFrame.append, which no longer exists in current
# pandas. It also replaces the outer merges, which join on every shared
# column rather than concatenating rows, so identical rows in two splits
# could be collapsed.
df_interim = pd.concat([X_train, X_test, X_val], ignore_index=True)
df_interim.to_csv('../data/interim.csv', index=False)

## Modeling

We are going to use 3 models - Random Forest, Logistic Regression and Gradient Boosting.


In [None]:
# Untuned base estimators; all share the notebook-wide seed so that their
# stochastic parts are reproducible.
shared_params = {"random_state": seed}

lr_clf = LogisticRegression(**shared_params)
rf_clf = RandomForestClassifier(**shared_params)
gb_clf = GradientBoostingClassifier(**shared_params)

For each model, we will tune its hyperparameters based on Bayesian Optimization. Also we use 5-fold Cross Validation.

In [None]:
# --- Search spaces -----------------------------------------------------------

# Logistic regression: intercept on/off, inverse regularisation strength and
# penalty type.
# NOTE(review): newer scikit-learn releases expect `None` rather than the
# string 'none' for an unpenalised model — confirm against the installed version.
lr_param_grid = dict(
    fit_intercept=Categorical([True, False]),
    C=Real(0.001, 1000),
    penalty=Categorical(['l2', 'none']),
)

# Random forest: ensemble size, split criterion and tree-shape constraints.
# max_features ranges from 3 up to the number of predictor columns.
rf_param_grid = dict(
    n_estimators=Integer(1, 1000),
    criterion=Categorical(['gini', 'entropy']),
    max_depth=Integer(1, 15),
    max_features=Integer(3, X_train.shape[1]),
    min_samples_leaf=Integer(5, 500),
)

# Gradient boosting: ensemble size, tree-shape constraints and learning rate.
# NOTE(review): the learning-rate range reaches 100, which looks unusually
# large for boosting — confirm this upper bound is intended.
gb_param_grid = dict(
    n_estimators=Integer(1, 1000),
    max_depth=Integer(1, 15),
    learning_rate=Real(0.001, 100),
    min_samples_leaf=Integer(5, 500),
    max_features=Integer(3, X_train.shape[1]),
)


# --- Search objects ----------------------------------------------------------

def _bayes_search(estimator, search_spaces):
    """Build a BayesSearchCV with the settings shared by all three models.

    Every search runs 100 optimisation iterations with 5-fold cross
    validation, uses the notebook-wide seed and all available cores.
    """
    return BayesSearchCV(
        estimator=estimator,
        search_spaces=search_spaces,
        n_iter=100,
        cv=5,
        random_state=seed,
        n_jobs=-1,
    )


lr_search = _bayes_search(lr_clf, lr_param_grid)
rf_search = _bayes_search(rf_clf, rf_param_grid)
gb_search = _bayes_search(gb_clf, gb_param_grid)

Fitting the models

In [None]:
# Run the three hyper-parameter searches on the training split only; the
# validation and test splits are not passed to the searches.
lr_search.fit(X_train, y_train)
rf_search.fit(X_train, y_train)
gb_search.fit(X_train, y_train)


Now we have the models with the best parameters. In the next step, the best model of each type is printed together with its best hyperparameters and its score.

In [None]:
# Report, per model family, the best estimator found by the search and its
# best cross-validated score.
searches = {
    'Logistic Regression': lr_search,
    'Random Forest': rf_search,
    'Gradient Boosting': gb_search,
}

for label, search in searches.items():
    print(f'{label}:')
    print('Best model:', search.best_estimator_)
    print('Best score:', search.best_score_)

Finally, each of the best models is evaluated on the validation data and its accuracy score is printed.

The best models are Logistic Regression and Gradient Boosting, which gave us 100% accuracy on the validation dataset.

In [None]:
# Tuned estimators returned by the searches, keyed by display name.
best_models = {
    'Logistic Regression': lr_search.best_estimator_,
    'Random Forest': rf_search.best_estimator_,
    'Gradient Boosting': gb_search.best_estimator_
}

# Score each tuned estimator on the held-out validation split.
# The estimators are deliberately NOT refitted here:
#   * X_train_val was never passed through the scaler, so refitting on it and
#     scoring on a scaled split would mix two different feature scales;
#   * the dict values are the very objects stored in `*_search.best_estimator_`,
#     so refitting would silently change them for every later cell and leak
#     the validation rows into the fitted models.
for name, model in best_models.items():
    score = model.score(X_val, y_val)
    print(f'{name}: {score}')

# Evaluation of logistic regression

As can be seen in the confusion matrix below, our Logistic Regression model indeed correctly distinguishes between all of the species.

In [None]:
# Confusion matrix of the tuned logistic regression on the validation split.
lr_best = lr_search.best_estimator_

# Take the class order from the fitted model and pass it to confusion_matrix
# explicitly. Row and column captions then always match the matrix layout,
# even if a species is missing from the validation split, instead of relying
# on a hard-coded 0/1/2 -> species mapping.
class_labels = lr_best.classes_
confm = pd.DataFrame(
    confusion_matrix(y_val, lr_best.predict(X_val), labels=class_labels),
    index=[f'Actual - {label}' for label in class_labels],
    columns=[f'Predicted - {label}' for label in class_labels],
)

sns.heatmap(confm, annot = True, cmap ='Wistia', fmt = 'g')
plt.show()