In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error

In [None]:
# Read the penguin measurements from the CSV stored next to the notebook.
data = pd.read_csv(filepath_or_buffer='Palmer_Archipelago_Penguin_Data_size.csv')
# Independent snapshot of the frame exactly as it was loaded.
datac = data.copy(deep=True)
# Preview the first five rows.
data.head(n=5)


In [None]:
# Summary statistics of the numeric columns, one row per column.
data.describe().transpose()


In [None]:
# Print pandas' structural summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# For every text (object-dtype) column, print how often each distinct value occurs.
object_columns = data.select_dtypes(include='object').columns
for column_name in object_columns:
    counts = data[column_name].value_counts()
    print(counts)
    print('\n')
    

In [6]:
# A '.' entry in 'sex' is treated as a placeholder for an unknown value: turn it into
# a real missing value so the later dropna() removes that row.
# Series.mask is used instead of replace('.', None) because pandas < 1.4 ignored an
# explicit None and forward-filled the previous row's value instead of blanking it.
data['sex'] = data['sex'].mask(data['sex'] == '.')

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row and report it as a plain int.
duplicate_count = data.duplicated().sum().item()
print(f'duplicated values: {duplicate_count}')

In [None]:
# Summary statistics for every column (numeric and text), one row per column.
data.describe(include='all').transpose()


## Clean

In [10]:
# Keep only the rows in which every column is populated.
# The explicit .copy() makes the result an independent frame, so the column
# assignments performed later during encoding write to it directly rather than to a
# possible slice of `data` (which pandas reports as SettingWithCopyWarning).
filtered_data = data.dropna().copy()

In [None]:
# Confirm that no missing values remain after dropping incomplete rows.
remaining_nulls = filtered_data.isna().sum()
print(remaining_nulls)

# Transposed view of the cleaned frame (one column per row of the data).
filtered_data.transpose()

In [14]:
# Write the cleaned frame to 'filtered_data.csv' in the working directory, omitting the index.
filtered_data.to_csv('filtered_data.csv', index=False)

In [None]:
# List the distinct labels remaining in each categorical column.
for column_name in ('species', 'island', 'sex'):
    print(filtered_data[column_name].unique())

# Data analysis

The categorical variables:

- species	
- island
- sex

The numeric variables:

- culmen_length_mm	
- culmen_depth_mm	
- flipper_length_mm	
- body_mass_g

In [57]:
# Column groups used by the plotting, encoding and correlation cells.
categorical_vars = [
    'species',
    'island',
    'sex',
]
numerical_vars = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

In [58]:
def plotHisto(data, variable, group='species'):
    """Overlay one semi-transparent histogram of `variable` per distinct value of `group`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the measured column and the grouping column.
    variable : str
        Column whose distribution is drawn.
    group : str, default 'species'
        Column whose distinct values each get their own histogram and legend entry.
    """
    plt.figure()
    group_values = data[group]
    # Iterate in order of first appearance so legend order follows the data.
    for level in group_values.unique():
        in_level = group_values == level
        plt.hist(data.loc[in_level, variable], bins=100, alpha=0.5, label=level)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title(f"Penguin - {variable}")
    plt.legend()
    plt.show()

def plotBar(data, variable, n=5):
    """Show a count plot of the `n` most frequent values of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot.
    variable : str
        Column whose value frequencies are drawn.
    n : int, default 5
        Maximum number of distinct values shown, most frequent first.
    """
    # value_counts() sorts by descending frequency, so the first n labels are the top n.
    top_values = list(data[variable].value_counts().index[:n])

    plt.figure()
    # Passing `palette` without `hue` is deprecated in seaborn 0.13; map the plotted
    # variable to `hue` and hide the redundant legend, as boxPlotMethod already does.
    # `hue_order` mirrors `order` so each bar keeps the colour of its position.
    sns.countplot(
        data=data,
        x=variable,
        hue=variable,
        order=top_values,
        hue_order=top_values,
        palette="Set2",
        legend=False,
    )
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("Penguin - {}" .format(variable))
    plt.xticks(rotation=45)
    plt.show()

def showData(data, variable):
    """Print the pandas `describe()` summary of a single column of `data`."""
    summary = data[variable].describe()
    print(summary)
    
    
def boxPlotMethod(data, variable, group='species'):
    """Draw one box of `variable` per level of `group`, coloured by level, and show it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    variable : str
        Numeric column placed on the y axis.
    group : str, default 'species'
        Categorical column placed on the x axis and used for colouring.
    """
    sns.boxplot(
        data=data,
        x=group,
        y=variable,
        hue=group,
        palette="Set2",
        legend=False,
    )
    plt.title('Boxplot of {} by {}'.format(variable, group))
    plt.show()


In [None]:
# Text summary followed by a frequency bar chart for each categorical column.
for column_name in categorical_vars:
    showData(filtered_data, column_name)
    plotBar(filtered_data, column_name)


In [None]:
# Pairwise scatter plots of the numeric columns, coloured by species, with histograms
# on the diagonal. `height` is the current name of the keyword seaborn formerly called
# `size` (renamed in seaborn 0.9; the old spelling is deprecated).
sns.pairplot(filtered_data, hue="species", height=3, diag_kind="hist", palette="Set2")

In [None]:
# One box plot per numeric measurement, split by species (boxPlotMethod's default group).
for column_name in numerical_vars:
    boxPlotMethod(filtered_data, column_name)

# Preprocessing

In [None]:
# Pairwise correlation between the numeric measurements.
numeric_frame = filtered_data[numerical_vars]
correlation_matrix = numeric_frame.corr()
print(correlation_matrix)

# Same matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='crest', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


### Encoding

In [None]:
# Replace each categorical column with integer codes and remember the label -> code
# mapping so new observations can be encoded the same way later.
# NOTE: run once per kernel session. Re-running on already encoded columns would
# rebuild the mappings from the integer codes and lose the original labels.

# Work on an independent frame so the column assignments below never target a slice.
filtered_data = filtered_data.copy()

# One fitted encoder per column. A single shared instance would only remember the
# classes of the last column it was fitted on.
encoders = {}
dict_vars_encoded = {}

for var in categorical_vars:
    encoder = LabelEncoder()
    filtered_data[var] = encoder.fit_transform(filtered_data[var])
    encoders[var] = encoder
    # LabelEncoder numbers classes_ in order starting at 0, so position == code.
    # Plain Python ints keep the printed mapping readable.
    dict_vars_encoded[var] = {label: int(code) for code, label in enumerate(encoder.classes_)}

print(dict_vars_encoded)
filtered_data.T

In [None]:
# Convenience aliases for the per-column label -> code mappings.
dict_species, dict_island, dict_sex = (
    dict_vars_encoded[column_name] for column_name in ('species', 'island', 'sex')
)

for mapping in (dict_species, dict_island, dict_sex):
    print(mapping)

### Divide input & output


In [65]:
# Target: the encoded species column. Features: every other column.
y = filtered_data['species']
X = filtered_data.drop(columns='species')


### Scaling

In [66]:
# Rescale every feature with a MinMaxScaler; `scaler` is kept so later cells can
# apply the same transformation to other rows.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows, so test rows influence the min/max used for scaling. Fitting on the
# training split only would require performing the split first.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)


### Split Train Test

In [67]:

# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Display the scaled training feature matrix.
X_train

# MODEL

In [69]:
# Candidate estimators, each fitted in its own cell below on the label-encoded
# `species` target.
# NOTE(review): model_1 is a classifier while all the others are regressors, so their
# `score()` values are not the same metric (see the comparison table).

# Fixed seed for every estimator whose training involves randomness, so that a
# Restart & Run All reproduces the same scores.
MODEL_SEED = 42

model_1 = LogisticRegression()
model_2 = SGDRegressor(random_state=MODEL_SEED)
model_3 = LinearRegression()
model_4 = Ridge()
model_5 = ElasticNet()
model_6 = SVR()
model_7 = KNeighborsRegressor(n_neighbors=3)
model_8 = DecisionTreeRegressor(random_state=MODEL_SEED)
model_9 = RandomForestRegressor(random_state=MODEL_SEED)
model_10 = BaggingRegressor(random_state=MODEL_SEED)
model_11 = ExtraTreesRegressor(random_state=MODEL_SEED)
model_12 = AdaBoostRegressor(random_state=MODEL_SEED)
# The gradient-boosting libraries get the same explicit seed so the choice is visible.
model_13 = XGBRegressor(random_state=MODEL_SEED)
model_14 = CatBoostRegressor(random_state=MODEL_SEED)
model_15 = LGBMRegressor(random_state=MODEL_SEED)

### FIT

In [None]:
# Train model_1 (LogisticRegression) on the training split.
model_1.fit(X_train,y_train)


In [None]:
# Train model_2 (SGDRegressor) on the training split.
model_2.fit(X_train,y_train)


In [None]:
# Train model_3 (LinearRegression) on the training split.
model_3.fit(X_train,y_train)


In [None]:
# Train model_4 (Ridge) on the training split.
model_4.fit(X_train,y_train)


In [None]:
# Train model_5 (ElasticNet) on the training split.
model_5.fit(X_train,y_train)


In [None]:
# Train model_6 (SVR) on the training split.
model_6.fit(X_train,y_train)


In [None]:
# Train model_7 (KNeighborsRegressor, 3 neighbours) on the training split.
model_7.fit(X_train,y_train)    


In [None]:
# Train model_8 (DecisionTreeRegressor) on the training split.
model_8.fit(X_train,y_train)


In [None]:
# Train model_9 (RandomForestRegressor) on the training split.
model_9.fit(X_train,y_train)


In [None]:
# Train model_10 (BaggingRegressor) on the training split.
model_10.fit(X_train,y_train)


In [None]:
# Train model_11 (ExtraTreesRegressor) on the training split.
model_11.fit(X_train,y_train)


In [None]:
# Train model_12 (AdaBoostRegressor) on the training split.
model_12.fit(X_train,y_train)

In [None]:
# Train model_13 (XGBRegressor) on the training split.
model_13.fit(X_train,y_train)

In [None]:
# Train model_14 (CatBoostRegressor) on the training split.
model_14.fit(X_train,y_train)

In [None]:
# Train model_15 (LGBMRegressor) on the training split.
model_15.fit(X_train,y_train)

## Model Comparison

In [None]:
# Build one comparison table covering every fitted model.
# NOTE(review): `score()` is mean accuracy for the LogisticRegression classifier but
# R^2 for the regressors, so the score columns mix two different metrics.

# Display names and estimators, kept in the same order.
models_names = [
    'LogisticRegression', 'SGDRegressor', 'LinearRegression', 'Ridge', 'ElasticNet',
    'SVR', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor',
    'BaggingRegressor', 'ExtraTreesRegressor', 'AdaBoostRegressor', 'XGBRegressor',
    'CatBoostRegressor', 'LGBMRegressor',
]
models = [
    model_1, model_2, model_3, model_4, model_5,
    model_6, model_7, model_8, model_9, model_10,
    model_11, model_12, model_13, model_14, model_15,
]


def _as_percent(value):
    """Format a fraction as a percentage string with two decimals."""
    return f'{value * 100:.2f}%'


def _rate_fit(train, test):
    """Label a (train score, test score) pair; the first matching rule wins."""
    if train <= 0.65 and test <= 0.65:
        return 'bad'
    if train > test * 1.10:
        return 'overfite'
    if 0.65 < train < 0.80 and 0.65 < test < 0.80:
        return 'middle'
    # Scores of exactly 1.00 are excluded here and end up as 'unknown'.
    if 0.80 <= train < 1.00 and 0.80 <= test < 1.00:
        return 'good'
    if train >= 0.80 and test < 0.80:
        return 'high train, low test'
    return 'unknown'


# Mean squared error of each model on the test split, shown as a percentage string.
squared_errors = [
    _as_percent(mean_squared_error(y_test, estimator.predict(X_test)))
    for estimator in models
]

# Each estimator's own score() on both splits.
train_score = [estimator.score(X_train, y_train) for estimator in models]
test_score = [estimator.score(X_test, y_test) for estimator in models]

# Gap between training and test score.
ratio = [_as_percent(train - test) for train, test in zip(train_score, test_score)]

# Qualitative label per model.
rate = [_rate_fit(train, test) for train, test in zip(train_score, test_score)]

model_score = pd.DataFrame({
    'Model': models_names,
    'Train score': [f'{round(score * 100, 2)}%' for score in train_score],
    'Test score': [f'{round(score * 100, 2)}%' for score in test_score],
    'Ratio difference': ratio,
    'Evaluate model': rate,
    'Squared error': squared_errors
})

model_score

## **Answer**:

Overall, the following models:
- KNeighborsRegressor
- RandomForestRegressor
- ExtraTreesRegressor
- CatBoostRegressor

show the best performance, with high scores and low squared errors.

In [None]:

# Persist four of the fitted models as joblib pickles under ./models.
models_dir = './models'
os.makedirs(models_dir, exist_ok=True)  # no error if the folder already exists

models_to_save = (
    ('LogisticRegression', model_1),
    ('KNeighborsRegressor', model_7),
    ('RandomForestRegressor', model_9),
    ('ExtraTreesRegressor', model_11),
)
for file_stem, fitted_model in models_to_save:
    joblib.dump(fitted_model, f'{models_dir}/{file_stem}.pkl')

print("Models saved successfully.")

## Pipeline creation

# Test new data

In [None]:
# Reload the persisted estimators from ./models.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created yourself.
logistic_regression_model = joblib.load('./models/LogisticRegression.pkl')
knn_model = joblib.load('./models/KNeighborsRegressor.pkl')
random_forest_model = joblib.load('./models/RandomForestRegressor.pkl')
extra_trees_model = joblib.load('./models/ExtraTreesRegressor.pkl')

# Three hand-written observations, with columns in the same order as the feature matrix.
new_data = pd.DataFrame({
    'island': ['Biscoe', 'Torgersen', 'Biscoe'],
    'culmen_length_mm': [42.1, 39.0, 41.1],
    'culmen_depth_mm': [19.1, 21.3, 18.2],
    'flipper_length_mm': [195.0, 190.0, 192.0],
    'body_mass_g': [3000, 3700, 2500],
    'sex': ['MALE', 'FEMALE', 'MALE']
})

# Encode the categorical features with the mappings learned during preprocessing;
# 'species' is the target and is not part of the new data.
for var in categorical_vars:
    if var != 'species':
        new_data[var] = new_data[var].map(dict_vars_encoded[var])

# Series.map produces NaN for any label missing from the mapping. Stop here with a
# clear message instead of passing NaN on to the scaler and the models.
unmapped = new_data.isna().any()
assert not unmapped.any(), (
    f"Unknown category or missing value in column(s): {list(unmapped[unmapped].index)}"
)

# Scale the new data using the already fitted scaler.
new_data_scaled = scaler.transform(new_data)

# Predict with each reloaded model.
logistic_regression_prediction = logistic_regression_model.predict(new_data_scaled)
knn_prediction = knn_model.predict(new_data_scaled)
random_forest_prediction = random_forest_model.predict(new_data_scaled)
extra_trees_prediction = extra_trees_model.predict(new_data_scaled)

# The outputs are encoded species values (see dict_vars_encoded['species']).
print("LogisticRegression Predictions:", logistic_regression_prediction)
print("KNeighborsRegressor Predictions:", knn_prediction)
print("RandomForestRegressor Predictions:", random_forest_prediction)
print("ExtraTreesRegressor Predictions:", extra_trees_prediction)

## Test Existing Data

In [None]:
# Draw ten rows from the cleaned, encoded data with a fixed seed.
# NOTE(review): the sample comes from the same frame the models were trained on, so
# it can include training rows and is not an independent test.
random_sample = filtered_data.sample(n=10, random_state=1)
print("Random Sample:")
print(random_sample)

# Split the sample into target and features.
random_sample_y = random_sample['species']
random_sample_X = random_sample.drop(columns='species')

# Apply the scaler fitted during preprocessing.
random_sample_scaled = scaler.transform(random_sample_X)

# Predict with each reloaded model.
(
    logistic_regression_prediction,
    knn_prediction,
    random_forest_prediction,
    extra_trees_prediction,
) = (
    fitted_model.predict(random_sample_scaled)
    for fitted_model in (
        logistic_regression_model,
        knn_model,
        random_forest_model,
        extra_trees_model,
    )
)

# Report the predictions, then the true encoded species for comparison.
report_lines = (
    ("LogisticRegression Prediction:", logistic_regression_prediction),
    ("KNeighborsRegressor Prediction:", knn_prediction),
    ("RandomForestRegressor Prediction:", random_forest_prediction),
    ("ExtraTreesRegressor Prediction:", extra_trees_prediction),
)
for caption, predicted in report_lines:
    print(caption, predicted)

print("Actual Species:", random_sample_y.values)