In [None]:
import pandas as pd

# Column names used throughout the notebook: the label and the two predictors.
target_column = "Species"
culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]

# Read the dataset and keep only the Adelie and Chinstrap classes.
# Selecting through the index with .loc returns the rows in the order of the
# requested labels; reset_index() then turns the species back into a column.
penguins = (
    pd.read_csv("penguins_classification.csv")
    .set_index("Species")
    .loc[["Adelie", "Chinstrap"]]
    .reset_index()
)

We can quickly start by visualizing the feature distribution by class:

In [None]:
import matplotlib.pyplot as plt

# One figure per culmen measurement, with one histogram per species overlaid.
for feature_name in culmen_columns:
    plt.figure()
    # Pick the current feature within each species group; the partial
    # transparency keeps overlapping bars visible.
    feature_by_species = penguins.groupby("Species")[feature_name]
    feature_by_species.plot.hist(alpha=0.5, legend=True)
    plt.xlabel(feature_name)

We can observe that we have quite a simple problem. When the culmen
length increases, the probability that the penguin is a Chinstrap is closer
to 1. However, the culmen depth is not helpful for predicting the penguin
species.

For model fitting, we will separate the target from the data and
we will create a training and a testing set.

In [None]:
from sklearn.model_selection import train_test_split

# Split the rows once (fixed seed for reproducibility), then derive the feature
# matrices and the label vectors from the same two partitions.
penguins_train, penguins_test = train_test_split(penguins, random_state=2)

data_train, data_test = (
    penguins_train[culmen_columns],
    penguins_test[culmen_columns],
)
target_train, target_test = (
    penguins_train[target_column],
    penguins_test[target_column],
)

In [None]:
# Preview the first five training labels.
target_train.head()

In [None]:
# Preview the first five rows of the training features.
data_train.head()


The linear regression that we previously saw will predict a continuous
output. When the target is a binary outcome, one can use the logistic
function to model the probability. This model is known as logistic
regression.

Scikit-learn provides the class `LogisticRegression` which implements this
algorithm.

In [None]:
import sklearn
# Ask scikit-learn to render estimators as diagrams when they are displayed.
sklearn.set_config(display="diagram")

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize the two culmen features, then fit an unregularized logistic
# regression. `penalty=None` is the supported way to disable regularization:
# the legacy string "none" was deprecated in scikit-learn 1.2 and is rejected
# by scikit-learn 1.4 and later.
logistic_regression = make_pipeline(
    StandardScaler(), LogisticRegression(penalty=None)
)
# fit() trains the pipeline in place, so its return value is not kept.
logistic_regression.fit(data_train, target_train)

# Mean accuracy on the held-out test partition.
accuracy = logistic_regression.score(data_test, target_test)
print(f"Accuracy on test set: {accuracy:.3f}")

In [None]:
# The last pipeline step is the fitted classifier. Its coef_ attribute is a
# 2-D array, so take the first row to get one coefficient per feature.
final_estimator = logistic_regression[-1]
coefs = final_estimator.coef_[0]

# Label each coefficient with the feature it belongs to.
weights = pd.Series(data=coefs, index=culmen_columns)

In [None]:
# Show the raw coefficient row taken from the fitted classifier.
coefs

In [None]:
# Show the same coefficients as a Series indexed by the culmen column names.
weights

In [None]:
# Show the full 2-D coefficient array of the last pipeline step (the classifier).
logistic_regression[-1].coef_

In [None]:
# Show the intercept of the last pipeline step (the classifier).
logistic_regression[-1].intercept_

In [None]:
# First five training rows; the same rows are passed to the prediction calls below.
data_train.head()

In [None]:
# True labels of the first five training rows, for comparison with the predictions.
target_train.head()

In [None]:
# Class labels known to the fitted pipeline.
# NOTE(review): the columns of predict_proba are expected to follow this
# order; confirm against the scikit-learn documentation.
logistic_regression.classes_

In [None]:
# Per-class probabilities for the first five training rows.
logistic_regression.predict_proba(data_train.head())

In [None]:
# Predicted class labels for the first five training rows.
logistic_regression.predict(data_train.head())

In [None]:
import pickle
# Keep an in-memory serialized copy (a bytes object) of the fitted pipeline.
midterm_model = pickle.dumps(logistic_regression)

# Persist the fitted pipeline to disk. The context manager closes the file
# even if pickling raises, so the handle is never left open.
with open('final2024_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression, model_file)


#### 채점은 이렇게 (채점용 데이터 이용)

In [None]:
import pickle

# Load the pipeline stored in 'final2024_model.pkl'. The context manager
# closes the file handle as soon as loading finishes, so it is not leaked.
# NOTE: pickle.load can execute arbitrary code; only unpickle files that come
# from a trusted source.
with open('final2024_model.pkl', 'rb') as model_file:
    restord_model = pickle.load(model_file)

In [None]:
# Score the reloaded pipeline on the test partition and print the raw value.
accuracy = restord_model.score(X=data_test, y=target_test)
print(accuracy)

In [None]:
# Per-class probabilities from the reloaded pipeline for the first five training rows.
restord_model.predict_proba(data_train.head())