In [1]:
# Kennedi Todd
# July 31, 2024
# Logistic regression, supervised learning
# Classify penguin based on variables
# https://scatterplotpress.teachable.com/p/ml-code-exercises

# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# read data
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')
df.head()


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4]:
# drop row with missing values
df.dropna(axis = 0,        # 0 = row, 1 = column
          how = 'any',     # drop rows if at least one value missing
          subset = None,   # use all columns
          inplace = True)  # update rather than replace

# one-hot encoding
df = pd.get_dummies(df, columns = ['island','sex']) # species is the dependent var
df.head()


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE
0,Adelie,39.1,18.7,181.0,3750.0,False,False,True,False,True
1,Adelie,39.5,17.4,186.0,3800.0,False,False,True,True,False
2,Adelie,40.3,18.0,195.0,3250.0,False,False,True,True,False
4,Adelie,36.7,19.3,193.0,3450.0,False,False,True,True,False
5,Adelie,39.3,20.6,190.0,3650.0,False,False,True,False,True


In [6]:
# assign vars
X = df.drop('species', axis = 1)
y = df['species']

# split into testing/training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True)

# assign algorithm
model = LogisticRegression()
model.fit(X_train, y_train)

# run to make prediction
model_test = model.predict(X_test)

# evaluate predictions
print(confusion_matrix(y_test, model_test))
print(classification_report(y_test, model_test))


[[47  0  0]
 [ 1 12  0]
 [ 0  0 40]]
              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        47
   Chinstrap       1.00      0.92      0.96        13
      Gentoo       1.00      1.00      1.00        40

    accuracy                           0.99       100
   macro avg       0.99      0.97      0.98       100
weighted avg       0.99      0.99      0.99       100



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# predict data point
penguin = [
    39,   # bill length
    18.5, # bill depth
    180,  # flipper length
    3750, # body mass
    0,    # island Biscoe
    0,    # island Dream
    1,    # island Torgersen
    1,    # male
    0,    # female
]
predict_penguin = model.predict([penguin])
predict_penguin




array(['Adelie'], dtype=object)