In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [13]:
# load the penguins workbook into a DataFrame and preview its first rows
df = pd.read_excel(io='input/palmer_penguins.xlsx')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [14]:
# discard every row that has at least one missing value (mutates df in place),
# printing the frame's shape before and after so the number of dropped rows is visible
print(df.shape)
df.dropna(axis=0, how='any', inplace=True)
print(df.shape)

(344, 8)
(333, 8)


In [15]:
# recode the target variable's values as integers: female -> 0, male -> 1
# `map` is used instead of `replace`: replacing strings with ints on an object
# column relies on pandas silently downcasting the result, which pandas 2.2+
# deprecates (FutureWarning); without that downcast the column stays object dtype.
# `astype('int64')` pins the dtype and fails loudly if any value was not covered
# by the mapping (map yields NaN for it) instead of letting it through unnoticed.
df['sex_recoded'] = df['sex'].map({'female': 0, 'male': 1}).astype('int64')
print(df.sex_recoded.unique())

[1 0]


In [16]:
# separate the target (y) from the predictors (X), then hold out 40% of the rows for testing
y = df['sex_recoded']
X = df[['bill_length_mm', 'flipper_length_mm']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=123,  # fixed seed so the split is the same on every run
)

In [17]:
# create a logistic-regression classifier with default settings and fit it on the training split
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [18]:
# predicted class label for every row of the held-out test split
y_pred = logreg.predict(X=X_test)
print(y_pred)

[1 0 1 1 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1
 0 0 1 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0
 0 0 0 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 1 0 0 1 1 0 0
 1 0 0 1 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 1 0 1]


In [19]:
# predicted probabilities for the test data; keep only column index 1 of predict_proba's output
# (presumably the class encoded as 1, i.e. 'male' - confirm against logreg.classes_)
y_pred_probs = logreg.predict_proba(X=X_test)[:, 1]
print(y_pred_probs)

[0.53696623 0.49904979 0.70831769 0.60111777 0.63312428 0.25911212
 0.74755433 0.16281482 0.52072073 0.15620186 0.20418971 0.5322994
 0.54996995 0.58199268 0.26884024 0.77774306 0.25043486 0.67631546
 0.26604381 0.55978475 0.68839317 0.48970151 0.18138052 0.39507393
 0.64985774 0.35568214 0.51672924 0.27800481 0.35895997 0.35385221
 0.36107148 0.22793718 0.4465682  0.27680878 0.61012469 0.19311676
 0.90003408 0.40921199 0.28515993 0.90283695 0.20418971 0.62196808
 0.76642161 0.37011608 0.47302535 0.65723946 0.54996995 0.72755755
 0.49646621 0.16998138 0.73914007 0.62432154 0.42022965 0.19610573
 0.67787583 0.24318577 0.71426791 0.39128694 0.50341757 0.46936197
 0.24166017 0.40299478 0.52159677 0.63183698 0.35969611 0.28394522
 0.4186723  0.70732571 0.50680026 0.32176725 0.44541171 0.48831768
 0.6456758  0.29227123 0.41471016 0.19144514 0.29013814 0.20174197
 0.61502994 0.75653454 0.23516023 0.43863096 0.68717907 0.16341106
 0.23615775 0.3408835  0.54928452 0.68177198 0.21829883 0.33775