In [1]:
import pandas as pd
import seaborn as sns

In [5]:
#load the "penguins" example dataset through seaborn
p_df = sns.load_dataset("penguins")
#show the raw frame; some rows in the preview have no measurements at all
p_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [10]:
#list the distinct island names that occur in the data
p_df["island"].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [21]:
#clean data: discard rows with any missing value, then rows that are exact duplicates
p_df = p_df.dropna().drop_duplicates()

In [29]:
#X-Y split: two length measurements are the features, body mass is the target
#(both selections use a list of columns, so X and y are DataFrames)
X = p_df.loc[:, ["flipper_length_mm", "bill_length_mm"]]
y = p_df.loc[:, ["body_mass_g"]]

In [30]:
#import the helper used to split the data into train and test sets
from sklearn.model_selection import train_test_split

In [87]:
#hold out part of the rows for testing; test_size is left at scikit-learn's default,
#and random_state pins the shuffle so the same split comes back on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [88]:
#inspect the held-out feature rows (they keep their original row labels from p_df)
X_test

Unnamed: 0,flipper_length_mm,bill_length_mm
30,178.0,39.5
317,222.0,46.9
79,195.0,42.1
201,198.0,49.8
63,192.0,41.1
...,...,...
247,215.0,47.8
122,176.0,40.2
146,190.0,39.2
182,187.0,40.9


In [89]:
#inspect the training targets
y_train


Unnamed: 0,body_mass_g
321,5600.0
265,4900.0
36,3950.0
308,4875.0
191,4500.0
...,...
194,3550.0
77,3900.0
112,3200.0
277,5000.0


In [90]:
#inspect the held-out targets; the row labels match those of X_test
y_test

Unnamed: 0,body_mass_g
30,3250.0
317,4875.0
79,4000.0
201,3675.0
63,4050.0
...,...
247,5650.0
122,3450.0
146,4250.0
182,3200.0


In [91]:
#model training
from sklearn.linear_model import LinearRegression

In [92]:
#create an unfitted linear regression estimator with its default settings
lm = LinearRegression()

In [93]:
#fit on the training rows only, so the test rows stay unseen until evaluation
model = lm.fit(X_train, y_train)

In [94]:
#these are the fitted slopes, one per feature, in the column order of X_train:
#[flipper_length_mm, bill_length_mm]
model.coef_
#how to read the first value: a flipper that is 1 mm longer raises the predicted
#body mass by about 50 g, with bill length held fixed
#(body mass is the target here, so the statement cannot be turned around)

array([[50.09782099,  3.88008833]])

In [95]:
#this is the intercept: the predicted body mass when both features are 0
model.intercept_

array([-6032.65146541])

In [96]:
#predict mass of penguin
#pick one row from the test set; random_state pins the draw so that a fresh
#run of the notebook shows (and predicts for) the same penguin
random_penguin = X_test.sample(n=1, random_state=42)
random_penguin

Unnamed: 0,flipper_length_mm,bill_length_mm
61,195.0,41.3


In [97]:
#prediction for random penguin's weight
#the result is a 1x1 array because the model was fit on a one-column DataFrame target
model.predict(random_penguin)

array([[3896.67127496]])

In [98]:
#create predictions for every row of the test set

y_pred = model.predict(X_test)

In [99]:
#wrap the prediction array in a DataFrame (one column, labelled 0) and show it
y_pred = pd.DataFrame(data=y_pred)
y_pred

Unnamed: 0,0
0,3038.024159
1,5271.040936
2,3899.775346
3,4079.945489
4,3745.601794
...,...
79,4923.848269
80,2940.544579
81,3638.033985
82,3494.336672


In [100]:
#replace the original row labels with 0..n-1 so that y_test lines up with y_pred,
#which got a fresh 0..n-1 index when it was turned into a DataFrame
y_test = y_test.reset_index(drop=True)

In [101]:
#check the result: same values as before, now labelled from 0
y_test

Unnamed: 0,body_mass_g
0,3250.0
1,4875.0
2,4000.0
3,3675.0
4,4050.0
...,...
79,5650.0
80,3450.0
81,4250.0
82,3200.0


In [102]:
#put predictions and actual values side by side; rows are matched on the index
residuals_df = pd.concat(objs=[y_pred, y_test], axis="columns")

In [107]:
#inspect the combined frame: column 0 holds the predictions, body_mass_g the actual values
residuals_df

Unnamed: 0,0,body_mass_g
0,3038.024159,3250.0
1,5271.040936,4875.0
2,3899.775346,4000.0
3,4079.945489,3675.0
4,3745.601794,4050.0
...,...,...
79,4923.848269,5650.0
80,2940.544579,3450.0
81,3638.033985,4250.0
82,3494.336672,3200.0


In [110]:
#give both columns descriptive names
residuals_df = residuals_df.rename(
    columns={0: "y_pred", "body_mass_g": "y_test"},
)

In [111]:
#signed error per penguin: predicted minus actual, so a negative value means
#the model predicted too low
residuals_df["residuals"] = residuals_df["y_pred"].sub(residuals_df["y_test"])

In [112]:
#inspect predictions, actual values and their difference together
residuals_df

Unnamed: 0,y_pred,y_test,residuals
0,3038.024159,3250.0,-211.975841
1,5271.040936,4875.0,396.040936
2,3899.775346,4000.0,-100.224654
3,4079.945489,3675.0,404.945489
4,3745.601794,4050.0,-304.398206
...,...,...,...
79,4923.848269,5650.0,-726.151731
80,2940.544579,3450.0,-509.455421
81,3638.033985,4250.0,-611.966015
82,3494.336672,3200.0,294.336672


In [113]:
#calculating metrics
#mean of the signed errors: misses in opposite directions cancel out, so a value
#near zero says the model is not biased high or low, not that it is accurate
mean_error = residuals_df["residuals"].mean()

In [114]:
#show the mean signed error
mean_error

-3.9859515779792543

In [115]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae

In [116]:
#we want to see how close the predictions are to the real values
#mean squared error: the average of the squared prediction errors
mse(y_test, y_pred)

136331.24766934908

In [117]:
#mean absolute error: the average size of the prediction errors, ignoring their sign
mae(y_test, y_pred)

297.40331732775303

In [118]:
#root mean squared error
#taking the square root brings the squared error back to the units of body mass;
#large misses weigh more here than in the mean absolute error
import numpy as np
np.sqrt(mse(y_test, y_pred))

369.23061583426295