In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols

In [60]:
# LOAD DATA
data = pd.read_csv("penguins.csv")
data

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [79]:
# CLEANING THE VARIABLES
clean_data = data.dropna()
clean_data

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,6,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [89]:
data2 = clean_data[["species","bill_length_mm","body_mass_g","sex"]]

In [90]:
data2

Unnamed: 0,species,bill_length_mm,body_mass_g,sex
0,Adelie,39.1,3750.0,male
1,Adelie,39.5,3800.0,female
2,Adelie,40.3,3250.0,female
4,Adelie,36.7,3450.0,female
5,Adelie,39.3,3650.0,male
...,...,...,...,...
339,Chinstrap,55.8,4000.0,male
340,Chinstrap,43.5,3400.0,female
341,Chinstrap,49.6,3775.0,male
342,Chinstrap,50.8,4100.0,male


In [91]:
data2.head(200)

Unnamed: 0,species,bill_length_mm,body_mass_g,sex
0,Adelie,39.1,3750.0,male
1,Adelie,39.5,3800.0,female
2,Adelie,40.3,3250.0,female
4,Adelie,36.7,3450.0,female
5,Adelie,39.3,3650.0,male
...,...,...,...,...
202,Gentoo,46.6,4850.0,female
203,Gentoo,48.5,5300.0,male
204,Gentoo,45.1,4400.0,female
205,Gentoo,50.1,5000.0,male


In [92]:
# SPLITTING THE DATA INTO TRAINING AND TESTING (HOLD OUT)
data_x = data[["bill_length_mm","sex","species"]]
data_y = data[["body_mass_g"]]

In [93]:
# TRAINING TEST DATA
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.3,random_state=42)
ols_formula = "body_mass_g ~ bill_length_mm + C(sex) + C(species)"
ols_data = pd.concat([x_train,y_train],axis=1)
OLS = ols(formula =ols_formula, data = ols_data)
model = OLS.fit()
model.summary()

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.866
Model:,OLS,Adj. R-squared:,0.863
Method:,Least Squares,F-statistic:,367.0
Date:,"Tue, 27 Aug 2024",Prob (F-statistic):,4.48e-98
Time:,18:23:57,Log-Likelihood:,-1663.1
No. Observations:,233,AIC:,3336.0
Df Residuals:,228,BIC:,3353.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1968.1372,322.608,6.101,0.000,1332.462,2603.812
C(sex)[T.male],552.3007,52.714,10.477,0.000,448.432,656.169
C(species)[T.Chinstrap],-288.1452,102.475,-2.812,0.005,-490.065,-86.225
C(species)[T.Gentoo],1068.7972,85.792,12.458,0.000,899.751,1237.844
bill_length_mm,36.9026,8.658,4.262,0.000,19.843,53.962

0,1,2,3
Omnibus:,1.676,Durbin-Watson:,1.775
Prob(Omnibus):,0.433,Jarque-Bera (JB):,1.55
Skew:,0.092,Prob(JB):,0.461
Kurtosis:,2.646,Cond. No.,756.0


In [None]:
# INTERPRETATION OF THE RESULTS
# Dependent Variable: body_mass_g (the body mass in grams)
# Independent Variables: bill_length_mm (bill length in millimeters), C(sex) (categorical variable for sex), C(species) (categorical variable for species)
# R-squared: 0.866: This indicates that approximately 86.6% of the variance in body_mass_g can be explained by the independent variables in the model.
# This high R-squared value suggests that the model fits the data well.
# Adjusted R-squared: 0.863: This value adjusts the R-squared for the number of predictors in the model and the sample size.
# It is slightly lower than the R-squared, which is typical when adding more variables.
# F-statistic: 367.0, Prob (F-statistic): 4.48e-98: The F-statistic tests whether at least one of the coefficients is different from zero.
# The very low p-value (close to zero) indicates that the overall model is statistically significant.
# Coefficients:
#1. Intercept: 1968.1372
# This is the estimated body mass (in grams) when all other variables are zero (baseline).
# The intercept is significant with a p-value of 0.000, suggesting that it is a meaningful component of the model.
#2. C(sex)[T.male]: 552.3007
# Being male is associated with an increase in body mass of approximately 552 grams, compared to the baseline (female). 
# The p-value is 0.000, indicating that this effect is statistically significant.
#3.C(species)[T.Chinstrap]: -288.1452
# Belonging to the Chinstrap species is associated with a decrease in body mass of approximately 288 grams, compared to the baseline species (likely Adelie, not shown).
# This effect is significant with a p-value of 0.005.
#4. C(species)[T.Gentoo]: 1068.7972
# Belonging to the Gentoo species is associated with an increase in body mass of approximately 1069 grams, compared to the baseline species.
# This effect is highly significant with a p-value of 0.000.
#5. bill_length_mm: 36.9026
# For every additional millimeter of bill length, the body mass increases by approximately 37 grams. The p-value is 0.000, indicating a statistically significant relationship.
## Summary:
# The regression model shows that bill_length_mm, sex, and species are all significant predictors of body_mass_g. 
# Males tend to have a higher body mass than females, and the Gentoo species have a significantly higher body mass compared to other species.
# The model fits the data well, as indicated by a high R-squared value. Residuals appear to be normally distributed, and there is no significant evidence of autocorrelation. 
# However, the high condition number suggests checking for multicollinearity issues. Overall, the model provides valuable insights into the factors influencing body mass in this dataset.
# The model coefficients from the regression results are as follows:

#1. Intercept: 1968.1372
#2. C(sex)[T.male]: 552.3007
#3. C(species)[T.Chinstrap]: -288.1452
#4. C(species)[T.Gentoo]: 1068.7972
#5. bill_length_mm: 36.9026
