In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [None]:
# Fetch the example "penguins" table through seaborn's dataset loader and
# preview its first rows.
penguins = sns.load_dataset('penguins')

print("Sample of Penguins Dataset:", penguins.head(), sep="\n")


In [None]:
# Tally the null entries in every column of the raw frame before any cleaning.
missing_counts = penguins.isnull().sum()

print("Missing values in each column:")
print(missing_counts)


In [None]:
# Columns kept for the rest of the notebook: the four measurements plus the
# two categorical fields.
selected_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'species',
    'sex',
]

# Work on an independent copy so later assignments never touch `penguins`.
df = penguins.loc[:, selected_columns].copy()

print("Selected columns:", df.head(), sep="\n")


In [None]:
# Numeric measurement columns; this list is reused by the scaling step.
num_cols = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Replace missing measurements with the per-column mean learned from `df`.
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(df[num_cols])
df[num_cols] = imputer_num.transform(df[num_cols])

print("After handling missing numerical values:", df.head(), sep="\n")


In [None]:
# Categorical columns that are imputed here and encoded in the next step.
cat_cols = ['species', 'sex']

# Fill gaps in each categorical column with that column's most frequent value.
imputer_cat = SimpleImputer(strategy='most_frequent')
filled_categories = imputer_cat.fit_transform(df[cat_cols])
df[cat_cols] = filled_categories

print("After handling missing categorical values:", df.head(), sep="\n")


In [None]:
# One encoder per categorical column; both are kept as notebook-level names so
# the integer codes can be mapped back to the original labels later.
encoder_species, encoder_sex = LabelEncoder(), LabelEncoder()

# Overwrite each categorical column in `df` with its integer codes.
for column, encoder in (('species', encoder_species), ('sex', encoder_sex)):
    df[column] = encoder.fit_transform(df[column])

print("After encoding categorical values:", df.head(), sep="\n")


In [None]:
# Learn per-column mean and standard deviation on the numeric columns, then
# overwrite those columns in `df` with their standardized values.
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

print("After feature scaling:", df.head(), sep="\n")


In [None]:
# Histogram (20 bins) of the bill-length column as stored in `df` at this
# point, i.e. after the scaling step.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['bill_length_mm'], bins=20, edgecolor='black')
ax.set_title('Distribution of Bill Length (Standardized)')
ax.set_xlabel('Bill Length (Standardized)')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Scatter of standardized bill length against standardized flipper length,
# coloured by species.
#
# By this point `df['species']` holds the integer codes produced by
# `encoder_species`. Passing those codes straight to `hue` makes seaborn treat
# species as a numeric quantity (sequential colour ramp, legend entries 0/1/2),
# which hides which species is which. Decode the codes back to their labels on
# a plotting-only copy so the legend shows the species names and `df` itself
# stays encoded for any later processing.
plot_df = df.assign(species=encoder_species.inverse_transform(df['species']))

plt.figure(figsize=(8, 6))
sns.scatterplot(x='bill_length_mm', y='flipper_length_mm', hue='species', data=plot_df)
plt.title('Scatter Plot of Bill Length vs Flipper Length')
plt.xlabel('Bill Length (Standardized)')
plt.ylabel('Flipper Length (Standardized)')
plt.show()


In [None]:
# Pairwise correlation of every column currently in `df`.
# NOTE: `species` and `sex` are integer codes assigned by the label encoders,
# so their coefficients reflect that code assignment rather than any natural
# ordering of the categories; interpret those rows/columns with care.
correlation_matrix = df.corr()

print("Correlation Matrix:")
print(correlation_matrix)

plt.figure(figsize=(8, 6))
# Anchor the colour scale to the full correlation range [-1, 1] and use a
# diverging palette centred on 0. With the default data-driven sequential
# scale, negative and positive correlations cannot be told apart by colour and
# the same colour means different magnitudes from one run to the next.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
)
plt.title('Correlation Heatmap (Penguins Dataset)')
plt.show()
