## Data Visualization of Penguin Dataset

In [1]:
# data manipulation libraries
import numpy as np
import pandas as pd
# show every column and allow wide lines so printed frames are not truncated or wrapped
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

### 1. Data Acquisition

In [2]:
# load the "penguins" example dataset through seaborn
dataset = sns.load_dataset("penguins")
# print the column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [3]:
# summary statistics of the numeric columns, one row per column,
# rounded to whole numbers
print(dataset.describe().T.round(0))

                   count    mean    std     min     25%     50%     75%     max
bill_length_mm     342.0    44.0    5.0    32.0    39.0    44.0    48.0    60.0
bill_depth_mm      342.0    17.0    2.0    13.0    16.0    17.0    19.0    22.0
flipper_length_mm  342.0   201.0   14.0   172.0   190.0   197.0   213.0   231.0
body_mass_g        342.0  4202.0  802.0  2700.0  3550.0  4050.0  4750.0  6300.0


In [4]:
# preview the last five rows of the dataset
print(dataset.tail(n=5))

    species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g     sex
339  Gentoo  Biscoe             NaN            NaN                NaN          NaN     NaN
340  Gentoo  Biscoe            46.8           14.3              215.0       4850.0  Female
341  Gentoo  Biscoe            50.4           15.7              222.0       5750.0    Male
342  Gentoo  Biscoe            45.2           14.8              212.0       5200.0  Female
343  Gentoo  Biscoe            49.9           16.1              213.0       5400.0    Male


### 2. Data Preprocessing

In [5]:
# count the missing values in each column
dataset.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
# drop every row that contains at least one missing value.
# The name is rebound to the filtered copy rather than mutating the frame
# with inplace=True, so the object shown by earlier cells is left untouched.
dataset = dataset.dropna()

In [7]:
# list the distinct values found in the "species" column
dataset["species"].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [8]:
# list the distinct values found in the "island" column
dataset["island"].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

### 3. Exploratory Data Analysis

In [9]:
# average of each body measurement per sex, rounded to two decimals;
# "sex" is moved back from the index into a regular column
df = (
    dataset
    .groupby("sex")[
        ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
    ]
    .mean()
    .round(2)
    .reset_index()
)
print(df)

      sex  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
0  Female           42.10          16.43             197.36      3862.27
1    Male           45.85          17.89             204.51      4545.68
