In [None]:
# Importing necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# Display settings: remove the caps on printed rows and columns and leave the
# output width unset (None).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None

In [None]:
# Read the Palmer Penguins CSV straight from GitHub. `penguins` keeps the raw
# frame as loaded; `df` is an independent copy that the later cells modify.
url = 'https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/inst/extdata/penguins.csv'
penguins = pd.read_csv(filepath_or_buffer=url)
df = penguins.copy(deep=True)

# Data Cleaning and Exploratory Analysis

In [None]:
# Overview of the frame: 344 rows and 8 columns, with missing values in some
# columns. The 'year' column might be converted to a datetime type.
df.info()

In [None]:
# Parse 'year' as a four-digit year (to_datetime raises on values that do not
# match '%Y'), then keep only the year number.
# `.dt.year` returns int32 on pandas >= 2.0, so cast back to int64: otherwise
# the column's dtype changes silently and dtype filters such as
# select_dtypes(include=['float64', 'int64']) no longer pick it up.
# NOTE(review): assumes 'year' has no missing values (astype would raise on NaN).
df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year.astype('int64')

In [None]:
# Re-inspect the frame summary after the 'year' conversion.
df.info()

In [None]:
# Preview the first 20 rows.
df.head(n=20)

In [None]:
# Histogram grid for the frame's numeric columns: 20 bins each, on a
# 15x15-inch figure.
df.hist(bins=20, figsize=(15, 15))
plt.show()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Select every row that has at least one missing value and print it.
rows_with_missing = df.loc[df.isna().any(axis='columns')]
print(rows_with_missing)

In [None]:
# Drop the rows at index labels 3 and 271, then print the remaining
# per-column missing counts.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the labels are already gone from `df`.
df = df.drop(index=[3, 271], errors='ignore')
print(df.isnull().sum())

In [None]:
# Drop the rows that still contain a missing value (9 rows, per the original
# note), then show the per-column missing counts again.
df = df.dropna(axis='index', how='any')
df.isna().sum()

In [None]:
# NOTE(review): this cell is entirely commented-out scratch code. It refers to
# df3, UC, LC, Q1 and Q3, none of which are defined in the visible cells;
# define them (or remove the cell) before re-enabling it.
# ## Plot

# sns.distplot(df3.rate)
# plt.axvline(UC, color='r')
# plt.axvline(LC, color ='r')
# plt.axvline(Q1, color='g')
# plt.axvline(Q3, color='g')
# plt.show()

# Visualization

In [None]:
# Observation: Adelie is the most common species, and the numbers of male and
# female penguins are almost equal.
species_distribution = sns.countplot(x='species', hue='sex', data=df)
species_distribution.set(title='Species distribution by sex')
plt.show()

In [None]:
# Observation: flipper length and body mass look correlated, and Gentoo is the
# largest species.
flipper_length = sns.scatterplot(
    x='flipper_length_mm',
    y='body_mass_g',
    hue='species',
    data=df,
)
flipper_length.set(title='Flipper Length vs.Body Mass by species')
plt.show()

In [None]:
# Observation: most of the penguins live on Biscoe and Dream islands.

# Number of penguins recorded on each island
penguins_per_island = df['island'].value_counts()

# Pie chart of each island's share
plt.figure(figsize=(8, 6))
plt.pie(
    x=penguins_per_island,
    labels=penguins_per_island.index,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Distribution of Penguins on Each Island')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Observation: this confirms the correlation between flipper length and body
# mass; bill length and flipper length also look correlated.

# Keep only the numeric columns for the correlation calculation.
# include='number' matches every numeric dtype, whereas an explicit
# ['float64', 'int64'] list silently skips columns stored as e.g. int32
# (which is what `.dt.year` yields on pandas >= 2.0).
numeric_columns = df.select_dtypes(include='number')

# Pairwise correlation matrix, drawn as an annotated heatmap
correlation_matrix = numeric_columns.corr()
sns.set_theme()
cor_df = sns.heatmap(correlation_matrix, annot=True, linewidths=.5, cmap='Blues')
cor_df.set_title('Heatmap of Correlations')
plt.show()

In [None]:
# One box plot per numeric column, split by species, to look for outliers.
# Observation: the dataset does contain outliers.
for col in numeric_columns.columns:
    box_col = sns.boxplot(x=col, hue='species', data=df)
    box_col.set(title='checking for outliers in ' + col)
    plt.show()

In [None]:
# One histogram with a KDE curve per numeric column.
# Observation: the numeric columns do not look normally distributed.
for col in numeric_columns.columns:
    hist_plot = sns.histplot(x=col, kde=True, data=df)
    hist_plot.set(title='test ' + col)
    plt.show()

In [None]:
# A pair plot is a good way to view the pairwise relationships in the
# dataset; points are coloured by species.
sns.set_theme(style="ticks")
sns.pairplot(data=df, hue="species")
plt.show()

# Conclusion

Dataset Overview: The dataset comprises 344 rows and 8 columns.

1. Data Preparation:
- Missing Values: The dataset contains missing values, which were addressed by dropping the affected rows.
- Data Type Adjustment: The 'year' column was parsed as a date (format `%Y`) and stored back as an integer year; a full conversion to datetime format could still be considered for easier manipulation.
- Data Cleaning: Rows with index 3 and 271 were dropped first, followed by the 9 remaining rows with missing values.

2. Species Distribution:
- Adelie penguins are the predominant species in the dataset, with a nearly equal distribution between male and female individuals.
- Gentoo penguins are identified as the largest species.

3. Island Distribution:
- Biscoe and Dream islands harbor the majority of penguins, indicating these locations are the most populated habitats.

4. Correlation Analysis:
- A positive correlation is observed between flipper length and body mass.
- Potential correlations between bill length and flipper length are suggested for further exploration.

5. Outlier Detection:
- Outliers are noted within the dataset, warranting further investigation.