# Assignment 1
Authors:
- Igor Ordecha 251601 
- Adrian Urbańczyk 252960

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations, chain

In [None]:
# Column labels for the data file, which is read without a header row.
# NOTE(review): the names look like the UCI Abalone schema — confirm against data.csv.
column_names = [
    'sex',
    # linear measurements
    'length', 'diameter', 'height',
    # weight measurements
    'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight',
    'rings',
]

# header=None makes pandas treat the first line as data; the labels come from column_names.
df = pd.read_csv('data.csv', names=column_names, header=None)

## 1. Table of the distribution of the qualitative variable

In [None]:
# Frequency table of the qualitative variable: one row per sex with its
# absolute count and its share of all rows in percent (2 decimal places).
sex_dict = {'M': 'Male', 'F': 'Female', 'I': 'Infant'}

# groupby sorts by the raw code, so rows stay ordered by 'F', 'I', 'M'.
sexes = df.groupby('sex').size().to_frame(name='count')
sexes['%'] = sexes['count'].div(sexes['count'].sum()).mul(100).round(2)

# Replace the one-letter codes with readable labels and drop the index title
# so the rendered table has no header above the row labels.
sexes.index = sexes.index.map(sex_dict)
sexes.index.name = None
sexes

## 2. Table with summary statistics for the quantitative variables

In [None]:
# Summary statistics of the numeric columns: one row per variable,
# one column per statistic (the 'count' column of describe() is left out).
stat_columns = ["mean", "std", "min", "25%", "50%", "75%", "max"]
table = df.describe().T.loc[:, stat_columns]

# Capitalise the variable names used as row labels and keep the index untitled.
table.index = table.index.str.capitalize()
table.index.name = None
table


## 3. Bar chart of the counts of occurrences of each category for the qualitative variable

In [None]:
# Bar chart of the per-category counts; the row labels of `sexes`
# are the category names shown on the x axis.
category_labels = sexes.index.tolist()
plt.bar(category_labels, sexes["count"])
plt.show()

## 4. Histogram of each quantitative variable

In [None]:
# One 10-bin histogram per quantitative variable on a 4x2 grid.
# Each entry is (column, bar colour, x-axis label); panels are filled
# row by row in this order.
histogram_specs = [
    ('length', 'blue', 'Length Distribution'),
    ('diameter', 'green', 'Diameter Distribution'),
    ('height', 'red', 'Height Distribution'),
    ('whole_weight', 'purple', 'Whole Weight Distribution'),
    ('shucked_weight', 'orange', 'Shucked Weight Distribution'),
    ('viscera_weight', 'brown', 'Viscera Weight Distribution'),
    ('shell_weight', 'pink', 'Shell Weight Distribution'),
    ('rings', 'gray', 'Rings Distribution'),
]

fig, axs = plt.subplots(4, 2, figsize=(12, 16))

# axs.flat walks the grid in row-major order: [0, 0], [0, 1], [1, 0], ...
for panel, (column, colour, label) in zip(axs.flat, histogram_specs):
    panel.hist(df[column], bins=10, color=colour, edgecolor='white')
    panel.set_xlabel(label)

plt.show()

## 5. Scatter plot for each pair of the quantitative variables

In [None]:
# Scatter plot for every unordered pair of numeric columns, two panels per row.
# The grid is sized from the number of pairs, so no pair is silently dropped
# when the set of numeric columns changes.
numeric_columns = df.select_dtypes(include=[np.number]).columns
# Materialised as a list so the pairs can be counted before plotting.
pairs = list(combinations(numeric_columns, 2))

n_cols = 2
# Ceiling division; at least one row is kept because plt.subplots needs a
# positive row count.
n_rows = max(1, -(-len(pairs) // n_cols))

# 3 inches of height per row (e.g. 28 pairs -> 14 rows -> 42 inches).
# squeeze=False keeps axs two-dimensional even for a single row.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows), squeeze=False)
flat_axs = list(axs.flat)

for (col1, col2), ax in zip(pairs, flat_axs):
    ax.scatter(df[col1], df[col2], s=5)
    ax.set_xlabel(col1)
    ax.set_ylabel(col2)
    # Caption below the panel, positioned in axes coordinates.
    ax.text(0.5, -0.3, f'{col1} vs {col2}', transform=ax.transAxes, ha='center')

# Hide panels left over when the number of pairs does not fill the grid.
for ax in flat_axs[len(pairs):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Table representing a linear correlation matrix of all quantitative variables

In [None]:
# Pairwise linear (Pearson) correlation of the numeric columns only;
# the non-numeric 'sex' column is excluded by the dtype filter.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr(method='pearson')
corr

## 7. Heatmap representing a linear correlation matrix of all quantitative variables

In [None]:
# Annotated heatmap of the correlation matrix `corr`, values shown with
# two decimals.
# NOTE(review): the colormap is centred on 0.5; a correlation scale is
# usually centred on 0 — confirm this is intentional.
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0.5)

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr, **heatmap_options)
heatmap_ax.set_title('Correlation Matrix Heatmap')
heatmap_ax.figure.tight_layout()
plt.show()

## 8. Linear regression plot with the two quantitative variables that are most strongly linearly correlated

In [None]:
# Scatter of length against diameter with a fitted regression line.
# NOTE(review): the pair is hard-coded; confirm it is the largest
# off-diagonal entry of the correlation matrix for this data.
reg_ax = sns.regplot(
    data=df,
    x='diameter',
    y='length',
    scatter_kws={'s': 10},
    line_kws={'color': 'orange'},
)
reg_ax.set(
    title='Linear Regression: Diameter vs Length',
    xlabel='Diameter',
    ylabel='Length',
)
plt.show()

## Tasks for grade 5

## Table with summary statistics for the quantitative variables, grouped by sex

In [None]:
# Summary statistics of every numeric column, computed separately per sex.
# The result is indexed by (Feature, Sex) with one column per statistic.
stat_columns = ["mean", "std", "min", "25%", "50%", "75%", "max"]

# Work on a relabelled copy so df keeps its one-letter sex codes.
labelled = df.assign(sex=df["sex"].map(sex_dict))
per_sex = labelled.groupby('sex').describe()

# describe() yields (feature, statistic) columns; move the feature level into
# the index, put it first and sort so rows are grouped by feature.
summary = per_sex.stack(level=0, future_stack=True)
summary = summary.swaplevel().sort_index()
summary.index.names = ['Feature', 'Sex']

summary[stat_columns]

## Boxplot of each quantitative variable, grouped by sex

In [None]:
# One boxplot panel per numeric column on a 4x2 grid, each split by 'sex'.
numeric_cols = df.select_dtypes(include=[np.number]).columns
fig, axs = plt.subplots(4, 2, figsize=(12, 16))

# axs.ravel() walks the grid row by row.
for ax, col in zip(axs.ravel(), numeric_cols):
    df.boxplot(column=col, by='sex', ax=ax)
    # Replace the title pandas adds with a caption below the panel.
    ax.set(title='', xlabel='Sex', ylabel=col)
    ax.text(0.5, -0.2, f'{col} by sex', transform=ax.transAxes, ha='center')

# Clear the figure-level title added by the grouped boxplot calls.
fig.suptitle('')
fig.tight_layout()
plt.show()