In [None]:
# Enable autoreload so changes in src/ are reflected immediately
%load_ext autoreload
%autoreload 2

import sys
import os
# Add the src directory to system path
sys.path.append(os.path.abspath(os.path.join('../src')))

from data_loader import InsuranceDataLoader
from eda_analyzer import InsuranceEDA
from plotter import InsurancePlotter

#### Cell 2: Loading Data

# Location of the raw CSV, relative to the working directory.
# Adjust this if the notebook is launched from somewhere else.
RAW_DATA_PATH = '../data/raw/insurance_data.csv'

# Build the loader and read the data.
loader = InsuranceDataLoader(RAW_DATA_PATH)
df = loader.load_data()

# Let the loader optimise column types (dates, etc.) and keep its result.
df = loader.optimize_types()

# Preview the first rows (shown when this is the last expression of a cell).
df.head()

#### Cell 3: Data Understanding & Quality

# Wrap the loaded data in the EDA helper.
analyzer = InsuranceEDA(df)

# Report missing values. Using sep="\n" puts the result on its own line
# without the stray space that print's default separator would insert after
# the newline, so the first printed line stays aligned with the rest.
missing_stats = analyzer.check_missing_values()
print("Missing Values Summary:", missing_stats, sep="\n")

# Report the data summary, formatted the same way.
stats = analyzer.get_data_summary()
print("Descriptive Statistics:", stats, sep="\n")

#### Cell 4: Univariate Analysis (Histograms)

# Build the plotting helper around the loaded data.
plotter = InsurancePlotter(df)

# (column, title) pairs for the two univariate histograms.
premium_hist = ('TotalPremium', 'Distribution of Total Premium')
claims_hist = ('TotalClaims', 'Distribution of Total Claims')

# Premium distribution.
plotter.plot_histogram(*premium_hist, color='green')

# Claims distribution (noted as highly skewed; filtering to values > 0 may help).
plotter.plot_histogram(*claims_hist, color='red')

#### Cell 5: Bivariate Analysis & Creative Plots

# **Creative Plot 1: Premium vs Claims by ZipCode**
# Addresses the "Trends Over Geography" question.

# Aggregate the data by PostalCode (ZipCode) through the analyzer.
geo_df = analyzer.aggregate_by_geography('PostalCode')

# Keep only rows whose TotalPremium exceeds 1000, so zip codes with almost
# no data do not clutter the visualization.
has_enough_premium = geo_df['TotalPremium'] > 1000
geo_df = geo_df[has_enough_premium]

# Scatter of TotalPremium against TotalClaims, with TotalClaims as the hue column.
plotter.plot_scatter_geo_risk(geo_df, 'TotalPremium', 'TotalClaims', hue_col='TotalClaims')

# **Creative Plot 2: Outlier Detection (Box Plots)**
# Addresses the "Outlier Detection" question using Province and Premium.

box_title = 'Total Premium Distribution by Province'
plotter.plot_box('Province', 'TotalPremium', box_title)

# **Creative Plot 3: The Correlation Matrix**
# Shows how the selected numerical columns relate to one another.

# Numerical columns handed to the correlation heatmap.
cols_to_corr = [
    'TotalPremium',
    'TotalClaims',
    'CalculatedPremiumPerTerm',
    'SumInsured',
]
plotter.plot_correlation_heatmap(cols_to_corr)

### Workflow Summary

#1.  **Setup:** Create the project folders and `requirements.txt`.
#2.  **Source modules:** Create the `.py` files in `src/` (`data_loader.py`, `eda_analyzer.py`, `plotter.py`).
#3.  **Git:** Run `git init`, then `git checkout -b task-1`.
#4.  **Install:** Run `pip install -r requirements.txt`.
#5.  **Analyze:** Create the notebook and run the analysis code.
#6.  **Push:** Push to GitHub. The Action in `.github/workflows` will run automatically and verify your setup.

# This modular structure is intended to satisfy the "Modular and object-oriented Python code writing" KPI.