In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import warnings
import sys
import os

# Make the project's ../src directory importable so the local data loader
# module can be found. The membership check keeps this cell idempotent:
# re-running it no longer appends a duplicate '../src' entry to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

# Pandas display options: show every column, cap printed rows at 100, and
# render floats with thousands separators and two decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, including deprecation and dtype warnings that may matter
# while loading data. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Plot appearance. The matplotlib style sheet is applied first; the seaborn
# palette and the rcParams overrides below are then set on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Confirm that setup finished and report the library versions in use.
print(
    " Libraries imported successfully!",
    f" Pandas version: {pd.__version__}",
    f" NumPy version: {np.__version__}",
    sep="\n",
)

 Libraries imported successfully!
 Pandas version: 2.3.3
 NumPy version: 2.3.5


In [31]:
# Cell 2: Load Data Using Custom Data Loader
# load_insurance_data comes from the project-local data_loader module, which
# is importable because the setup cell adds '../src' to sys.path.
from data_loader import load_insurance_data

print(" Loading insurance data...")

# Load the data (use sample_size=10000 for testing if dataset is large)
# Remove sample_size parameter for full dataset
try:
    # Try with full dataset first
    df = load_insurance_data(
        data_path='../data/raw/MachineLearningRating_v3.txt',
        clean=True,
        convert_dtypes=True
    )
    print(" Data loaded successfully!")

except Exception as e:
    # NOTE(review): this catches every exception type and then reads a
    # DIFFERENT file with a 10,000-row sample, so everything downstream may
    # run on sample data. The printed messages below are the only signal;
    # consider narrowing the exception type once the loader's failure modes
    # are confirmed.
    print(f"  Error loading full dataset: {e}")
    print(" Trying with sample for testing...")
    # Fallback to sample
    df = load_insurance_data(
        data_path='../data/raw/insurance_data.txt',
        sample_size=10000,
        clean=True,
        convert_dtypes=True
    )
    print(" Sample data loaded successfully!")

# Summary of the loaded frame. Bullets are real '•' characters (the previous
# text was mis-encoded). deep=True makes pandas measure the actual contents
# of object columns rather than just pointer sizes.
print("\n Dataset loaded:")
print(f"   • Rows: {df.shape[0]:,}")
print(f"   • Columns: {df.shape[1]:,}")
print(f"   • Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

 Loading insurance data...
📁 Loading data from: ../data/raw/MachineLearningRating_v3.txt
 Loaded full dataset: 1,000,098 rows, 52 columns

  BASIC DATA INFORMATION:
   Shape: (1000098, 52)
   Memory usage: 2373.96 MB
   Time period: 2013-10-01 00:00:00 to 2015-08-01 00:00:00
   Numeric columns: 15
   Categorical columns: 36

 Cleaning column names...
 Column names cleaned. Examples:
   UnderwrittenCoverID → underwrittencoverid
   PolicyID → policyid
   TransactionMonth → transactionmonth
   IsVATRegistered → isvatregistered
   Citizenship → citizenship

 Converting data types...
    transactionmonth converted to datetime
 Data type conversion complete
 Data loaded successfully!

 Dataset loaded:
   • Rows: 1,000,098
   • Columns: 52
   • Memory usage: 2309.1 MB
