In [35]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  
import plotly.graph_objects as go  


In [36]:
df = pd.read_excel("Worksheet in Case Study question 2.xlsx")

df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [37]:
def check_df_info(df):
    total_rows = len(df)
    null_counts = df.isnull().sum()
    outlier_counts = []

    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            outliers = df[(df[col] < lower) | (df[col] > upper)][col]
            outlier_counts.append(len(outliers))
        else:
            outlier_counts.append(None)

    info_table = pd.DataFrame({
        'column': df.columns,
        'dtype': df.dtypes.values,
        'null_count': null_counts.values,
        'null_percent': (null_counts.values / total_rows * 100).round(2),
        'total_rows': total_rows,
        'outlier_count': outlier_counts
    })

    return info_table
    

In [38]:
summary = check_df_info(df)
summary

Unnamed: 0,column,dtype,null_count,null_percent,total_rows,outlier_count
0,months_as_customer,int64,0,0.0,1000,0.0
1,age,int64,0,0.0,1000,4.0
2,policy_number,int64,0,0.0,1000,0.0
3,policy_bind_date,datetime64[ns],0,0.0,1000,
4,policy_state,object,0,0.0,1000,
5,policy_csl,object,0,0.0,1000,
6,policy_deductable,int64,0,0.0,1000,0.0
7,policy_annual_premium,float64,0,0.0,1000,9.0
8,umbrella_limit,int64,0,0.0,1000,202.0
9,insured_zip,int64,0,0.0,1000,0.0


In [39]:
df[df["bodily_injuries"]==0]["injury_claim"].mean()

7350.735294117647

In [40]:
df[df["bodily_injuries"]==2]["injury_claim"].mean()

7918.704819277108

In [41]:
df[df["bodily_injuries"]==3]

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported


In [42]:
df[df["number_of_vehicles_involved"]==3]["vehicle_claim"].mean()

44083.37988826816

In [43]:
df.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [44]:
fig = px.histogram(df,x="total_claim_amount",color='number_of_vehicles_involved')
fig.show()

In [48]:
fig = px.histogram(df,x="total_claim_amount",color='fraud_reported')
fig.show()

In [45]:
df["total_claim_amount_234"]=df[df["number_of_vehicles_involved"]==1]["total_claim_amount"]

In [46]:
df

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,total_claim_amount_234
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,71610.0
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,5070.0
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,63400.0
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,6500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,38,941851,1991-07-16,OH,500/1000,1000,1310.80,0,431289,...,?,87200,17440,8720,61040,Honda,Accord,2006,N,87200.0
996,285,41,186934,2014-01-05,IL,100/300,1000,1436.79,0,608177,...,?,108480,18080,18080,72320,Volkswagen,Passat,2015,N,108480.0
997,130,34,918516,2003-02-17,OH,250/500,500,1383.49,3000000,442797,...,YES,67500,7500,7500,52500,Suburu,Impreza,1996,N,
998,458,62,533940,2011-11-18,IL,500/1000,2000,1356.92,5000000,441714,...,YES,46980,5220,5220,36540,Audi,A5,1998,N,46980.0
