# Assignment

# 1. Imports

## 1.1 Packages

In [8]:
import pandas as pd
import pandera.pandas as pa

## 1.2 Options

In [9]:
# Relative path of the directory the raw dataset is read from.
DATA_PATH = "../data/01_raw"

In [10]:
# Let pandas display up to 150 columns before truncating the output.
pd.options.display.max_columns = 150

## 1.3 Dataset

In [11]:
# Load the dataset from the parquet file located under DATA_PATH.
df = pd.read_parquet(f"{DATA_PATH}/fremotor1prem0304.parquet")

In [12]:
# Preview the first 5 rows of the dataset.
df.head(5)

Unnamed: 0,IDpol,Year,DrivAge,DrivGender,MaritalStatus,BonusMalus,LicenceNb,PayFreq,JobCode,VehAge,VehClass,VehPower,VehGas,VehUsage,Garage,Area,Region,Channel,Marketing,PremWindscreen,PremDamAll,PremFire,PremAcc1,PremAcc2,PremLegal,PremTPLM,PremTPLV,PremServ,PremTheft,PremTot,IDpol_corrected
0,1000111.1,2003.0,44.0,F,Cohabiting,50.0,3.0,Half-yearly,Private employee,10.0,Cheaper,P10,Regular,Private+trip to office,Closed zbox,A2,Headquarters,A,M1,15.0,0.0,0.0,0.0,0.0,6.0,69.1,4.0,50.0,0.0,144.1,1000111.1
1,1000113.1,2003.0,26.0,F,Cohabiting,85.0,2.0,Annual,Other,8.0,Cheapest,P8,Regular,Private+trip to office,Opened collective parking,A7,Headquarters,A,M2,16.0,0.0,0.0,0.0,0.0,7.0,139.3,7.0,46.0,0.0,215.3,1000113.1
3,1000113.1,2003.0,27.0,F,Cohabiting,106.0,2.0,Half-yearly,Other,6.0,Cheaper,P11,Regular,Private+trip to office,Opened collective parking,A7,Headquarters,A,M2,60.0,0.0,0.0,0.0,0.0,17.0,453.6,24.0,57.0,0.0,611.6,1000113.1
4,1000173.1,2003.0,52.0,M,Cohabiting,50.0,2.0,Half-yearly,Private employee,2.0,Cheaper,P11,Regular,Private+trip to office,Closed zbox,A7,Headquarters,A,M1,15.0,125.0,5.0,0.0,47.0,9.0,111.2,5.0,48.0,50.0,415.2,1000173.1
5,1000173.101,2003.0,52.0,M,Cohabiting,50.0,2.0,Half-yearly,Private employee,1.0,Cheap,P13,Regular,Private+trip to office,Closed collective parking,A7,Headquarters,A,M3,24.0,167.0,8.0,0.0,0.0,11.0,132.8,7.0,48.0,90.0,487.8,1000173.101


In [13]:
# Count how often each PayFreq value occurs; dropna=False also counts missing values.
df["PayFreq"].value_counts(dropna=False)

PayFreq
Half-yearly    15456
Annual          9551
Quarterly       2068
Monthly          733
Name: count, dtype: int64

# 2. Validate the dataframe

**Goal**: The goal of this section is to validate the dataframe we will use to train our model.

We want to validate the dataframe before training our model. Your goal is to make sure the columns satisfy the following rules:
* **Year**: Check that the year is between 2003 and 2004 (both included).
* **DrivAge**: Make sure the driver's age is plausible (e.g. between 18 and 100).
* **DrivGender**: The gender is either 'M' or 'F'.
* **MaritalStatus**: Possible values "Cohabiting", "Married", "Single", "Widowed" or "Divorced".
* **BonusMalus**: The value of the bonus / malus is at least 50.
* **LicenceNb**: The number of licences is at least 1.
* **JobCode**: The possible values are "Private employee", "Public employee", "Retiree", "Other", "Craftsman", "Farmer" or "Retailer".
* **VehAge**: Make sure the vehicle age is plausible (e.g. not negative).
* **VehGas**: Either "Regular" or "Diesel".
* **Area**: Possible values are from A1 to A12 included.

In [14]:
# Starting schema for the exercise: each listed column must hold strings and,
# except for IDpol, only values taken from a fixed list of allowed categories.
schema_df = pa.DataFrameSchema(
    columns={
        # Policy identifier: only the dtype is checked.
        "IDpol": pa.Column(str),
        # Payment frequency of the premium.
        "PayFreq": pa.Column(
            str,
            checks=pa.Check.isin(["Annual", "Half-yearly", "Quarterly", "Monthly"]),
        ),
        # Vehicle price class.
        "VehClass": pa.Column(
            str,
            checks=pa.Check.isin([
                "Cheapest",
                "Cheaper",
                "Cheap",
                "Medium low",
                "Medium",
                "Medium high",
                "Expensive",
                "More expensive",
                "Most expensive",
            ]),
        ),
        # Vehicle power class, "P1" through "P20".
        "VehPower": pa.Column(
            str,
            checks=pa.Check.isin([f"P{level}" for level in range(1, 21)]),
        ),
        # Declared usage of the vehicle.
        "VehUsage": pa.Column(
            str,
            checks=pa.Check.isin([
                "Private+trip to office",
                "Professional",
                "Professional run",
            ]),
        ),
        # Where the vehicle is parked.
        "Garage": pa.Column(
            str,
            checks=pa.Check.isin([
                "Closed zbox",
                "Closed collective parking",
                "Opened collective parking",
                "Street",
            ]),
        ),
        ######################
        ### YOUR CODE HERE ###
        ######################
    },
)

# Validate the raw dataframe against the schema.
df_validated = schema_df.validate(df)

However, as you can see, some features contain NaN values. To handle this issue, you have several solutions:
* In the pandera dataframe schema, set the option that allows NaN values to True. We don't recommend this approach, as many models can't handle NaN values and you would have to make sure they are handled properly downstream.
* You can set the option in the pandera schema to drop rows with NaN values.
* You can handle it with classical feature engineering techniques (imputation, creating a new category, etc.).

In [None]:
# Solution
def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing categorical values with each column's mode.

    Missing entries in the 'JobCode' and 'MaritalStatus' columns are replaced
    with the most frequent non-missing value of their respective column. The
    input DataFrame is left untouched; the filling is done on a copy.

    Args:
        df (pd.DataFrame): Input DataFrame with potential missing values. It
            must contain the 'JobCode' and 'MaritalStatus' columns.

    Returns:
        pd.DataFrame: A copy of ``df`` with the missing values filled. A column
        that contains only missing values is returned unchanged, because it
        has no mode to fill with.

    Raises:
        KeyError: If 'JobCode' or 'MaritalStatus' is not a column of ``df``.
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    filled = df.copy()
    for column in ("JobCode", "MaritalStatus"):
        modes = filled[column].mode()
        # mode() ignores missing values, so it is empty for an all-missing
        # (or empty) column; indexing it would raise.
        if modes.empty:
            continue
        # On ties mode() returns the candidates sorted; keep the first one.
        filled[column] = filled[column].fillna(modes.iloc[0])
    return filled

In [24]:
# Solution
# Full validation schema: the categorical columns provided with the exercise
# plus the rules listed in the section's goal.
schema_df = pa.DataFrameSchema({
    "IDpol": pa.Column(str),
    "PayFreq": pa.Column(str, checks=pa.Check.isin(["Annual", "Half-yearly", "Quarterly", "Monthly"])),
    "VehClass": pa.Column(str, checks=pa.Check.isin([
        "Cheapest", "Cheaper", "Cheap", "Medium low", "Medium", "Medium high", "Expensive", "More expensive", "Most expensive",
    ])),
    "VehPower": pa.Column(str, checks=pa.Check.isin([f"P{i}" for i in range(1, 21)])),
    "VehUsage": pa.Column(str, checks=pa.Check.isin([
        "Private+trip to office", "Professional", "Professional run",
    ])),
    "Garage": pa.Column(str, checks=pa.Check.isin([
        "Closed zbox", "Closed collective parking", "Opened collective parking", "Street",
    ])),
    ######################
    ### YOUR CODE HERE ###
    ######################
    # The numeric columns are displayed as floats (e.g. 2003.0) in the preview,
    # so coerce=True casts them to int before the checks run.
    # in_range includes both bounds by default.
    "Year": pa.Column(int, checks=pa.Check.in_range(2003, 2004), coerce=True),
    "DrivAge": pa.Column(int, checks=pa.Check.in_range(18, 100), coerce=True),
    "DrivGender": pa.Column(str, checks=pa.Check.isin(["M", "F"])),
    "MaritalStatus": pa.Column(str, checks=pa.Check.isin([
        "Cohabiting", "Married", "Single", "Widowed", "Divorced",
    ])),
    "BonusMalus": pa.Column(int, checks=pa.Check.ge(50), coerce=True),
    "LicenceNb": pa.Column(int, checks=pa.Check.ge(1), coerce=True),
    # Only the job codes listed in the rules are accepted. Missing values are
    # filled with the column's mode beforehand, so no extra placeholder
    # category is needed here.
    "JobCode": pa.Column(str, checks=pa.Check.isin([
        "Private employee", "Public employee", "Retiree", "Other", "Craftsman", "Farmer", "Retailer",
    ])),
    # A vehicle age cannot be negative.
    "VehAge": pa.Column(int, checks=pa.Check.ge(0), coerce=True),
    "VehGas": pa.Column(str, checks=pa.Check.isin(["Regular", "Diesel"])),
    # Areas "A1" through "A12" included.
    "Area": pa.Column(str, checks=pa.Check.isin([f"A{i}" for i in range(1, 13)])),
})

# Fill the missing 'JobCode' / 'MaritalStatus' values first: the schema does
# not allow missing values in its columns.
df_filled = fill_missing_values(df)

df_validated = schema_df.validate(df_filled)