##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [4]:
#%pip install pandas 
#%pip install matplotlib
#%pip install pyarrow
#%pip install seaborn
#%pip install scikit-learn
#%pip install matplotlib

# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

Collecting pyarrow
  Downloading pyarrow-15.0.0-cp39-cp39-win_amd64.whl (24.9 MB)
     --------------------------------------- 24.9/24.9 MB 17.2 MB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-15.0.0
Note: you may need to restart the kernel to use updated packages.


## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [4]:
# Can have as many cells as you want for code
import pandas as pd
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [1]:
###...code...###
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import f1_score
from datetime import date
from sklearn.ensemble import RandomForestClassifier


### 1. DATA PROCESSING
# read
df = pd.read_parquet('./data/catB_train.parquet')
hidden_data = df.copy(deep=True)

# Investigate target column and understand the data
# Since target column consists only of NaN or 1, convert NaNs to 0.
print(df["f_purchase_lh"].unique())

# Step 1: Clean f_purchase_lh
df['f_purchase_lh'] = df['f_purchase_lh'].fillna(0)

# Label encoder handles non-numerical data such as annual income. Since annual income is in the format: C.60K-100K,
# Convert String labels such as C.60K-100K to labels like 0,1,2,3,4 so that the model can parse the data
label_encoder = LabelEncoder()

## Labelling Client Type
df['clttype'] = label_encoder.fit_transform(df['clttype'])

## Labelling annual income categories
df['annual_income_est']= label_encoder.fit_transform(df['annual_income_est'])

## Convert date of birth to age
df['cltdob_fix'] = df['cltdob_fix'].replace(to_replace='None', value=np.nan).dropna()
df['cltdob_fix'] = pd.to_datetime(df['cltdob_fix'], format='%Y-%m-%d')
def calculate_age(born):
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
df['cltdob_fix'] = df['cltdob_fix'].apply(calculate_age)

## Drop NA for purpose of training the model
df['cltdob_fix'] = df['cltdob_fix'].dropna()
### DATA PROCESSING



### 2. CORRELATION MATRIX & HEATMAP FOR FEATURE SELECTION
## Using Correlation matrix, we selected all features/variables with a correlation of > 5% to the target variable
## Uncomment code below to see Heatmap of correlation matrix
'''
columns_to_test = ["flg_gi_claim","flg_is_proposal","is_housewife_retiree", "is_sg_pr", "is_class_1_2","annual_income_est","n_months_last_bought_products"
                   ,"flg_latest_being_lapse","recency_lapse", "recency_cancel","tot_inforce_pols","f_mindef_mha","recency_clmcon","recency_giclaim","cltdob_fix","f_purchase_lh"]

# Test Correlation matrix
corr_matrix = df.corr()
print(corr_matrix)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()
'''
### CORRELATION MATRIX & HEATMAP FOR FEATURE SELECTION


### 3. FILTERING & UPSAMPLING
columns_to_keep = ["flg_gi_claim","flg_is_proposal","is_housewife_retiree", "is_sg_pr", "is_class_1_2","annual_income_est","n_months_last_bought_products"
                   ,"flg_latest_being_lapse","recency_lapse", "recency_cancel","tot_inforce_pols","f_mindef_mha","recency_clmcon","recency_giclaim","cltdob_fix","f_purchase_lh"]

df = df[columns_to_keep]
df = df.fillna(0)

# Separate majority and minority classes
df_majority = df[df['f_purchase_lh'] == 0]
df_minority = df[df['f_purchase_lh'] == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # Sample with replacement
                                 n_samples=17282,  # to match majority class
                                 random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df = df_upsampled
### FILTERING AND UPSAMPLING


### 4. TRAINING THE MODEL
## We selected Random Forest since it is robust to overfitting and tends to perform well without much hyperparameter tuning.
X = df[["flg_gi_claim","flg_is_proposal","is_housewife_retiree", "is_sg_pr", "is_class_1_2","annual_income_est","n_months_last_bought_products"
                   ,"flg_latest_being_lapse","recency_lapse", "recency_cancel","tot_inforce_pols","f_mindef_mha","recency_clmcon","recency_giclaim","cltdob_fix"]]
y = df["f_purchase_lh"]

# Split the dataset into training and testing sets with 20% for test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)
### TRAINING THE MODEL



[nan  1.]
F1 score: 0.989023521026372


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [6]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    hidden_data['cltdob_fix'] = hidden_data['cltdob_fix'].replace(to_replace='None', value=np.nan).dropna()
    hidden_data['cltdob_fix'] = pd.to_datetime(hidden_data['cltdob_fix'], format='%Y-%m-%d')
    hidden_data['cltdob_fix'] = hidden_data['cltdob_fix'].apply(calculate_age)

    # Label encoding of income
    hidden_data['annual_income_est']= label_encoder.fit_transform(hidden_data['annual_income_est'])

    hidden_data['clttype'] = label_encoder.fit_transform(hidden_data['clttype']) 
    columns_to_keep = ["flg_gi_claim","flg_is_proposal","is_housewife_retiree", "is_sg_pr", "is_class_1_2","annual_income_est","n_months_last_bought_products"
                       ,"flg_latest_being_lapse","recency_lapse", "recency_cancel","tot_inforce_pols","f_mindef_mha","recency_clmcon","recency_giclaim","cltdob_fix"]
    hidden_data = hidden_data[columns_to_keep]
    hidden_data = hidden_data.fillna(0)
    result = rf_classifier.predict(hidden_data)
    return result


##### Cell to check testing_hidden_data function

In [7]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

[0. 0. 0. ... 0. 0. 0.]


### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!