##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [2]:
#%pip install pandas
#%pip install matplotlib
#%pip install numpy
#%pip install scikit-learn
#%pip install pyarrow
# (pyarrow is the parquet engine used by pd.read_parquet; fastparquet is an alternative)
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [3]:
# Can have as many cells as you want for code
import pandas as pd
import numpy as np
filepath = "./data/catB_train.parquet"
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below.
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [4]:
# Load the training data from the relative path defined in the cell above.
df = pd.read_parquet(filepath)
# Number of missing values per column (displayed as the cell output).
df.isna().sum()

FileNotFoundError: [Errno 2] No such file or directory: './data/catB_train.parquet'

To handle the missing values in our target column (`f_purchase_lh`), we replace them with 0.

In [None]:
# Replace missing target values with 0 so that every row has a label.
df["f_purchase_lh"] = df["f_purchase_lh"].fillna(0)

In [None]:
# Minimum number of non-null values a column needs in order to be kept
# (50% of the number of rows).
threshold = 0.5 * len(df)

# `thresh` is the required count of NON-null values, so this keeps every column
# that is at least 50% populated and drops columns with more than 50% missing.
df = df.dropna(axis=1, thresh=threshold)
df.shape #dropped 82 columns

(17992, 222)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Show which columns are still stored as generic Python objects (strings).
print(df.select_dtypes(include='object').columns.tolist())

# Remove the client number, the country code and the two date columns.
df = df.drop(columns=['clntnum', 'min_occ_date', 'cltdob_fix', 'ctrycode_desc'])

# Impute every remaining object column with its most frequent value.
df_columns_obj = df.select_dtypes(include='object').columns.tolist()
most_frequent = {name: df[name].mode()[0] for name in df_columns_obj}
df = df.fillna(value=most_frequent)

### Cast the object columns to their real numeric types
# ape / sumins / prempaid columns hold amounts -> float
amount_tags = ('ape', 'sumins', 'prempaid')
to_float = [col for col in df.columns if any(tag in col for tag in amount_tags)]
df = df.astype({col: float for col in to_float})

# n_months columns plus hh_20 and pop_20 hold whole numbers -> int64
to_int = [col for col in df.columns if 'n_months' in col] + ['hh_20', 'pop_20']
df = df.astype({col: 'int64' for col in to_int})

# race, client type, status flag and sex are nominal -> one-hot encoding
to_one_hot = ['race_desc', 'clttype', 'stat_flag', 'cltsex_fix']
df = pd.get_dummies(df, columns=to_one_hot)

# household size and annual income are ordered categories -> ordinal encoding
# (each list is written from the lowest to the highest level)
household_size_levels = ['0','1', '2', '3', '4', '>4']
annual_income_levels = ['E.BELOW30K','D.30K-60K','C.60K-100K' ,'B.100K-200K','A.ABOVE200K']
to_ordinal = ['hh_size_est', 'annual_income_est']
categories = [household_size_levels, annual_income_levels]
encoder = OrdinalEncoder(categories=categories)
df[to_ordinal] = encoder.fit_transform(df[to_ordinal])

['clntnum', 'race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'min_occ_date', 'cltdob_fix', 'cltsex_fix', 'hh_20', 'pop_20', 'hh_size_est', 'annual_income_est', 'ape_gi_42e115', 'ape_ltc_1280bf', 'ape_grp_6fc3e6', 'ape_grp_de05ae', 'ape_inv_dcd836', 'ape_grp_945b5a', 'ape_grp_6a5788', 'ape_ltc_43b9d5', 'ape_grp_9cdedf', 'ape_lh_d0adeb', 'ape_grp_1581d7', 'ape_grp_22decf', 'ape_lh_507c37', 'ape_lh_839f8a', 'ape_inv_e9f316', 'ape_gi_a10d1b', 'ape_gi_29d435', 'ape_grp_caa6ff', 'ape_grp_fd3bfb', 'ape_lh_e22a6a', 'ape_grp_70e1dd', 'ape_grp_e04c3a', 'ape_grp_fe5fb8', 'ape_gi_856320', 'ape_grp_94baec', 'ape_gi_058815', 'ape_grp_e91421', 'ape_lh_f852af', 'ape_lh_947b15', 'ape_32c74c', 'sumins_gi_42e115', 'sumins_ltc_1280bf', 'sumins_grp_6fc3e6', 'sumins_grp_de05ae', 'sumins_inv_dcd836', 'sumins_grp_945b5a', 'sumins_grp_6a5788', 'sumins_ltc_43b9d5', 'sumins_grp_9cdedf', 'sumins_lh_d0adeb', 'sumins_grp_1581d7', 'sumins_grp_22decf', 'sumins_lh_507c37', 'sumins_inv_e9f316', 'sumins_gi_a10d1b',

In [None]:
# Identify the int64/float64 columns and replace their nulls with the column median
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Separate the label from the feature matrix
target_name = 'f_purchase_lh'
X = df.drop(columns=[target_name])
y = df[target_name]

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of df (the scaler is fitted on the full dataframe).
# NOTE(review): X and y were already extracted from df in the previous cell and
# the train/validation split below uses X and y, so these scaled values do not
# appear to reach the model. This also rescales the target column f_purchase_lh
# inside df - confirm whether the scaler was meant to be fitted on X_train only.
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter

# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is the same on every run.
# NOTE(review): the split is not stratified although the printed class counts
# are heavily imbalanced - consider whether stratify=y is wanted here.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Class counts of the training labels.
print('Before:', Counter(y_train))

Before: Counter({0.0: 13809, 1.0: 584})


In [None]:
# Final feature set for the model (chosen for consistency across selection
# methods and on domain knowledge)
selected = [
    "n_months_last_bought_gi",
    "is_valid_email",
    "n_months_last_bought_products",
    "f_ever_bought_gi",
    "is_valid_dm",
    "f_mindef_mha",
    "f_retail",
    "hh_size",
    "pop_20",
]
# Keep only the chosen feature columns in the training matrix
X_train = X_train.loc[:, selected]

In [None]:
# Keep the same feature columns in the validation matrix as in X_train.
X_val = X_val[selected]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a random forest with default hyper-parameters; the seed keeps runs repeatable.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out validation rows.
y_pred_rf = rf_model.predict(X_val)

# Score the validation predictions. The training labels are heavily imbalanced
# (see the class counts printed after the split), so the per-class rows of the
# report say more than the overall accuracy.
accuracy_rf = accuracy_score(y_val, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_pred_rf)
classification_rep_rf = classification_report(y_val, y_pred_rf)

print(
    f'Random Forest Accuracy: {accuracy_rf}',
    f'Random Forest Confusion Matrix:\n{conf_matrix_rf}',
    f'Random Forest Classification Report:\n{classification_rep_rf}',
    sep='\n',
)

Random Forest Accuracy: 0.9602667407613226
Random Forest Confusion Matrix:
[[3445   28]
 [ 115   11]]
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      3473
         1.0       0.28      0.09      0.13       126

    accuracy                           0.96      3599
   macro avg       0.62      0.54      0.56      3599
weighted avg       0.94      0.96      0.95      3599



## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list).
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

    The function accepts a dataframe as input and returns an iterable (list)
    of binary classes as output.

    The function should be coded to test on hidden data
    and should include any preprocessing functions needed for your model to perform.

    All relevant code MUST be included in this function.

    What it does:
      1. Picks the feature columns the random forest was trained on.
      2. Converts them to numeric values and imputes missing entries using the
         statistics of the training features (mode for text-typed columns,
         median for numeric ones, mirroring the training cells above).
      3. Predicts with the already fitted ``rf_model``.

    Relies on two notebook globals created by the cells above: ``rf_model``
    (the fitted RandomForestClassifier) and ``X_train`` (the training feature
    matrix restricted to the selected features). No scaling is applied because
    the model was fitted on unscaled features.

    Parameters
    ----------
    hidden_data : pd.DataFrame
        Data with the same column layout as the training file. It is not
        modified. A target column, if present, is ignored.

    Returns
    -------
    list
        One predicted class (0 or 1) per row of ``hidden_data``, in row order.

    Raises
    ------
    KeyError
        If any of the required feature columns is absent from ``hidden_data``.
    '''
    # Same features, in the same order, as used when fitting rf_model.
    selected = ["n_months_last_bought_gi",
                "is_valid_email",
                "n_months_last_bought_products",
                "f_ever_bought_gi",
                "is_valid_dm",
                "f_mindef_mha",
                "f_retail",
                "hh_size",
                "pop_20"]

    absent = [name for name in selected if name not in hidden_data.columns]
    if absent:
        raise KeyError(f"hidden_data is missing required feature columns: {absent}")

    # Copy so the caller's dataframe is never mutated.
    features = hidden_data[selected].copy()

    # The model cannot predict on zero rows; an empty input has no predictions.
    if features.empty:
        return []

    for name in selected:
        train_column = X_train[name]
        # Training imputed text-typed columns with the mode and numeric columns
        # with the median. Reuse the training statistics so that hidden rows
        # are filled the same way regardless of how many rows are supplied.
        if pd.api.types.is_numeric_dtype(features[name]):
            fill_value = train_column.median()
        else:
            fill_value = train_column.mode()[0]
        features[name] = (
            pd.to_numeric(features[name])
            .fillna(fill_value)
            .astype(train_column.dtype)
        )

    # Predict on the hidden data itself with the model fitted earlier in the
    # notebook (no refitting, no validation split).
    predictions = rf_model.predict(features)

    # Return a flat list of plain Python ints rather than a list wrapping an array.
    result = predictions.astype(int).tolist()
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
# Reload the training file without its target column to mimic the hidden data.
test_df = pd.read_parquet(filepath).drop(columns=["f_purchase_lh"])
hidden_predictions = testing_hidden_data(test_df)
print(hidden_predictions)

NameError: name 'pd' is not defined

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!