In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from datetime import datetime
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from ydata_profiling import ProfileReport

In [44]:
df = pd.read_csv("data/cleaned_data.csv")

In [45]:
# ProfileReport(df, explorative=True)

In [46]:
df.head()

Unnamed: 0.1,Unnamed: 0,policy_number,policyholder,gender,occupation,branch,inst,plan,proposals,inception_date,expiry_date,monthly_premium,policy_value,paid_premium,premium,total_premium
0,1,UKS2023T03696,BEATRICE,FEMALE,OFFICER,ACCRA NO BRANCH,7010101,FAMILY SECURITY PLAN,12-Dec-23,1-Jul-18,1-Sep-27,30,500.0,87.3,3,1891.5
1,2,UKS2023T03701,DANIEL,MALE,TEACHER,ACCRA NO BRANCH,7010101,FAMILY SECURITY PLAN,7-Dec-23,1-Sep-21,1-Feb-48,60,2000.0,174.6,3,1687.8
2,3,UKS2024T00359,JEMIMA,FEMALE,TEACHING,ACCRA NO BRANCH,7010101,FAMILY SECURITY PLAN,25-Jan-24,1-Dec-21,1-Dec-52,150,2000.0,436.5,3,3783.0
3,4,UKS2023T03279,KWABENA,MALE,LABOURER,ACCRA NO BRANCH,7010101,FAMILY SECURITY PLAN,2-Nov-23,1-Feb-22,1-Feb-39,50,1500.0,145.5,3,1164.0
4,5,UKS2024T00204,SAMUEL,MALE,ACCOUNTANT,ACCRA NO BRANCH,8440101,FAMILY SECURITY PLAN,12-Jan-24,1-Feb-22,1-Feb-51,100,2000.0,0.0,0,100.0


In [47]:
df.drop(
    [
        "branch",
        "Unnamed: 0",
    ],
    axis=1,
)

Unnamed: 0,policy_number,policyholder,gender,occupation,inst,plan,proposals,inception_date,expiry_date,monthly_premium,policy_value,paid_premium,premium,total_premium
0,UKS2023T03696,BEATRICE,FEMALE,OFFICER,7010101,FAMILY SECURITY PLAN,12-Dec-23,1-Jul-18,1-Sep-27,30,500,87.3,3,1891.50
1,UKS2023T03701,DANIEL,MALE,TEACHER,7010101,FAMILY SECURITY PLAN,7-Dec-23,1-Sep-21,1-Feb-48,60,2000.00,174.6,3,1687.80
2,UKS2024T00359,JEMIMA,FEMALE,TEACHING,7010101,FAMILY SECURITY PLAN,25-Jan-24,1-Dec-21,1-Dec-52,150,2000.00,436.5,3,3783.00
3,UKS2023T03279,KWABENA,MALE,LABOURER,7010101,FAMILY SECURITY PLAN,2-Nov-23,1-Feb-22,1-Feb-39,50,1500.00,145.5,3,1164.00
4,UKS2024T00204,SAMUEL,MALE,ACCOUNTANT,8440101,FAMILY SECURITY PLAN,12-Jan-24,1-Feb-22,1-Feb-51,100,2000.00,0,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,UKS2024T00629,ESTHER,FEMALE,SEAMTRESS,CASH,FLEXI CHILD EDUCATION,4-Mar-24,1-Mar-24,1-May-41,100,2000.00,1300.00,2,1300.00
610,UKS2024T00644,JOHN,MALE,EDUCATIONIST,CASH,FLEXI CHILD EDUCATION,4-Mar-24,1-Mar-24,1-Sep-49,200,2000.00,200,1,200
611,UKS2024T00630,FRANCIS,MALE,PASTOR,CASH,FAMILY SECURITY PLAN,4-Mar-24,1-Mar-24,1-Aug-38,100,1500.00,100,1,100
612,UKS2024T00690,SILAS,MALE,TEACHER,7010101,FAMILY SECURITY PLAN,13-Mar-24,1-Mar-24,1-Feb-50,100,0,100.88,1,100.88


In [48]:
occupations = df['occupation'].unique()
print(occupations)

['OFFICER' 'TEACHER' 'TEACHING' 'LABOURER' 'ACCOUNTANT' 'ADMIN'
 'PUBLIC SERVANT' 'NURSE' 'CLERK' 'MATRON' 'NURSING' 'CO-ORDINATOR' 'COOK'
 'CIVIL SERVANT' 'COURT REGISTER' 'GHS' 'PUBLICE SERVANT' 'PLUMBING'
 'GARDENER' 'DEVELOPMENT CONTROL' 'PRINCIPAL ADMIN' 'TUTOR' 'SECRETARY'
 'BIOSTATISTICS ASSIST' 'MIDWIFE' 'TECHER' 'OPTICAL TECHNICIAN'
 'ADMINISTRATION' 'NBSSI' 'REGULATOR' 'SECURITY' 'BUSSINESS WOMAN'
 'STENOGRAPHER' 'INFORMATION OFFICER' 'TEAHING' 'ADMINISTRATOR' 'LECTURER'
 'MIDWIFERY' 'TRADING' 'ENGINEER' 'CHEF' 'COURT' 'LABOUR' 'COOKER'
 'AUTO BODYING' 'MESSENGER' 'RESOURCE GUARD' 'QUANTITY SURVEYOR'
 'PHYSICIAN ASSISTANT' 'DATA PROTECTION OFFI' 'OPERATOR' 'FIRE FIGHTER'
 'AGRIC EXTENSION' 'BURSE' 'LIBARIAN' 'HEALTH OFFICER'
 'SARPOSITORY ATISAN' 'CLEANER' 'DRIVER' 'GENERAL LAB' 'LOADING OPERATOR'
 'PARTRY HANDS' 'AGRIC OFFICER' 'PANTRYMAN' 'HUMAN RESOURCES'
 'ANAESTHESIA' 'CARPENTRY' 'CARPENTER' 'SALES OOFICER' 'WELDING'
 'CARETAKER' 'POLICE' 'TRADER' 'CONSULTANT' 'PQCA' 'CO

In [34]:
import re

corrections = {
    r"\bTEACHING\b": "TEACHER",
    r"\bPUBLICE SERVANT\b": "PUBLIC SERVANT",
    r"\bTECHER\b": "TEACHER",
    r"\bTEAHING\b": "TEACHER",
    r"\bBURSE\b": "NURSE",
    r"\bBUSSINESS WOMAN\b": "BUSINESS OWNER",
    r"\bLIBARIAN\b": "LIBRARIAN",
    r"\bSARPOSITORY ATISAN\b": "SUPPOSITORY ARTISAN",
    r"\bPARTRY HANDS\b": "PANTRY ASSISTANT",
    r"\bAGRIC EXTENSION\b": "AGRICULTURAL EXTENSION OFFICER",
    r"\bAGRIC OFFICER\b": "AGRICULTURAL OFFICER",
    r"\bSALES OOFICER\b": "SALES OFFICER",
    r"\bAUTO MACHINIES\b": "AUTO MACHINES",
    r"\bPOLICING\b": "POLICE OFFICER",
    r"\bCATRER\b": "CATERER",
    r"\bSTATISCIAN\b": "STATISTICIAN",
    r"\bBUSINESSWOMAN\b": "BUSINESS",
    r"\bCAPENTER\b": "CARPENTER",
    r"\bVEGETABLES SELLER\b": "VEGETABLE SELLER",
    r"\bMEDICAL LABORATORY S\b": "MEDICAL LABORATORY SCIENTIST",
    r"\bDATA PROTECTION OFFI\b": "DATA PROTECTION OFFICER",
}

for pattern, correction in corrections.items():
    # professions = [re.sub(pattern, correction, p) for p in professions]

    # replace these in the dataframe
    for pattern, correction in corrections.items():
        df["occupation"] = df["occupation"].str.replace(pattern, correction, regex=True)

In [35]:
professions = df["occupation"].unique()
# print(professions)
# len(professions)

In [36]:
categories = {
    "EDUCATION": [
        "TEACHER",
        "LECTURER",
        "EDUCATIONIST",
        "PRINCIPAL ADMIN",
        "TUTOR",
        "LIBRARIAN",
    ],
    "HEALTHCARE": [
        "NURSE",
        "NURSING",
        "MATRON",
        "MIDWIFE",
        "HEALTH OFFICER",
        "PHYSICIAN ASSISTANT",
        "MEDICAL DOCTOR",
        "MEDICAL LABORATORY SCIENTIST",
        "HEALTH ASSIST",
    ],
    "ADMINISTRATION": [
        "ADMIN",
        "STENOGRAPHER",
        "PUBLIC SERVANT",
        "CIVIL SERVANT",
        "ADMINISTRATOR",
        "ADMINISTRATION",
        "PRINCIPAL ADMIN",
        "HUMAN RESOURCES",
        "REGULATOR",
        "CLERK",
        "CO-ORDINATOR",
        "SECRETARY",
        "MESSENGER",
    ],
    "ENGINEERING AND CONSTRUCTION": [
        "ENGINEER",
        "QUANTITY SURVEYOR",
        "AUTO BODYING",
        "ELECTRIC/WELDER",
        "CARPENTER",
        "CARPENTRY",
        "WELDING",
        "LABOUR",
        "LABOURER",
        "AUTO MACHINES",
        "EXCAVATOR OPERATOR",
        "LOADING OPERATOR",
        "OPERATOR",
        "PLUMBING",
        "PLUMBER",
    ],
    "ARTS AND DESIGN": ["PHOTOGRAHER", "FASHION DESIGNER", "SEAMTRESS"],
    "RELIGION": ["CLERGY", "PASTOR"],
    "TRANSPORTATION": ["DRIVER"],
    "GOVERNMENT": [
        "GHS",
        "NBSSI",
        "COURT",
        "COURT REGISTER",
        "OFFICER",
        "COCOBOD",
        "CONSERVANCY",
    ],
    "HOSPITALITY": [
        "COOK",
        "CHEF",
        "COOKER",
        "PANTRYMAN",
        "CATERER",
        "PANTRY ASSISTANT",
        "CLEANER",
        "CARETAKER",
        "GARDENER",
    ],
    "MANAGEMENT": [
        "MANAGER",
        "ADMINISTRATOR",
        "BUSINESS OWNER",
        "CONSULTANT",
        "CONTRACTOR",
        "INSURANCE",
        "DEVELOPMENT CONTROL",
    ],
    # "CONSTRUCTION": ["CARPENTER", "CARPENTRY" "WELDING", "LABOURER","AUTO MACHINES", "EXCAVATOR OPERATOR"],
    "FINANCE": ["ACCOUNTANT", "INSURER", "STATISTICIAN", "MTN AGENT"],
    "SALES": ["SALES OFFICER", "SALES EXECUTIVE", "TRADER", "TRADING"],
    "IT": ["INFORMATION OFFICER", "DATA PROTECTION OFFICER"],
    "AGRICULTURE": [
        "AGRICULTURAL EXTENSION OFFICER",
        "AGRICULTURAL OFFICER",
        "AGRICULTURIST",
        "VEGETABLE SELLER",
        "EXTENSION OFFICER",
    ],
    "LAW AND SAFETY ENFORCEMENT": [
        "POLICE",
        "POLICE OFFICER",
        "OFFICER",
        "SECURITY",
        "GUARD",
        "FIRE FIGHTER",
        "RESOURCE GUARD",
    ],
    "MEDICAL": [
        "MEDICAL DOCTOR",
        "MEDICAL LABORATORY SCIENTIST",
        "PHARMACIST",
        "BIOSTATISTICS ASSIST",
        "GENERAL LAB",
        "SUPPOSITORY ARTISAN",
        "HEAD ORDERLY",
        "OPTICAL TECHNICIAN",
        "ANAESTHESIA",
        "MIDWIFERY",
    ],
    "OTHERS": [
        "PQCA",
    ],
}


# check if all  professions are in the categories
for profession in professions:
    found = False
    for category, members in categories.items():
        if profession in members:
            found = True
            break
    if not found:
        print(profession)

In [37]:
categories = {
    "EDUCATION": [
        "TEACHER",
        "LECTURER",
        "EDUCATIONIST",
        "PRINCIPAL ADMIN",
        "TUTOR",
        "LIBRARIAN",
    ],
    "HEALTHCARE": [
        "NURSE",
        "NURSING",
        "MATRON",
        "MIDWIFE",
        "HEALTH OFFICER",
        "PHYSICIAN ASSISTANT",
        "MEDICAL DOCTOR",
        "MEDICAL LABORATORY SCIENTIST",
        "HEALTH ASSIST",
    ],
    "ADMINISTRATION": [
        "ADMIN",
        "STENOGRAPHER",
        "PUBLIC SERVANT",
        "CIVIL SERVANT",
        "ADMINISTRATOR",
        "ADMINISTRATION",
        "PRINCIPAL ADMIN",
        "HUMAN RESOURCES",
        "REGULATOR",
        "CLERK",
        "CO-ORDINATOR",
        "SECRETARY",
        "MESSENGER",
    ],
    "ENGINEERING AND CONSTRUCTION": [
        "ENGINEER",
        "QUANTITY SURVEYOR",
        "AUTO BODYING",
        "ELECTRIC/WELDER",
        "CARPENTER",
        "CARPENTRY",
        "WELDING",
        "LABOUR",
        "LABOURER",
        "AUTO MACHINES",
        "EXCAVATOR OPERATOR",
        "LOADING OPERATOR",
        "OPERATOR",
        "PLUMBING",
        "PLUMBER",
    ],
    "ARTS AND DESIGN": ["PHOTOGRAHER", "FASHION DESIGNER", "SEAMTRESS"],
    "RELIGION": ["CLERGY", "PASTOR"],
    "TRANSPORTATION": ["DRIVER"],
    "GOVERNMENT": [
        "GHS",
        "NBSSI",
        "COURT",
        "COURT REGISTER",
        "OFFICER",
        "COCOBOD",
        "CONSERVANCY",
    ],
    "HOSPITALITY": [
        "COOK",
        "CHEF",
        "COOKER",
        "PANTRYMAN",
        "CATERER",
        "PANTRY ASSISTANT",
        "CLEANER",
        "CARETAKER",
        "GARDENER",
    ],
    "MANAGEMENT": [
        "MANAGER",
        "ADMINISTRATOR",
        "BUSINESS OWNER",
        "CONSULTANT",
        "CONTRACTOR",
        "INSURANCE",
        "DEVELOPMENT CONTROL",
    ],
    # "CONSTRUCTION": ["CARPENTER", "CARPENTRY" "WELDING", "LABOURER","AUTO MACHINES", "EXCAVATOR OPERATOR"],
    "FINANCE": ["ACCOUNTANT", "INSURER", "STATISTICIAN", "MTN AGENT"],
    "SALES": ["SALES OFFICER", "SALES EXECUTIVE", "TRADER", "TRADING"],
    "IT": ["INFORMATION OFFICER", "DATA PROTECTION OFFICER"],
    "AGRICULTURE": [
        "AGRICULTURAL EXTENSION OFFICER",
        "AGRICULTURAL OFFICER",
        "AGRICULTURIST",
        "VEGETABLE SELLER",
        "EXTENSION OFFICER",
    ],
    "LAW AND SAFETY ENFORCEMENT": [
        "POLICE",
        "POLICE OFFICER",
        "OFFICER",
        "SECURITY",
        "GUARD",
        "FIRE FIGHTER",
        "RESOURCE GUARD",
    ],
    "MEDICAL": [
        "MEDICAL DOCTOR",
        "MEDICAL LABORATORY SCIENTIST",
        "PHARMACIST",
        "BIOSTATISTICS ASSIST",
        "GENERAL LAB",
        "SUPPOSITORY ARTISAN",
        "HEAD ORDERLY",
        "OPTICAL TECHNICIAN",
        "ANAESTHESIA",
        "MIDWIFERY",
    ],
    "OTHERS": [
        "PQCA",
    ],
}


# create a new column for the categorized profession
df["profession_category"] = None

# iterate over the professions and assign the category
for index, occupation in enumerate(df["occupation"]):
    for category, professions in categories.items():
        if occupation in professions:
            df.loc[index, "profession_category"] = category
            break

In [38]:
# Convert numerical columns with commas to floats
df["policy_value"] = df["policy_value"].str.replace(",", "").astype(float)
df["total_premium"] = df["total_premium"].str.replace(",", "").astype(float)

In [39]:
# # Convert dates and derive additional features
# df["inception_date"] = pd.to_datetime(df["inception_date"], format="%d-%b-%y")
# df["expiry_date"] = pd.to_datetime(df["expiry_date"], format="%d-%b-%y")

In [40]:
# # Derive policy_age and time_to_expiry
# df["policy_age"] = (pd.to_datetime("today") - df["inception_date"]).dt.days / 365.25
# df["time_to_expiry"] = (df["expiry_date"] - pd.to_datetime("today")).dt.days / 365.25

In [41]:
df.columns

Index(['Unnamed: 0', 'policy_number', 'policyholder', 'gender', 'occupation',
       'branch', 'inst', 'plan', 'proposals', 'inception_date', 'expiry_date',
       'monthly_premium', 'policy_value', 'paid_premium', 'premium',
       'total_premium', 'profession_category'],
      dtype='object')

In [42]:
df.drop(
    [
        "Unnamed: 0",
        "occupation",
        "branch",
        "inst",
        
    ],
    axis=1,
)

Unnamed: 0,policy_number,policyholder,gender,plan,proposals,inception_date,expiry_date,monthly_premium,policy_value,paid_premium,premium,total_premium,profession_category
0,UKS2023T03696,BEATRICE,FEMALE,FAMILY SECURITY PLAN,12-Dec-23,1-Jul-18,1-Sep-27,30,500.0,87.3,3,1891.50,GOVERNMENT
1,UKS2023T03701,DANIEL,MALE,FAMILY SECURITY PLAN,7-Dec-23,1-Sep-21,1-Feb-48,60,2000.0,174.6,3,1687.80,EDUCATION
2,UKS2024T00359,JEMIMA,FEMALE,FAMILY SECURITY PLAN,25-Jan-24,1-Dec-21,1-Dec-52,150,2000.0,436.5,3,3783.00,EDUCATION
3,UKS2023T03279,KWABENA,MALE,FAMILY SECURITY PLAN,2-Nov-23,1-Feb-22,1-Feb-39,50,1500.0,145.5,3,1164.00,ENGINEERING AND CONSTRUCTION
4,UKS2024T00204,SAMUEL,MALE,FAMILY SECURITY PLAN,12-Jan-24,1-Feb-22,1-Feb-51,100,2000.0,0,0,100.00,FINANCE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,UKS2024T00629,ESTHER,FEMALE,FLEXI CHILD EDUCATION,4-Mar-24,1-Mar-24,1-May-41,100,2000.0,1300.00,2,1300.00,ARTS AND DESIGN
610,UKS2024T00644,JOHN,MALE,FLEXI CHILD EDUCATION,4-Mar-24,1-Mar-24,1-Sep-49,200,2000.0,200,1,200.00,EDUCATION
611,UKS2024T00630,FRANCIS,MALE,FAMILY SECURITY PLAN,4-Mar-24,1-Mar-24,1-Aug-38,100,1500.0,100,1,100.00,RELIGION
612,UKS2024T00690,SILAS,MALE,FAMILY SECURITY PLAN,13-Mar-24,1-Mar-24,1-Feb-50,100,0.0,100.88,1,100.88,EDUCATION


In [43]:
# make pipelline and select all numerical and categorical with variance threshold of 0.8
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Select all numerical columns
numerical_columns = df.select_dtypes(include=[np.number]).columns

# Select all categorical columns
categorical_columns = df.select_dtypes(include=[np.object]).columns

# Create a pipeline for numerical columns
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("variance_threshold", VarianceThreshold(threshold=0.8)),
    ]
)

# Create a pipeline for categorical columns
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_columns),
        ("cat", categorical_pipeline, categorical_columns),
    ]
)

# Split the data into training and testing sets
X = df.drop(["policy_value"], axis=1)
y = df["policy_value"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessor
preprocessor.fit(X_train)

# Transform the training and testing sets
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Create a linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train_transformed, y_train)



  categorical_columns = df.select_dtypes(include=[np.object]).columns


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
# # Normalize numerical variables if necessary
# scaler = StandardScaler()
# df[
#     [
#         "policy_value",
#         "paid_premium",
#         "premium",
#         "total_premium",
#         "policy_age",
#         "time_to_expiry",
#     ]
# ] = scaler.fit_transform(
#     df[
#         [
#             "policy_value",
#             "paid_premium",
#             "premium",
#             "total_premium",
#             "policy_age",
#             "time_to_expiry",
#         ]
#     ]
# )

ValueError: could not convert string to float: '2,328.00'

In [None]:
# # Outlier Detection and Handling
# # Using Z-score method to detect outliers
# def remove_outliers(df, columns):
#     for col in columns:
#         df = df[(np.abs(df[col] - df[col].mean()) / df[col].std() <= 3)]
#     return df


# df = remove_outliers(df, num_cols)

In [None]:
# df.head()

In [None]:
# # Feature Selection
# # Calculate correlation matrix
# corr_matrix = df.corr()
# plt.figure(figsize=(30, 10))
# sns.heatmap(corr_matrix, annot=True, fmt='.2f')
# plt.show()

In [None]:
# # Select features based on correlation with the target variable
# correlation_threshold = 0.1
# relevant_features = corr_matrix["monthly_premium"][
#     corr_matrix["monthly_premium"].abs() > correlation_threshold
# ].index
# df = df[relevant_features]

In [None]:
# # 7. Split the dataset into training and testing sets
# X = df.drop("monthly_premium", axis=1)
# y = df["monthly_premium"]
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

In [None]:
# print(
#     f"X_train shape: {X_train.shape}\n X_test shape: {X_test.shape}\n y_train shape: {y_train.shape}\n y_test shape: {y_test.shape}"
# )

In [None]:
# # Initialize models
# linear_reg = LinearRegression()
# decision_tree = DecisionTreeRegressor(random_state=42)
# random_forest = RandomForestRegressor(random_state=42)

In [None]:
# # Train models
# linear_reg.fit(X_train, y_train)
# decision_tree.fit(X_train, y_train)
# random_forest.fit(X_train, y_train)

In [None]:
# # Predict on test set
# y_pred_lr = linear_reg.predict(X_test)
# y_pred_dt = decision_tree.predict(X_test)
# y_pred_rf = random_forest.predict(X_test)

In [None]:
# # Evaluate models
# from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error


# def evaluate_model(y_test, y_pred):
#     rmse = mean_squared_error(y_test, y_pred, squared=False)  # Old way, now deprecated
#     rmse = np.sqrt(
#         mean_squared_error(y_test, y_pred)
#     )  # New way, using numpy's sqrt function
#     mae = mean_absolute_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)
#     return rmse, mae, r2


# # Linear Regression evaluation
# rmse_lr, mae_lr, r2_lr = evaluate_model(y_test, y_pred_lr)
# print(f"Linear Regression - RMSE: {rmse_lr}, MAE: {mae_lr}, R²: {r2_lr}")

# # Decision Tree Regressor evaluation
# rmse_dt, mae_dt, r2_dt = evaluate_model(y_test, y_pred_dt)
# print(f"Decision Tree Regressor - RMSE: {rmse_dt}, MAE: {mae_dt}, R²: {r2_dt}")

# # Random Forest Regressor evaluation
# rmse_rf, mae_rf, r2_rf = evaluate_model(y_test, y_pred_rf)
# print(f"Random Forest Regressor - RMSE: {rmse_rf}, MAE: {mae_rf}, R²: {r2_rf}")

In [None]:
# from sklearn.metrics import (
#     mean_squared_error,
#     mean_absolute_error,
#     r2_score,
#     mean_squared_log_error,
# )


# # Evaluate models


# def evaluate_model(y_test, y_pred):

#     rmse = mean_squared_error(
#         y_test, y_pred, squared=False
#     )  # Use squared=False to get RMSE directly

#     mae = mean_absolute_error(y_test, y_pred)

#     r2 = r2_score(y_test, y_pred)

#     return rmse, mae, r2


# # Linear Regression evaluation


# rmse_lr, mae_lr, r2_lr = evaluate_model(y_test, y_pred_lr)


# print(f"Linear Regression - RMSE: {rmse_lr}, MAE: {mae_lr}, R²: {r2_lr}")


# # Decision Tree Regressor evaluation


# rmse_dt, mae_dt, r2_dt = evaluate_model(y_test, y_pred_dt)


# print(f"Decision Tree Regressor - RMSE: {rmse_dt}, MAE: {mae_dt}, R²: {r2_dt}")


# # Random Forest Regressor evaluation


# rmse_rf, mae_rf, r2_rf = evaluate_model(y_test, y_pred_rf)


# print(f"Random Forest Regressor - RMSE: {rmse_rf}, MAE: {mae_rf}, R²: {r2_rf}")


# # Feature importance from Random Forest


# importances = random_forest.feature_importances_


# feature_names = X.columns


# feature_importance_df = pd.DataFrame(
#     {"Feature": feature_names, "Importance": importances}
# )


# feature_importance_df = feature_importance_df.sort_values(
#     by="Importance", ascending=False
# )


# plt.figure(figsize=(10, 6))


# sns.barplot(x="Importance", y="Feature", data=feature_importance_df)


# plt.title("Feature Importance from Random Forest")


# plt.show()

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Hyperparameter tuning for Random Forest
# param_grid = {
#     "n_estimators": [100, 200, 300],
#     "max_features": ["auto", "sqrt", "log2"],
#     "max_depth": [None, 10, 20, 30],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 4],
# }

# grid_search = GridSearchCV(
#     estimator=random_forest, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2
# )
# grid_search.fit(X_train, y_train)

# best_rf = grid_search.best_estimator_

# # Evaluate the tuned Random Forest model
# y_pred_best_rf = best_rf.predict(X_test)
# rmse_best_rf, mae_best_rf, r2_best_rf = evaluate_model(y_test, y_pred_best_rf)
# print(
#     f"Tuned Random Forest Regressor - RMSE: {rmse_best_rf}, MAE: {mae_best_rf}, R²: {r2_best_rf}"
# )