In [1]:
import numpy as np
import joblib
import pandas as pd


In [2]:
# Load the persisted artifacts used by the matching cells below.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load
# .pkl files from a trusted source.
full_pipeline = joblib.load('full_pipeline.pkl')

# np.load on an .npz archive returns a lazy NpzFile that keeps the file handle
# open; the context manager closes it once the array has been read into memory.
with np.load('X_transformed.npz') as loaded_data:
    X_transformed_loaded = loaded_data['X_transformed']

# NOTE(review): `preprocessor` is not referenced by any cell visible in this
# notebook (the matcher uses full_pipeline['preprocessor']) — confirm it is needed.
preprocessor = joblib.load("preprocessor_pipeline.pkl")
df = pd.read_csv("df_cluster.csv")

In [3]:
# Show the loaded pipeline object as this cell's output
full_pipeline

## Prediction

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances

def find_nearby_matches_with_clusters(person_details, full_pipeline, df_original, x_transformed, n_matches=5):
    """Return the rows of ``df_original`` closest to a new person within their predicted cluster.

    The person is run through ``full_pipeline['preprocessor']``, assigned a
    cluster by ``full_pipeline['cluster']``, and compared (using the default
    metric of ``pairwise_distances``, i.e. Euclidean) against the transformed
    rows that belong to that cluster.

    Parameters
    ----------
    person_details : dict
        Feature values for one person, keyed by the feature column names of
        ``df_original`` (every column except ``"Cluster"``). Keys missing from
        the dict become NaN in the one-row frame that is passed to the
        preprocessor.
    full_pipeline : mapping-like
        Object supporting ``['preprocessor']`` (with ``.transform``) and
        ``['cluster']`` (with ``.predict``) lookups.
    df_original : pandas.DataFrame
        Original records, including a ``"Cluster"`` column.
    x_transformed : array-like
        Transformed feature matrix whose i-th row corresponds to the i-th row
        (by position, not by index label) of ``df_original``.
    n_matches : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        A copy of up to ``n_matches`` rows of ``df_original`` (original index
        labels preserved) ordered from nearest to farthest, with an added
        ``"Distance"`` column. Empty (but with the ``"Distance"`` column) when
        no row of ``df_original`` belongs to the predicted cluster.

    Raises
    ------
    KeyError
        If ``df_original`` has no ``"Cluster"`` column.
    ValueError
        If ``x_transformed`` and ``df_original`` have different row counts.
    """
    # Build a one-row frame whose column order matches the training features.
    feature_columns = df_original.columns.drop("Cluster")
    person_df = pd.DataFrame([person_details], columns=feature_columns)

    # Rows are matched by position, so the two inputs must be aligned row-for-row.
    if x_transformed.shape[0] != len(df_original):
        raise ValueError(
            "x_transformed has {} rows but df_original has {}; they must be "
            "aligned row-for-row".format(x_transformed.shape[0], len(df_original))
        )

    # Apply the same preprocessing that was used on the training data.
    person_transformed = full_pipeline['preprocessor'].transform(person_df)

    # Predict the cluster for the new input person.
    predicted_cluster = full_pipeline['cluster'].predict(person_transformed)[0]

    # Use *positional* row numbers of the cluster members. Index labels only
    # coincide with positions for a default 0..n-1 index, so relying on
    # ``.index`` here would select wrong rows (or fail) for any other index.
    in_cluster = (df_original['Cluster'] == predicted_cluster).to_numpy()
    cluster_positions = np.flatnonzero(in_cluster)

    # Nothing to compare against: return an empty result rather than letting
    # the distance computation fail on a zero-row matrix.
    if cluster_positions.size == 0:
        return df_original.iloc[cluster_positions].assign(Distance=np.array([], dtype=float))

    x_clustered_transformed = x_transformed[cluster_positions]

    # Distances from the new person to every member of the cluster (row 0 is
    # the only query row).
    distances = pairwise_distances(person_transformed, x_clustered_transformed)[0]

    # Positions (within the cluster subset) of the nearest members.
    nearest_in_cluster = np.argsort(distances)[:n_matches]

    # Map back to positions in the full frame and fetch those rows.
    closest_positions = cluster_positions[nearest_in_cluster]
    closest_matches_df = df_original.iloc[closest_positions].copy()

    # Attach the distance of each returned row.
    closest_matches_df['Distance'] = distances[nearest_in_cluster]

    return closest_matches_df


In [5]:
# Sample query: one person's attributes, keyed by the feature column names
person_details = {
    'Age': 30,
    'latitude': 40.7128,
    'longitude': -74.0060,
    'Ethnicity': 'Mexican',
    'Diabetic': 'No',
    'Religion': 'Christianity',
    'Height': "Short",
    'last_name': 'Gomez'
}

# Look up the closest records in the person's predicted cluster
# (n_matches is left at its default)
matches = find_nearby_matches_with_clusters(
    person_details=person_details,
    full_pipeline=full_pipeline,
    df_original=df,
    x_transformed=X_transformed_loaded,
)
print(matches)

      Age   latitude  longitude Ethnicity Diabetic      Religion Height  \
8198   45  38.421729 -73.406698   Mexican       No  Christianity  Short   
4064   34  40.978783 -90.930200   Mexican       No  Christianity  Short   
4044   28  42.002930 -76.893765   Italian       No  Christianity  Short   
8181   35  40.032933 -77.641011     Irish       No  Christianity  Short   
4568   38  48.536056 -71.310230   Mexican       No  Christianity  Short   

     last_name  Cluster  Distance  
8198  Campbell        1  1.698346  
4064       Lee        1  1.751818  
4044       Lee        1  1.753561  
8181       Lee        1  1.772711  
4568   Johnson        1  1.846639  


In [6]:
# Show the matches frame as this cell's rich output
matches

Unnamed: 0,Age,latitude,longitude,Ethnicity,Diabetic,Religion,Height,last_name,Cluster,Distance
8198,45,38.421729,-73.406698,Mexican,No,Christianity,Short,Campbell,1,1.698346
4064,34,40.978783,-90.9302,Mexican,No,Christianity,Short,Lee,1,1.751818
4044,28,42.00293,-76.893765,Italian,No,Christianity,Short,Lee,1,1.753561
8181,35,40.032933,-77.641011,Irish,No,Christianity,Short,Lee,1,1.772711
4568,38,48.536056,-71.31023,Mexican,No,Christianity,Short,Johnson,1,1.846639


In [7]:
# Wrap the query dict in a one-row DataFrame; it is passed to the chatbot as input_df below
input_data = pd.DataFrame([person_details])

In [8]:
# Show the one-row input frame
input_data

Unnamed: 0,Age,latitude,longitude,Ethnicity,Diabetic,Religion,Height,last_name
0,30,40.7128,-74.006,Mexican,No,Christianity,Short,Gomez


In [9]:
# Show the matches again, for comparison with the input row displayed in the previous cell
matches

Unnamed: 0,Age,latitude,longitude,Ethnicity,Diabetic,Religion,Height,last_name,Cluster,Distance
8198,45,38.421729,-73.406698,Mexican,No,Christianity,Short,Campbell,1,1.698346
4064,34,40.978783,-90.9302,Mexican,No,Christianity,Short,Lee,1,1.751818
4044,28,42.00293,-76.893765,Italian,No,Christianity,Short,Lee,1,1.753561
8181,35,40.032933,-77.641011,Irish,No,Christianity,Short,Lee,1,1.772711
4568,38,48.536056,-71.31023,Mexican,No,Christianity,Short,Johnson,1,1.846639


In [10]:
import os

# Step up one directory so the `src` package imported by the next cell can be
# found. A bare os.chdir("..") climbs one more level every time this cell is
# re-run; checking for the `src` directory first makes the cell idempotent.
if not os.path.isdir("src"):
    os.chdir("..")

In [11]:
from src.components.match_finder import MatchFinderChatbot

# Create the chatbot whose find_matches method is called below
# (constructed with no arguments; its configuration is not visible in this notebook)
chatbot = MatchFinderChatbot()



In [12]:
# Ask the chatbot to compare the query row with the candidate matches
response = chatbot.find_matches(input_df=input_data, matched_df=matches)

In [13]:
# print() so that line breaks in the response text are rendered as separate lines
print(response)

The matched data with the last name 'Lee' is a good match for the input data of the deceased person with the last name 'Gomez' based on the following reasons:

1. Ethnicity: Both the input data and the matched data are of Mexican ethnicity, indicating a potential cultural similarity between the two individuals.

2. Diabetic Status: Both individuals are not diabetic, which could be an important factor in determining a suitable beneficiary for the insurance policy.

3. Religion: Both individuals follow Christianity, suggesting a shared religious background.

4. Height: Both individuals are categorized as having a 'Short' height, which could be a distinguishing factor in identifying a specific individual.

5. Age: Although there is a slight age difference, with the input data being 30 years old and the matched data being 34 years old, the age proximity is relatively close compared to other matches in the dataset.

Based on these key points, the person with the last name 'Lee' appears to b

In [None]:
  # NOTE(review): this cell was never executed and looks like a fragment pasted
  # from a Streamlit app — `st` and `match_data` are not defined in any cell
  # visible in this notebook, so running it here would raise NameError.
  # The commented-out lines call `find_best_match`, whereas the notebook calls
  # `MatchFinderChatbot.find_matches` — confirm which method name is current.
  if st.button("Get the review from llm"):
            # from src.components.match_finder import MatchFinderChatbot
            # match_finder = MatchFinderChatbot()
            # input_data = st.session_state["training_data"]
            # matched_data = match_data
            # explanation = match_finder.find_best_match(input_data, matched_data)
            st.write(match_data)
        