In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reference records that incoming user details are compared against.
# Every field, including postcode and birthdate, is stored as a string.
data = pd.DataFrame(
    [
        ('John', 'Doe', 'john.doe@example.com', '12345', '01/01/1990'),
        ('Jane', 'Smith', 'jane.smith@example.com', '23456', '02/02/1985'),
        ('Alice', 'Johnson', 'alice.j@example.com', '34567', '03/03/1992'),
    ],
    columns=['first_name', 'last_name', 'email', 'postcode', 'birthdate'],
)

def combine_rows(df):
    """Collapse each row of *df* into one space-separated string.

    Each row is converted to strings and its values are joined with a single
    space, in column order.

    Args:
        df: DataFrame whose rows should be flattened to text.

    Returns:
        The result of the row-wise ``apply``: one joined string per row,
        indexed like *df*.
    """
    def _row_as_text(row):
        # Cast per row (not per frame) so values render exactly as the row
        # Series holds them.
        return ' '.join(row.astype(str))

    return df.apply(_row_as_text, axis=1)

# Similarity (0-1) that the best candidate must exceed to be reported as a
# potential match.
MATCH_THRESHOLD = 0.7

# Prompt for the same five fields that each existing record holds.
user_input = {
    'first_name': input("Enter First Name: "),
    'last_name': input("Enter Last Name: "),
    'email': input("Enter Email Address: "),
    'postcode': input("Enter Postal Code: "),
    'birthdate': input("Enter Date of Birth (DD/MM/YYYY): ")
}
# Flatten the answers into one space-separated string, mirroring how the
# existing records are flattened below.
user_data = ' '.join(user_input.values())

# One space-joined string per existing record.
combined_data = combine_rows(data)

# Fit the TF-IDF vocabulary and weights on the existing records only, then
# project the user's text into that same vector space.
# NOTE(review): transform() ignores tokens that are not in the fitted
# vocabulary, so user fields that match no record do not lower the score; a
# matching name alone may be enough to clear the threshold. Confirm this is
# acceptable for the intended use.
vectorizer = TfidfVectorizer()
vectorized_data = vectorizer.fit_transform(combined_data)
user_vector = vectorizer.transform([user_data])

# One user row compared against every record; flatten to one score per record.
similarity_scores = cosine_similarity(user_vector, vectorized_data).flatten()

# Locate the best candidate once and read its score from the same position,
# instead of scanning the array separately for the value and the index.
match_index = similarity_scores.argmax()
max_similarity = similarity_scores[match_index]
similarity_percentage = max_similarity * 100  # Convert to percentage

if max_similarity > MATCH_THRESHOLD:
    print(f"Potential record match with a similarity of {similarity_percentage:.2f}%:")
    print(data.iloc[match_index])
else:
    print(f"No record match. Highest similarity: {similarity_percentage:.2f}%.")


Enter First Name:  John
Enter Last Name:  Doe
Enter Email Address:  john.d@example.com
Enter Postal Code:  12345
Enter Date of Birth (DD/MM/YYYY):  01/01/1990


Potential record match with a similarity of 96.84%:
first_name                    John
last_name                      Doe
email         john.doe@example.com
postcode                     12345
birthdate               01/01/1990
Name: 0, dtype: object
