In [1]:
import pandas as pd
import numpy as np
from surprise import SVD, KNNBasic, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import random

In [2]:
def set_seed(seed = 42):
    """Seed Python's `random` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed passed unchanged to both generators.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

set_seed(42)

In [3]:
# Load the four source tables. EMPLOYEE is read from `filled_data/`; the other
# three are read from `cleaned_data/`.
# NOTE(review): the saved cell output echoes the CONTRACT and CUSTOMER read
# lines, which looks like a pandas warning raised for those two files
# (possibly a mixed-dtype warning) — confirm, and pin `dtype=` if so.
contract_df, customer_df, employee_df, leads_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'cleaned_data/CONTRACT.csv',
        r'cleaned_data/CUSTOMER.csv',
        r'filled_data/EMPLOYEE.csv',
        r'cleaned_data/LEADS.csv',
    )
)

  contract_df = pd.read_csv(r'cleaned_data/CONTRACT.csv')
  customer_df = pd.read_csv(r'cleaned_data/CUSTOMER.csv')


In [4]:
# Remove three client columns before the merge below.
# NOTE(review): presumably this avoids duplicating columns the contract table
# already carries (the merged frame still shows NAME_EDUCATION_TYPE and
# CNT_CHILDREN) — verify.
# Rebinding with `errors='ignore'` (instead of `inplace=True`) keeps the cell
# idempotent: re-running it once the columns are gone no longer raises
# KeyError. Trade-off: a misspelled column name is skipped silently.
customer_df = customer_df.drop(
    columns=['NAME_EDUCATION_TYPE', 'DATE_BIRTH', 'CNT_CHILDREN'],
    errors='ignore',
)

In [5]:
# Count leads per sales agent, ignoring rows whose CODE_SA is the -1 sentinel.
valid_sa = leads_df.loc[leads_df['CODE_SA'].ne(-1)]
lead_per_sa = valid_sa.groupby('CODE_SA').size().rename('NUM_LEADS').reset_index()

lead_per_sa

Unnamed: 0,CODE_SA,NUM_LEADS
0,12791,366
1,12797,231
2,12813,506
3,12892,25
4,12893,124
...,...,...
9605,121783,680
9606,121824,229
9607,121834,246
9608,121887,1


In [6]:
# employee_full = employee_df.merge(lead_per_sa, left_on='CODE_EMPLOYEE', right_on='CODE_SA', how='left')
# employee_full['NUM_LEADS'] = employee_full['NUM_LEADS'].fillna(0).astype(int)
# employee_full

In [7]:
# Contracts joined to their client, then to the employee who took the
# application. Both joins are inner joins, so contracts without a matching
# client or employee row are dropped.
final_df = (
    contract_df
    .merge(customer_df, how='inner', on='SKP_CLIENT')
    .merge(employee_df, how='inner', left_on='APPLY_EMPLOYEE', right_on='CODE_EMPLOYEE')
)
final_df

Unnamed: 0,SKP_CREDIT_CASE,SKP_CLIENT,NAME_EDUCATION_TYPE,CNT_CHILDREN,AMT_INCOME_MAIN,AMT_INCOME_HOUSEHOLD,NAME_INCOME_TYPE,CODE_PROFESSION,NAME_CREDIT_STATUS,PRODUCT,...,LEAVING_DATE,MANAGER_CODE_EMPLOYEE,GENDER,BIRTH_DATE,LEVEL_SA,WORKING_DAYS,AGE_EMPLOYEE_HIRED,AGE_EMPLOYEE,SA_PROVINCE,FLAG_MANAGER
0,202732373,12665970,Bachelor's degree,0,7000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,2023-11-29,26872,0,1985-03-24,2,2969,30,40,43,0
1,149179096,14975594,Elementary (primary) school,1,4000000.0,0,Employed person,SALESMAN,Finished,Consumer Durable,...,2022-11-30,19780,0,1992-08-03,1,2800,22,32,31,1
2,335925533,126826775,XNA,0,2000000.0,0,XNA,XNA,Rejected,Consumer Durable,...,,889,0,1989-11-15,1,2633,28,35,52,0
3,163357328,38084937,Junior school education,0,5000000.0,0,Employed person,WORKER,Finished,Consumer Durable,...,2021-01-01,34590,0,1994-06-30,0,1473,22,30,56,0
4,193688464,37761042,Bachelor's degree,0,10000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,2021-04-13,1406,0,1992-07-10,0,1624,24,32,30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900896,342934673,99124783,Junior school education,1,25000000.0,0,Employed person,SALESMAN,Signed,Cash Loan,...,,114569,0,1988-06-01,3,1458,33,37,38,0
900897,342934791,6409762,Elementary (primary) school,2,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,,109522,1,2002-02-23,3,1311,19,23,5,0
900898,342935412,37676700,Elementary (primary) school,1,15000000.0,0,Employed person,WORKER,Signed,Two-wheelers,...,,1428,1,2000-08-17,0,261,24,24,52,0
900899,342935765,7514929,Junior school education,0,20000000.0,0,Self-employed person / business owner,ENGINEER,Approved,Cash Loan,...,,248,1,1991-04-22,2,268,33,34,38,0


In [8]:
def target_feature(row):
    """Collapse a contract's credit status into a coarse outcome label.

    Args:
        row: Mapping-like record exposing 'NAME_CREDIT_STATUS' and, for the
            'Approved' / 'Cancelled' statuses, 'SIGN_CONTRACT_TIME'.

    Returns:
        'Processing', 'Signed' or 'Rejected'; None for any status that is not
        listed below.
    """
    status = row['NAME_CREDIT_STATUS']

    if status in ('In Preprocess', 'In Process'):
        return 'Processing'
    if status in ('Signed', 'Active', 'Finished', 'Paid off', 'Written off', 'Sold'):
        return 'Signed'
    if status == 'Rejected':
        return 'Rejected'
    if status in ('Approved', 'Cancelled'):
        # A recorded signing time wins over the status itself.
        if pd.notna(row['SIGN_CONTRACT_TIME']):
            return 'Signed'
        # Unsigned: an approval is still pending, a cancellation counts as lost.
        return 'Processing' if status == 'Approved' else 'Rejected'
    return None

# Label every contract, then keep only those with a settled outcome
# (rows labelled 'Processing' are removed).
final_df['SIGNED'] = final_df.apply(target_feature, axis='columns')
final_df = final_df.loc[final_df['SIGNED'].ne('Processing')]
final_df

Unnamed: 0,SKP_CREDIT_CASE,SKP_CLIENT,NAME_EDUCATION_TYPE,CNT_CHILDREN,AMT_INCOME_MAIN,AMT_INCOME_HOUSEHOLD,NAME_INCOME_TYPE,CODE_PROFESSION,NAME_CREDIT_STATUS,PRODUCT,...,MANAGER_CODE_EMPLOYEE,GENDER,BIRTH_DATE,LEVEL_SA,WORKING_DAYS,AGE_EMPLOYEE_HIRED,AGE_EMPLOYEE,SA_PROVINCE,FLAG_MANAGER,SIGNED
0,202732373,12665970,Bachelor's degree,0,7000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,26872,0,1985-03-24,2,2969,30,40,43,0,Signed
1,149179096,14975594,Elementary (primary) school,1,4000000.0,0,Employed person,SALESMAN,Finished,Consumer Durable,...,19780,0,1992-08-03,1,2800,22,32,31,1,Signed
2,335925533,126826775,XNA,0,2000000.0,0,XNA,XNA,Rejected,Consumer Durable,...,889,0,1989-11-15,1,2633,28,35,52,0,Rejected
3,163357328,38084937,Junior school education,0,5000000.0,0,Employed person,WORKER,Finished,Consumer Durable,...,34590,0,1994-06-30,0,1473,22,30,56,0,Signed
4,193688464,37761042,Bachelor's degree,0,10000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,1406,0,1992-07-10,0,1624,24,32,30,0,Signed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900895,342934440,84727626,High school education,1,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,13475,0,1991-10-16,3,2038,28,33,59,0,Signed
900896,342934673,99124783,Junior school education,1,25000000.0,0,Employed person,SALESMAN,Signed,Cash Loan,...,114569,0,1988-06-01,3,1458,33,37,38,0,Signed
900897,342934791,6409762,Elementary (primary) school,2,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,109522,1,2002-02-23,3,1311,19,23,5,0,Signed
900898,342935412,37676700,Elementary (primary) school,1,15000000.0,0,Employed person,WORKER,Signed,Two-wheelers,...,1428,1,2000-08-17,0,261,24,24,52,0,Signed


In [9]:
# Per-employee outcome counts: one row per CODE_EMPLOYEE, one column per
# SIGNED label, with 0 where an employee has no contract with that label.
approved_per_sa = (
    final_df['SIGNED']
    .groupby(final_df['CODE_EMPLOYEE'])
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

approved_per_sa

SIGNED,CODE_EMPLOYEE,Rejected,Signed
0,12791,48,204
1,12797,50,200
2,12813,99,300
3,12892,36,197
4,12893,75,217
...,...,...,...
9273,121824,129,270
9274,121834,78,294
9275,121887,46,166
9276,121917,52,144


In [10]:
# employee_full = employee_full.merge(approved_per_sa, on='CODE_EMPLOYEE', how='left')
# employee_full.drop(columns=['CODE_SA'], inplace=True)
# employee_full = employee_full.fillna(0)
# employee_full

In [11]:
# Attach per-agent lead counts and outcome counts to every contract row.
# NOTE(review): `fillna(0)` is applied to the whole frame, so missing values in
# every column (not only NUM_LEADS / CODE_SA from the left join) become 0 —
# confirm that is intended.
final_df = (
    final_df
    .merge(lead_per_sa, how='left', left_on='CODE_EMPLOYEE', right_on='CODE_SA')
    .merge(approved_per_sa, how='left', on='CODE_EMPLOYEE')
    .fillna(0)
)
final_df

Unnamed: 0,SKP_CREDIT_CASE,SKP_CLIENT,NAME_EDUCATION_TYPE,CNT_CHILDREN,AMT_INCOME_MAIN,AMT_INCOME_HOUSEHOLD,NAME_INCOME_TYPE,CODE_PROFESSION,NAME_CREDIT_STATUS,PRODUCT,...,WORKING_DAYS,AGE_EMPLOYEE_HIRED,AGE_EMPLOYEE,SA_PROVINCE,FLAG_MANAGER,SIGNED,CODE_SA,NUM_LEADS,Rejected,Signed
0,202732373,12665970,Bachelor's degree,0,7000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,2969,30,40,43,0,Signed,23074.0,121.0,29,66
1,149179096,14975594,Elementary (primary) school,1,4000000.0,0,Employed person,SALESMAN,Finished,Consumer Durable,...,2800,22,32,31,1,Signed,20815.0,202.0,52,164
2,335925533,126826775,XNA,0,2000000.0,0,XNA,XNA,Rejected,Consumer Durable,...,2633,28,35,52,0,Rejected,37020.0,512.0,115,221
3,163357328,38084937,Junior school education,0,5000000.0,0,Employed person,WORKER,Finished,Consumer Durable,...,1473,22,30,56,0,Signed,31181.0,2.0,19,69
4,193688464,37761042,Bachelor's degree,0,10000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,1624,24,32,30,0,Signed,30150.0,14.0,22,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900810,342934440,84727626,High school education,1,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,2038,28,33,59,0,Signed,43825.0,687.0,62,221
900811,342934673,99124783,Junior school education,1,25000000.0,0,Employed person,SALESMAN,Signed,Cash Loan,...,1458,33,37,38,0,Signed,49032.0,2580.0,34,259
900812,342934791,6409762,Elementary (primary) school,2,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,1311,19,23,5,0,Signed,50676.0,565.0,74,220
900813,342935412,37676700,Elementary (primary) school,1,15000000.0,0,Employed person,WORKER,Signed,Two-wheelers,...,261,24,24,52,0,Signed,59017.0,242.0,7,21


In [12]:
# Ratio of an agent's signed-contract count to their lead count; 0 when the
# agent has no leads.
# Vectorised replacement for the former row-wise `apply(axis=1)`: non-positive
# lead counts are masked to NaN, so the division yields NaN there and
# `fillna(0)` turns it into 0 — the same values without a Python-level loop
# over every row.
# NOTE(review): the column name reads "CONVERSATION" (likely meant
# "CONVERSION"); it is kept as-is because other code may reference it. The
# saved output also shows values above 1 (e.g. 34.5), so this is a ratio of two
# independent counts rather than a bounded rate.
final_df['CONVERSATION_RATE'] = (
    final_df['Signed']
    .div(final_df['NUM_LEADS'].where(final_df['NUM_LEADS'] > 0))
    .fillna(0)
)

final_df

Unnamed: 0,SKP_CREDIT_CASE,SKP_CLIENT,NAME_EDUCATION_TYPE,CNT_CHILDREN,AMT_INCOME_MAIN,AMT_INCOME_HOUSEHOLD,NAME_INCOME_TYPE,CODE_PROFESSION,NAME_CREDIT_STATUS,PRODUCT,...,AGE_EMPLOYEE_HIRED,AGE_EMPLOYEE,SA_PROVINCE,FLAG_MANAGER,SIGNED,CODE_SA,NUM_LEADS,Rejected,Signed,CONVERSATION_RATE
0,202732373,12665970,Bachelor's degree,0,7000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,30,40,43,0,Signed,23074.0,121.0,29,66,0.545455
1,149179096,14975594,Elementary (primary) school,1,4000000.0,0,Employed person,SALESMAN,Finished,Consumer Durable,...,22,32,31,1,Signed,20815.0,202.0,52,164,0.811881
2,335925533,126826775,XNA,0,2000000.0,0,XNA,XNA,Rejected,Consumer Durable,...,28,35,52,0,Rejected,37020.0,512.0,115,221,0.431641
3,163357328,38084937,Junior school education,0,5000000.0,0,Employed person,WORKER,Finished,Consumer Durable,...,22,30,56,0,Signed,31181.0,2.0,19,69,34.500000
4,193688464,37761042,Bachelor's degree,0,10000000.0,0,Employed person,OTHER,Finished,Consumer Durable,...,24,32,30,0,Signed,30150.0,14.0,22,88,6.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900810,342934440,84727626,High school education,1,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,28,33,59,0,Signed,43825.0,687.0,62,221,0.321689
900811,342934673,99124783,Junior school education,1,25000000.0,0,Employed person,SALESMAN,Signed,Cash Loan,...,33,37,38,0,Signed,49032.0,2580.0,34,259,0.100388
900812,342934791,6409762,Elementary (primary) school,2,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,...,19,23,5,0,Signed,50676.0,565.0,74,220,0.389381
900813,342935412,37676700,Elementary (primary) school,1,15000000.0,0,Employed person,WORKER,Signed,Two-wheelers,...,24,24,52,0,Signed,59017.0,242.0,7,21,0.086777


In [13]:
# Remove identifier, timestamp/date and helper columns before encoding.
# Rebinding with `errors='ignore'` (instead of `inplace=True`) keeps the cell
# idempotent: re-running it once the columns are gone no longer raises
# KeyError. Trade-off: a misspelled column name is skipped silently.
final_df = final_df.drop(
    columns=[
        'SKP_CREDIT_CASE',
        'APPLY_CONTRACT_TIME',
        'APPROVE_CONTRACT_TIME',
        'SIGN_CONTRACT_TIME',
        'APPLY_EMPLOYEE',
        'SIGN_EMPLOYEE',
        'FIRST_DUE',
        'SECOND_DUE',
        'THIRD_DUE',
        'FOURTH_DUE',
        'HIRING_DATE',
        'LEAVING_DATE',
        'MANAGER_CODE_EMPLOYEE',
        'BIRTH_DATE',
        'CODE_SA',
    ],
    errors='ignore',
)
final_df

Unnamed: 0,SKP_CLIENT,NAME_EDUCATION_TYPE,CNT_CHILDREN,AMT_INCOME_MAIN,AMT_INCOME_HOUSEHOLD,NAME_INCOME_TYPE,CODE_PROFESSION,NAME_CREDIT_STATUS,PRODUCT,AMT_CREDIT,...,WORKING_DAYS,AGE_EMPLOYEE_HIRED,AGE_EMPLOYEE,SA_PROVINCE,FLAG_MANAGER,SIGNED,NUM_LEADS,Rejected,Signed,CONVERSATION_RATE
0,12665970,Bachelor's degree,0,7000000.0,0,Employed person,OTHER,Finished,Consumer Durable,5593000.0,...,2969,30,40,43,0,Signed,121.0,29,66,0.545455
1,14975594,Elementary (primary) school,1,4000000.0,0,Employed person,SALESMAN,Finished,Consumer Durable,3003000.0,...,2800,22,32,31,1,Signed,202.0,52,164,0.811881
2,126826775,XNA,0,2000000.0,0,XNA,XNA,Rejected,Consumer Durable,6890000.0,...,2633,28,35,52,0,Rejected,512.0,115,221,0.431641
3,38084937,Junior school education,0,5000000.0,0,Employed person,WORKER,Finished,Consumer Durable,3300000.0,...,1473,22,30,56,0,Signed,2.0,19,69,34.500000
4,37761042,Bachelor's degree,0,10000000.0,0,Employed person,OTHER,Finished,Consumer Durable,2895000.0,...,1624,24,32,30,0,Signed,14.0,22,88,6.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900810,84727626,High school education,1,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,80000000.0,...,2038,28,33,59,0,Signed,687.0,62,221,0.321689
900811,99124783,Junior school education,1,25000000.0,0,Employed person,SALESMAN,Signed,Cash Loan,30000000.0,...,1458,33,37,38,0,Signed,2580.0,34,259,0.100388
900812,6409762,Elementary (primary) school,2,15000000.0,0,Self-employed person / business owner,OTHER,Signed,Cash Loan,25000000.0,...,1311,19,23,5,0,Signed,565.0,74,220,0.389381
900813,37676700,Elementary (primary) school,1,15000000.0,0,Employed person,WORKER,Signed,Two-wheelers,18000000.0,...,261,24,24,52,0,Signed,242.0,7,21,0.086777


In [14]:
# Label-encode every object-dtype column in place and print each mapping.
# The fitted encoders are kept in `label_encoders` (column name -> encoder) so
# the integer codes can later be decoded with `inverse_transform` or applied
# to new records; previously each encoder was discarded after printing.
label_encoders = {}
for col in final_df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    # Values are cast to str first, so any non-string entries are encoded by
    # their string form.
    final_df[col] = le.fit_transform(final_df[col].astype(str))
    label_encoders[col] = le
    print(f"Mapping for {col}: {dict(zip(le.classes_, le.transform(le.classes_)))}")
final_df.info()

Mapping for NAME_EDUCATION_TYPE: {"Bachelor's degree": 0, 'Elementary (primary) school': 1, 'High school education': 2, 'Junior school education': 3, "Master's degree": 4, 'XNA': 5}
Mapping for NAME_INCOME_TYPE: {'Employed person': 0, 'Person in household': 1, 'Retired person': 2, 'Self-employed person / business owner': 3, 'Student': 4, 'Unemployed': 5, 'XNA': 6}
Mapping for CODE_PROFESSION: {'ADMINISTRATIVE': 0, 'ENGINEER': 1, 'FARMER': 2, 'OTHER': 3, 'SALESMAN': 4, 'SERVICES': 5, 'WORKER': 6, 'XNA': 7}
Mapping for NAME_CREDIT_STATUS: {'Active': 0, 'Approved': 1, 'Cancelled': 2, 'Finished': 3, 'Paid off': 4, 'Rejected': 5, 'Signed': 6, 'Sold': 7, 'Written off': 8}
Mapping for PRODUCT: {'Cash Loan': 0, 'Consumer Durable': 1, 'Credit Card': 2, 'GM Paylater': 3, 'Insurance': 4, 'Two-wheelers': 5, 'XNA': 6}
Mapping for NAME_GENDER: {'Female': 0, 'Male': 1, 'XNA': 2}
Mapping for NAME_SALARY_FREQUENCY: {'Every week paid': 0, 'Irregular salary frequency': 1, 'Monthly paid': 2, 'Once a two w

In [15]:
# Outcome counts per (client, employee) pair plus the share that were signed.
# NOTE(review): the rename assumes the label encoding applied earlier mapped
# 'Rejected' -> 0 and 'Signed' -> 1 in the SIGNED column — confirm against the
# printed mapping.
history = (
    final_df
    .groupby(['SKP_CLIENT', 'CODE_EMPLOYEE'])['SIGNED']
    .value_counts()
    .unstack(fill_value=0)
    .rename(columns={0: 'Rejected', 1: 'Signed'})  # Rename the encoded outcome columns
    .reset_index()
    .assign(
        SUCCESS_RATE=lambda counts: counts['Signed'] / (counts['Signed'] + counts['Rejected'])
    )
)
history

SIGNED,SKP_CLIENT,CODE_EMPLOYEE,Rejected,Signed,SUCCESS_RATE
0,45,23045,0,3,1.0
1,45,109314,0,1,1.0
2,477,40847,1,0,0.0
3,509,38412,0,2,1.0
4,528,113536,1,0,0.0
...,...,...,...,...,...
658361,127663671,59260,0,1,1.0
658362,127663684,59394,0,1,1.0
658363,127663698,58783,0,1,1.0
658364,127663746,33866,0,2,1.0


In [16]:
class SARecommender:
    """Hybrid recommender that ranks sales agents (SAs) for a client.

    Two collaborative-filtering (CF) models from `surprise` (SVD and KNNBasic)
    are trained on (SKP_CLIENT, CODE_EMPLOYEE, SUCCESS_RATE) triples. A
    content-based (CBF) score is the cosine similarity between a standardised
    client feature vector and each standardised SA feature vector.

    NOTE(review): the CBF similarity compares the client vector and the SA
    vector position by position, so both feature lists must have the same
    length and features are paired by list order — confirm the intended
    pairing.
    """

    def __init__(self, history, df, rating_scale=(0, 1), cbf_features_client=None, cbf_features_sa=None,
                 test_size=0.2, random_state=None):
        """Build the CF dataset/split and, when features are given, the CBF profiles.

        Args:
            history: DataFrame with at least the columns 'SKP_CLIENT',
                'CODE_EMPLOYEE' and 'SUCCESS_RATE'.
            df: DataFrame holding 'SKP_CLIENT', 'CODE_EMPLOYEE' and every
                column named in the two feature lists.
            rating_scale: (min, max) passed to `surprise.Reader`.
            cbf_features_client: Column names forming the client profile.
            cbf_features_sa: Column names forming the SA profile.
            test_size: Share of interactions held out for evaluation.
            random_state: Seed forwarded to the train/test split and to SVD.
                None (the default) keeps the previous behaviour of relying on
                the global RNG state.
        """
        # Historical data for Collaborative Filtering (CF)
        self.history_df = history[['SKP_CLIENT', 'CODE_EMPLOYEE', 'SUCCESS_RATE']].copy()
        self.reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(self.history_df, self.reader)
        self.trainset, self.testset = train_test_split(
            self.data, test_size=test_size, random_state=random_state
        )

        # CF models
        self.svd = SVD(random_state=random_state)
        self.knn = None
        self.pred_svd = None
        self.pred_knn = None

        # Full data for Content-Based Filtering (CBF)
        self.df_all = df.copy()
        self.client_features = cbf_features_client
        self.sa_features = cbf_features_sa
        self.client_profile = None
        self.sa_profiles = None
        # Not referenced by any method of this class; kept so the attribute
        # still exists for external code that may read it.
        self.scaler = StandardScaler()

        # Always define the CBF state so the "is None" guards in the recommend
        # methods raise their ValueError instead of an AttributeError when no
        # feature columns were supplied.
        self.client_scaler = None
        self.sa_scaler = None
        self.client_profile_scaled = None
        self.sa_profiles_scaled = None

        # Prepare CBF profiles if feature columns are provided
        if cbf_features_client and cbf_features_sa:
            self._prepare_cbf_profiles()

    def _prepare_cbf_profiles(self):
        """Build and standardise one feature row per client and per SA.

        The first row seen for each SKP_CLIENT / CODE_EMPLOYEE in `df_all`
        is used as that entity's profile; each profile table is scaled with
        its own StandardScaler.
        """
        self.client_profile = self.df_all.groupby('SKP_CLIENT')[self.client_features].first()
        self.sa_profiles = self.df_all.groupby('CODE_EMPLOYEE')[self.sa_features].first()

        self.client_scaler = StandardScaler()
        self.sa_scaler = StandardScaler()

        self.client_profile_scaled = pd.DataFrame(
            self.client_scaler.fit_transform(self.client_profile),
            index=self.client_profile.index,
            columns=self.client_profile.columns
        )
        self.sa_profiles_scaled = pd.DataFrame(
            self.sa_scaler.fit_transform(self.sa_profiles),
            index=self.sa_profiles.index,
            columns=self.sa_profiles.columns
        )

    def train_svd(self):
        """Fit SVD on the train split, predict the test split and print its RMSE."""
        self.svd.fit(self.trainset)
        self.pred_svd = self.svd.test(self.testset)
        print("✅ SVD RMSE:", accuracy.rmse(self.pred_svd, verbose=True))

    def train_knn(self, sim_name='pearson', user_based=False):
        """Fit KNNBasic on the train split, predict the test split and print its RMSE.

        Args:
            sim_name: Similarity measure name passed to surprise's `sim_options`.
            user_based: Passed to surprise's `sim_options`.
        """
        sim_options = {'name': sim_name, 'user_based': user_based}
        self.knn = KNNBasic(sim_options=sim_options)
        self.knn.fit(self.trainset)
        self.pred_knn = self.knn.test(self.testset)
        print("✅ KNN RMSE:", accuracy.rmse(self.pred_knn, verbose=True))

    def combine_predictions_avg(self, w_svd=0.5, w_knn=0.5):
        """Blend the stored SVD and KNN test predictions with a weighted sum.

        Args:
            w_svd: Weight of the SVD estimate.
            w_knn: Weight of the KNN estimate.

        Returns:
            DataFrame with columns SKP_CLIENT, CODE_EMPLOYEE, COMBINED_EST.

        Raises:
            ValueError: If either model has not been trained yet.
        """
        if self.pred_svd is None or self.pred_knn is None:
            raise ValueError("❌ You must train both SVD and KNN before combining.")
        combined = []
        for p_svd, p_knn in zip(self.pred_svd, self.pred_knn):
            # Both lists come from the same test split, so they must align pairwise.
            assert p_svd.uid == p_knn.uid and p_svd.iid == p_knn.iid, \
                "SVD and KNN predictions are not aligned"
            combined_est = w_svd * p_svd.est + w_knn * p_knn.est
            combined.append((p_svd.uid, p_svd.iid, combined_est))
        return pd.DataFrame(combined, columns=['SKP_CLIENT', 'CODE_EMPLOYEE', 'COMBINED_EST'])

    def combine_predictions_rank(self, top_n=3):
        """Pick each client's top-N SAs by the sum of their SVD and KNN ranks.

        Ranks are computed per client on the stored test predictions (higher
        estimate = better rank, ties share the minimum rank); a lower
        `total_rank` is better.

        Args:
            top_n: Number of SAs to keep per client.

        Returns:
            DataFrame with columns SKP_CLIENT, CODE_EMPLOYEE, total_rank.

        Raises:
            ValueError: If either model has not been trained yet.
        """
        # Same guard as combine_predictions_avg; without it an untrained model
        # surfaced as an opaque TypeError when iterating over None.
        if self.pred_svd is None or self.pred_knn is None:
            raise ValueError("❌ You must train both SVD and KNN before combining.")
        df_svd = pd.DataFrame([
            {'SKP_CLIENT': p.uid, 'CODE_EMPLOYEE': p.iid, 'est_svd': p.est}
            for p in self.pred_svd
        ])
        df_knn = pd.DataFrame([
            {'SKP_CLIENT': p.uid, 'CODE_EMPLOYEE': p.iid, 'est_knn': p.est}
            for p in self.pred_knn
        ])
        df = pd.merge(df_svd, df_knn, on=['SKP_CLIENT', 'CODE_EMPLOYEE'])
        df['rank_svd'] = df.groupby('SKP_CLIENT')['est_svd'].rank(ascending=False, method='min')
        df['rank_knn'] = df.groupby('SKP_CLIENT')['est_knn'].rank(ascending=False, method='min')
        df['total_rank'] = df['rank_svd'] + df['rank_knn']
        top_n_df = (
            df.sort_values(['SKP_CLIENT', 'total_rank'])
              .groupby('SKP_CLIENT')
              .head(top_n)
              .reset_index(drop=True)
        )
        return top_n_df[['SKP_CLIENT', 'CODE_EMPLOYEE', 'total_rank']]

    def recommend_for_client(self, client_id, top_n=10, client_feature_row=None,
                             w_svd=0.5, w_knn=0.5, w_cf=0.5, w_cbf=0.5):
        """Recommend SAs the client has no history with, using CF and CBF together.

        For a client that does not appear in `df_all`, falls back to
        `recommend_by_cbf`.

        Args:
            client_id: SKP_CLIENT value to recommend for.
            top_n: Number of recommendations to return.
            client_feature_row: Raw feature values (in `cbf_features_client`
                order) forwarded to the CBF fallback for an unseen client.
                Without it the fallback has no features to work with.
            w_svd: Weight of the SVD estimate inside the CF score.
            w_knn: Weight of the KNN estimate inside the CF score.
            w_cf: Weight of the CF score in the final score.
            w_cbf: Weight of the CBF similarity in the final score.

        Returns:
            DataFrame with columns CODE_EMPLOYEE, SVD_EST, KNN_EST, CBF_SIM,
            FINAL_SCORE, sorted by FINAL_SCORE descending (or the
            `recommend_by_cbf` frame for an unseen client).

        Raises:
            ValueError: If the CF models are untrained or CBF profiles are missing.
        """
        if client_id not in self.df_all['SKP_CLIENT'].values:
            print(f"⚠️ Client {client_id} is unseen, fallback to CBF...")
            return self.recommend_by_cbf(client_id, top_n, client_feature_row)

        if self.pred_svd is None or self.pred_knn is None:
            raise ValueError("❌ You must train both SVD and KNN before making recommendations.")
        if self.client_profile_scaled is None or self.sa_profiles_scaled is None:
            raise ValueError("❌ Client and SA features are missing for CBF.")

        result_columns = ['CODE_EMPLOYEE', 'SVD_EST', 'KNN_EST', 'CBF_SIM', 'FINAL_SCORE']

        all_employees = set(self.df_all['CODE_EMPLOYEE'].unique())
        seen_employees = set(self.history_df[self.history_df['SKP_CLIENT'] == client_id]['CODE_EMPLOYEE'].unique())
        unseen_employees = list(all_employees - seen_employees)
        if not unseen_employees:
            # Nothing left to score; avoid calling cosine_similarity on an empty matrix.
            return pd.DataFrame(columns=result_columns)

        # Get client's vector from CBF profile
        client_vector = self.client_profile_scaled.loc[[client_id]].values
        cbf_similarities = cosine_similarity(client_vector, self.sa_profiles_scaled.loc[unseen_employees].values)[0]

        preds = []
        for emp, est_cbf in zip(unseen_employees, cbf_similarities):
            try:
                est_svd = self.svd.predict(client_id, emp).est
                est_knn = self.knn.predict(client_id, emp).est
            except Exception:
                # Best effort: skip an SA whose prediction fails, but no longer
                # swallow KeyboardInterrupt / SystemExit as the bare except did.
                continue
            est_combined_cf = w_svd * est_svd + w_knn * est_knn
            final_score = w_cf * est_combined_cf + w_cbf * est_cbf
            preds.append((emp, est_svd, est_knn, est_cbf, final_score))

        pred_df = pd.DataFrame(preds, columns=result_columns)
        return pred_df.sort_values(by='FINAL_SCORE', ascending=False).head(top_n).reset_index(drop=True)

    def recommend_by_cbf(self, client_id, top_n=10, client_feature_row=None):
        """Recommend SAs by content-based similarity only (new clients / fallback).

        Args:
            client_id: SKP_CLIENT value; if it has a stored profile, that
                profile is used.
            top_n: Number of recommendations to return.
            client_feature_row: Raw (unscaled) feature values in
                `cbf_features_client` order, used when the client has no
                stored profile.

        Returns:
            DataFrame with columns CODE_EMPLOYEE and SIMILARITY, most similar first.

        Raises:
            ValueError: If CBF profiles were not prepared, or the client has
                neither a stored profile nor a supplied feature row.
        """
        if self.client_profile_scaled is None or self.sa_profiles_scaled is None:
            raise ValueError("❌ You must provide both client and SA features for CBF.")

        if client_id in self.client_profile_scaled.index:
            client_vector = self.client_profile_scaled.loc[[client_id]].values
        elif client_feature_row is not None:
            # New client: require manual feature input. Wrap it in a one-row
            # frame carrying the training column names so the scaler, which was
            # fitted on a named DataFrame, receives matching feature names.
            new_client = pd.DataFrame([client_feature_row], columns=self.client_features)
            client_vector = self.client_scaler.transform(new_client)
        else:
            raise ValueError("❌ No feature information found for new client.")

        sims = cosine_similarity(client_vector, self.sa_profiles_scaled.values)[0]
        top_indices = np.argsort(sims)[::-1][:top_n]
        top_sas = self.sa_profiles_scaled.index[top_indices]
        return pd.DataFrame({
            'CODE_EMPLOYEE': top_sas,
            'SIMILARITY': sims[top_indices]
        })

In [17]:
# Content-based feature columns for clients:
#   NAME_GENDER               - client gender (label-encoded column)
#   AGE_CLIENT                - client's age
#   CNT_CHILDREN              - number of children
#   AMT_INCOME_MAIN           - main income
#   CNT_PERSON_DEPENDENT      - number of dependents
#   AVG_SESSION_PER_WEEK_2025 - activity level (proxy for engagement/experience)
cbf_client_feats = [
    'NAME_GENDER', 'AGE_CLIENT', 'CNT_CHILDREN',
    'AMT_INCOME_MAIN', 'CNT_PERSON_DEPENDENT', 'AVG_SESSION_PER_WEEK_2025',
]

# Content-based feature columns for sales agents:
#   GENDER, AGE_EMPLOYEE - agent's gender and age
#   LEVEL_SA             - level (proxy for skill/ability)
#   WORKING_DAYS         - work experience in days
#   FLAG_MANAGER         - management flag (proxy for responsibility)
#   LEVEL_SA (again)     - listed a second time
# NOTE(review): the repeated LEVEL_SA keeps this list at six entries, the same
# length as the client list, which the cosine similarity between the two
# vectors needs; it also counts LEVEL_SA twice — confirm this is intended.
cbf_sa_feats = [
    'GENDER', 'AGE_EMPLOYEE', 'LEVEL_SA',
    'WORKING_DAYS', 'FLAG_MANAGER', 'LEVEL_SA',
]

# Build the recommender from the client-agent interaction history (CF) and the
# full feature table (CBF); SUCCESS_RATE is rated on a 0-1 scale.
recommender = SARecommender(
    history=history,
    df=final_df,
    rating_scale=(0, 1),
    cbf_features_client=cbf_client_feats,
    cbf_features_sa=cbf_sa_feats,
)

# Train the collaborative-filtering models (SVD first, then KNN).
recommender.train_svd()
recommender.train_knn()

RMSE: 0.4236
✅ SVD RMSE: 0.4235935664468589
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.4441
✅ KNN RMSE: 0.4441197559381672


In [18]:
# Hybrid recommendation for a client that already appears in the data
top_SA = recommender.recommend_for_client(84954403)
print("✅ Recommendation for existing client:", top_SA, sep="\n")


✅ Recommendation for existing client:
   CODE_EMPLOYEE   SVD_EST  KNN_EST   CBF_SIM  FINAL_SCORE
0          43769  0.866114  0.70953  0.926002     0.856912
1          39030  0.809503  0.70953  0.929646     0.844581
2          56222  0.802514  0.70953  0.923468     0.839745
3          39941  0.817022  0.70953  0.911750     0.837513
4          46345  0.798381  0.70953  0.907735     0.830845
5          43370  0.817343  0.70953  0.897886     0.830661
6          58204  0.852900  0.70953  0.879736     0.830476
7          43616  0.825176  0.70953  0.890806     0.829079
8          51971  0.738820  0.70953  0.933394     0.828784
9          46130  0.718400  0.70953  0.943526     0.828745


In [19]:
# Cold start: recommend for a client with no stored profile.
# The raw feature values are supplied by hand and must follow the order of
# cbf_client_feats.
client_feature_vector = [1, 30, 0, 15_000_000, 0, 5]
top_SA = recommender.recommend_by_cbf(101, client_feature_row=client_feature_vector)
print("✅ Recommendation for new client (cold start):", top_SA, sep="\n")

✅ Recommendation for new client (cold start):
   CODE_EMPLOYEE  SIMILARITY
0          51769    0.645110
1          51382    0.645098
2          52474    0.645012
3          51020    0.644952
4          50951    0.644925
5          50670    0.644833
6          50171    0.644681
7          53960    0.644187
8          54437    0.643776
9          51774    0.643561


