## Environment Setting

In [1]:
#!pip install sentence-transformers

In [2]:
#!pip install xgboost

In [3]:
#!pip install imbalanced-learn

In [4]:
#!pip install tensorflow

In [50]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import tensorflow
from tensorflow import keras
from sentence_transformers import SentenceTransformer
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from sentence_transformers import SentenceTransformer
from tqdm import tqdm

## Pipeline Construction

In [51]:
def collaborative_filtering_w(templates,cases):
    temp = templates.reset_index(drop = True)
    case = cases.reset_index(drop = True)
    # temp = templates
    # case = cases
    n = len(temp)
    m = len(case)
    #print('Number of template for district ',Corp, ' is ',n)
    #print('Number of cases for this district is ',m)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    df = pd.DataFrame(mat, columns = temp_list)
    
    for i in tqdm(range(m-1)):
        similarities = []
        for j in range(m-1):
            if i != j:
                similarity = np.dot(case.qq_embeddings.values[i],case.qq_embeddings.values[j].T).item()
                similarities.append((j,similarity))
        similarities.sort(key = lambda X:X[1],reverse = True)
        similarities = similarities[:10]
        for pair in similarities:
            if not np.isnan(case.TemplateId[pair[0]]):
                df.loc[i,case.TemplateId[pair[0]]] += pair[1]
    
    
    return df

In [52]:
def collaborative_filtering_w_corp(templates,cases,Corp):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    n = len(temp)
    m = len(case)
    #print('Number of template for district ',Corp, ' is ',n)
    #print('Number of cases for this district is ',m)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    df = pd.DataFrame( mat, columns = temp_list)
    
    for i in tqdm(range(m-1)):
        similarities = []
        for j in range(m-1):
            if i != j:
                similarity = np.dot(case.qq_embeddings.values[i],case.qq_embeddings.values[j].T).item()
                similarities.append((j,similarity))
        similarities.sort(key = lambda X:X[1],reverse = True)
        similarities = similarities[:10]
        for pair in similarities:
            if not np.isnan(case.TemplateId[pair[0]]):
                df.loc[i,case.TemplateId[pair[0]]] += pair[1]
    
    
    return df

In [53]:
def jaccard_similarity_sentence(sentence1, sentence2):
    # Tokenize the sentences into words
    words1 = set(sentence1.split())
    words2 = set(sentence2.split())

    # Compute the intersection and union
    intersection = words1.intersection(words2)
    union = words1.union(words2)

    # Calculate Jaccard Similarity
    similarity = len(intersection) / len(union)
    
    return similarity

In [54]:
def jaccard_matrix(templates, cases):
    #temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    #case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    temp = templates
    case = cases
    n = len(temp)
    m = len(case)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    similarity_matrix = pd.DataFrame( mat, columns = temp_list)
    for case_id, case_body in tqdm(zip(case.index, case['cleaned_description'])):
        for template_id, template_body in zip(temp['TemplateId'], temp['cleaned_MessageBody']):
            similarity = jaccard_similarity_sentence(case_body, template_body)
            similarity_matrix.loc[case_id, template_id] = similarity
    return similarity_matrix

In [55]:
def jaccard_matrix_corp(templates, cases, Corp):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    
    n = len(temp)
    m = len(case)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    similarity_matrix = pd.DataFrame( mat, columns = temp_list)
    for case_id, case_body in tqdm(zip(case.index, case['cleaned_description'])):
        for template_id, template_body in zip(temp['TemplateId'], temp['cleaned_MessageBody']):
            similarity = jaccard_similarity_sentence(case_body, template_body)
            similarity_matrix.loc[case_id, template_id] = similarity
    return similarity_matrix

In [56]:
def compute_similarity_matrix(templates, cases):

    #temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    #case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    
    temp = templates
    case = cases
    n = len(temp)
    m = len(case)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    similarity_matrix = pd.DataFrame( mat, columns = temp_list)
    for case_id, case_embedding in tqdm(zip(case.index, case['qa_embeddings'])):
        for template_id, template_embedding in zip(temp['TemplateId'], temp['embeddings']):
            similarity = np.dot(case_embedding, template_embedding.T).item()
            similarity_matrix.loc[case_id, template_id] = similarity
    return similarity_matrix

In [57]:
def compute_similarity_matrix_corp(templates, cases,Corp):

    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    n = len(temp)
    m = len(case)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    similarity_matrix = pd.DataFrame( mat, columns = temp_list)
    for case_id, case_embedding in tqdm(zip(case.index, case['qa_embeddings'])):
        for template_id, template_embedding in zip(temp['TemplateId'], temp['embeddings']):
            similarity = np.dot(case_embedding, template_embedding.T).item()
            similarity_matrix.loc[case_id, template_id] = similarity
    return similarity_matrix

In [58]:
def classification_df(templates,cases):
    #temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    #case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    temp = templates
    case = cases
    
    similarity_df = compute_similarity_matrix(temp, case)
    collaborative_df = collaborative_filtering_w(temp,case)
    jaccard_df = jaccard_matrix(temp,case)
    
    n = len(temp)
    m = len(case)
    temp_list = list(temp.TemplateId)
    
    mat = [[0 for _ in range(4)] for __ in range(m*n)]
    
    df = pd.DataFrame(mat,columns = ['similarity_score','collaborative_score','jaccard_score','match'])
    
    cnt = 0
    
    for i in tqdm(range(m)):
        for temp in temp_list:
            df.loc[cnt,'similarity_score'] = similarity_df.loc[i,temp]
            df.loc[cnt,'collaborative_score'] = collaborative_df.loc[i,temp]
            df.loc[cnt,'jaccard_score'] = jaccard_df.loc[i,temp]
            if (not np.isnan(case.TemplateId[i])) and case.TemplateId[i] == temp:
                df.loc[cnt,'match'] = 1
            cnt += 1
    return df

In [59]:
def classification_df_corp(templates,cases,Corp):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    
    similarity_df = compute_similarity_matrix_corp(temp, case, Corp)
    collaborative_df = collaborative_filtering_w_corp(temp,case,Corp)
    jaccard_df = jaccard_matrix_corp(temp,case,Corp)
    
    n = len(temp)
    m = len(case)
    temp_list = list(temp.TemplateId)
    
    mat = [[0 for _ in range(4)] for __ in range(m*n)]
    
    df = pd.DataFrame(mat,columns = ['similarity_score','collaborative_score','jaccard_score','match'])
    
    cnt = 0
    
    for i in tqdm(range(m)):
        for temp in temp_list:
            df.loc[cnt,'similarity_score'] = similarity_df.loc[i,temp]
            df.loc[cnt,'collaborative_score'] = collaborative_df.loc[i,temp]
            df.loc[cnt,'jaccard_score'] = jaccard_df.loc[i,temp]
            if (not np.isnan(case.TemplateId[i])) and case.TemplateId[i] == temp:
                df.loc[cnt,'match'] = 1
            cnt += 1
    return df

In [60]:
def xgboost_performance(templates_train,templates_matched,cases_train,cases_test,corp):
    res_df_train = classification_df(templates_train, cases_train)
    X_train = res_df_train[['similarity_score', 'collaborative_score', 'jaccard_score']]
    y_train = res_df_train['match']
    res_df_test = classification_df_corp(templates_matched,cases_test,corp)
    X_test = res_df_test[['similarity_score', 'collaborative_score', 'jaccard_score']]
    y_test = res_df_test['match']
    
    scale_pos_weight_value = sum(y_train == 0) / sum(y_train == 1)
    
    model = xgb.XGBClassifier(objective='binary:logistic', 
                        use_label_encoder=False, 
                        eval_metric='logloss',
                        scale_pos_weight=scale_pos_weight_value)
    model.fit(X_train, y_train)
    probabilities = model.predict_proba(X_test)
    
    
    threshold = 0.5
    y_pred = (probabilities[:, 1] > threshold).astype(int)
    
    acc = accuracy_score(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
   
    print(f'The accuracy of recommendation for district {corp} is {acc}')
    print('Confusion matrix is', conf_mat)
    print('Classification report:', class_report)
    
    save_model(model,'xgb_finetune.joblib')
    return acc, conf_mat, class_report,  model

In [61]:
from joblib import dump, load
def save_model(model, filename):
    dump(model, filename)

## Dataset Split

#### Matched

In [85]:
template_matched = pd.read_parquet('template_matched_original_embeds.parquet')
case_matched = pd.read_parquet('case_matched_original_embeds.parquet')

In [86]:
comm = pd.read_csv('comm_clean.csv', encoding='ISO-8859-1')

In [87]:
case_matched[case_matched['CorpNo'] == 87168]

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
45354,2988576.0,87168.0,My sons chrome book is missing screws at the b...,5580.0,"Hello Amber sloan,Thank you for contacting YCS...","[0.1945439, -0.3965842, -0.3225592, 0.06263387...","[0.041880794, -0.03763687, 0.0007626698, -0.00..."
46883,3101356.0,87168.0,It be giving me a blue error code bruh,5579.0,Thank you for contacting YCSD IT. To best serv...,"[-0.18941243, -0.7439538, -0.28012222, 0.15651...","[-0.04435112, -0.106237836, 0.025605367, -0.02..."
52783,3639244.0,87168.0,"My name is Matthew (Wally) Atkins, and I am an...",5578.0,"Hello Wally Atkins,Thank you for contacting YC...","[0.15315941, -0.042141236, -0.02796632, 0.4381...","[0.024075689, 0.06203446, 0.0034855173, 0.0605..."
53600,3660236.0,87168.0,"From: Gary Porter Sent: Wednesday, November 2...",6908.0,The division is in receipt of your Freedom of ...,"[0.24029946, 0.13375609, 0.041198798, 0.214174...","[0.055929154, 0.065867625, 0.016963314, 0.0649..."
55373,3786777.0,87168.0,"Get From: C H Wednesday, January 18, 2023 1:18...",6908.0,"Good afternoon,The York County School Division...","[0.19229299, 0.37751985, -0.012165331, 0.29312...","[0.037749015, 0.09961883, 0.018189006, 0.05003..."
55711,3822414.0,87168.0,I am looking for any and all records pertainin...,6816.0,The division is in receipt of your Freedom of ...,"[0.28585905, 0.40384594, -0.0006786055, 0.2979...","[0.033634655, 0.1014754, 0.04145168, 0.0747476..."
55826,3827391.0,87168.0,"From: Shandor, Victor Sent: Monday, January 3...",6816.0,The division is in receipt of your Freedom of ...,"[0.23029499, 0.18329237, -0.14902958, 0.434772...","[0.021085978, 0.08477149, 0.025610175, 0.09052..."
56876,3892148.0,87168.0,"From: Sean McGoey Wednesday, March 01, 2023 1...",6816.0,"Good afternoon,The division is in receipt of y...","[0.16681203, 0.5552059, -0.07922218, 0.2313684...","[0.04442308, 0.10246484, 0.013235976, 0.016555..."


In [88]:
#template_matched, case_matched = filter_data(*read_data(templates, comm, cases))

In [89]:
template_matched.shape

(5971, 5)

In [90]:
case_matched.shape

(57257, 7)

In [91]:
template_matched.head(3)

Unnamed: 0,TemplateId,Name,MainCorpNo,cleaned_MessageBody,embeddings
0,1,title,11918,[[Click here]]Login into account hello whats u...,"[0.05928093, -0.40406442, -0.20824255, 0.23555..."
1,3,"ipsum pulvinar sit amet. Nulla tortor augue, ...",11918,Login in to account [[Click here]] Sed consect...,"[-0.044206396, -0.018609809, -0.3228146, 0.162..."
2,4,Generated 150 paragraphsffffffffffffffffffffff...,11918,"ndimentum turpis urna, eu facilisis leo rutru...","[0.12639701, -0.15336296, -0.2961759, 0.098391..."


In [92]:
def select_top_10_percent(df):  
    sorted_df = df['TemplateId'].value_counts().reset_index()
    sorted_df.columns = ['TemplateId', 'count']
    top_10_percent = sorted_df.head(round(0.1 * len(sorted_df)))
    return top_10_percent['TemplateId'].tolist()

# Group by corpno and apply the function to each district
corp_case_num = case_matched.groupby('CorpNo')['CaseID'].count().reset_index()
corp_case_num = corp_case_num.rename(columns={'CaseID': 'CaseID_count'})
corp_case_num = corp_case_num.sort_values(by='CaseID_count', ascending=False)
big_district = corp_case_num[corp_case_num.CaseID_count>=100]['CorpNo']
case_matched_f = case_matched[case_matched['CorpNo'].isin(big_district)]

selected_templates = case_matched_f.groupby('CorpNo').apply(select_top_10_percent)

template_id_select = [template_id for template_ids in selected_templates for template_id in template_ids]
len(template_id_select)

266

In [93]:
template_train = template_matched[template_matched['TemplateId'].isin(template_id_select)].reset_index(drop=True)

In [94]:
template_train.shape

(266, 5)

In [95]:
comm_filter= comm[comm['TemplateId'].isin(template_id_select)]
comm_filter.head(5)

Unnamed: 0.1,Unnamed: 0,CaseID,CaseCommunicationId,TemplateId,cleaned_message
25,2113624,725075.0,742288.0,1213.0,"Hello marchello minter, Thank you for taking t..."
37,2113644,725100.0,741621.0,1083.0,"Hello Carly,To prevent students from incidenta..."
59,2113690,725156.0,741788.0,1244.0,You've just received your first Splash Card an...
60,2113691,725157.0,741772.0,1244.0,You've just received your first Splash Card an...
122,2113783,725274.0,743237.0,1358.0,Thank you for contacting the Transportation De...


In [96]:
case_id_select = comm_filter['CaseID'].tolist()

In [97]:
case_test = case_matched[~case_matched['CaseID'].isin(case_id_select)].reset_index(drop=True)

In [98]:
case_train = case_matched[case_matched['CaseID'].isin(case_id_select)].reset_index(drop=True)
case_train.shape

(35744, 7)

In [99]:
case_train.head(3)

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
0,725075.0,47139.0,My name is marchello minter and I'm interested...,1213.0,"Hello marchello minter, Thank you for taking t...","[-0.17718153, -0.04370393, -0.1578665, -0.0286...","[-0.052830666, -0.0030610943, 0.019983372, -0...."
1,725100.0,41272.0,Request to join Classroom Teachers Group.,1083.0,"Hello Carly,To prevent students from incidenta...","[-0.04608875, -0.017934788, -0.34587896, 0.024...","[0.06652652, 0.033666395, -0.031127302, 0.0591..."
2,725156.0,44070.0,Preferred first name:Student ID:Building:Build...,1244.0,You've just received your first Splash Card an...,"[-0.07711973, -0.0409059, -0.2779362, -0.00663...","[0.0118431235, 0.02057332, -0.020808859, 0.000..."


In [100]:
case_test.shape

(21513, 7)

In [101]:
# train classification df
#res_df = classification_df(template_train, case_train) 

# test classification df 
res_df = classification_df_corp(template_matched,case_test, 45671)

1457it [00:17, 85.25it/s]
100%|██████████| 1456/1456 [00:16<00:00, 90.60it/s]
1457it [00:29, 49.48it/s]
100%|██████████| 1457/1457 [01:01<00:00, 23.58it/s]


In [103]:
res_df.to_parquet('classification_df_test_raw.parquet')

In [102]:
res_df.shape

(734328, 4)

#### Unmatched dataset

In [53]:
template_unmatched, case_unmatched = na_data(*read_data(templates, comm, cases))

In [169]:
case_unmatched.shape

(1190155, 7)

In [170]:
template_unmatched.shape

(5971, 5)

In [171]:
case_unmatched_44070 = case_unmatched[case_unmatched.CorpNo == 44070.0]
case_unmatched_44070.shape

(34328, 7)

## Model Training

In [35]:
corp_list = [44070.0]

In [31]:
import torch

# Check PyTorch Version
print("PyTorch Version: ", torch.__version__)

# Check GPU Availability
print("GPU ", "available (YES!)" if torch.cuda.is_available() else "not available :(")

# Get GPU Name
if torch.cuda.is_available():
    print("GPU name: ", torch.cuda.get_device_name(0))

PyTorch Version:  2.0.0+cu118
GPU  available (YES!)
GPU name:  Tesla T4


In [46]:
%%time
acc_xgb, conf_mat_xgb, class_report_xgb, XGB = xgboost_performance(template_train,template_matched,case_train,case_test,45671)


The accuracy of recommendation for district 45671 is 0.7137777668834635
Confusion matrix is [[523483 209388]
 [   793    664]]
Classification report:               precision    recall  f1-score   support

           0       1.00      0.71      0.83    732871
           1       0.00      0.46      0.01      1457

    accuracy                           0.71    734328
   macro avg       0.50      0.59      0.42    734328
weighted avg       1.00      0.71      0.83    734328

CPU times: user 7h 18min 54s, sys: 2min 11s, total: 7h 21min 5s
Wall time: 7h 17min 57s


In [75]:
case_matched[case_matched['CorpNo'] == 44070].head(1)

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
2,725114.0,44070.0,"Hi,The dining plan that my son wanted to enrol...",1241.0,Candi-I found Will Eichstaedt to be a freshman...,"[0.013114568, -0.40650102, -0.070455566, 0.207...","[-0.0005268336, 0.020311702, 0.018932767, -0.0..."


#### MLP

## Model Validation

In [124]:
def collaborative_filtering_test(templates,cases,case_test,Corp):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    n = len(temp)
    m = len(case)
    p = len(case_test)
    
    mat = [[0 for _ in range(n)] for __ in range(p)]
    
    temp_list = list(temp.TemplateId)
    
    df = pd.DataFrame( mat, columns = temp_list)
    
    for i in range(p):
        similarities = []
        for j in range(m):
            similarity = np.dot(case.qq_embeddings[i],case.qq_embeddings[j].T).item()
            similarities.append((j,similarity))
        similarities.sort(key = lambda X:X[1],reverse = True)
        similarities = similarities[:10]
        for pair in similarities:
            if not np.isnan(case.TemplateId[pair[0]]):
                df.loc[i,case.TemplateId[pair[0]]] += pair[1]
    
    
    return df

In [125]:
def popularity_test(templates,cases,case_test,Corp):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    n = len(temp)
    m = len(case)
    p = len(case_test)
    
    mat = [[0 for _ in range(n)] for __ in range(p)]
    
    temp_list = list(temp.TemplateId)
    
    df = pd.DataFrame( mat, columns = temp_list)
    
    freq_count = cases.TemplateId.value_counts()
    
    for i in range(p):
        for template in temp_list:
            if template in freq_count.keys():
                df.loc[i,template] = freq_count[template]
    return df 

In [126]:
def compute_similarity_matrix_test(templates, case_test, Corp):

    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    # case_test['qa_embeddings'] = case_test.cleaned_message.apply(lambda X: model.encode(X))
    
    n = len(temp)
    m = len(case_test)
    
    mat = [[0 for _ in range(n)] for __ in range(m)]
    
    temp_list = list(temp.TemplateId)
    
    similarity_matrix = pd.DataFrame( mat, columns = temp_list)
    for case_id, case_embedding in zip(case_test.index, case_test['qa_embeddings']):
        for template_id, template_embedding in zip(temp['TemplateId'], temp['embeddings']):
            similarity = np.dot(case_embedding, template_embedding.T).item()
            similarity_matrix.loc[case_id, template_id] = similarity
    return similarity_matrix

In [127]:
def classification_df_test(templates,cases,case_test,Corp):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    case = cases[cases.CorpNo == Corp].reset_index(drop = True)
    
    similarity_df = compute_similarity_matrix_test(temp, case_test, Corp)
    collaborative_df = collaborative_filtering_test(temp,case,case_test,Corp)
    popularity_df = popularity_test(temp,case,case_test,Corp)
    
    n = len(temp)
    m = len(case_test)
    temp_list = list(temp.TemplateId)
    
    mat = [[0 for _ in range(4)] for __ in range(m*n)]
    
    df = pd.DataFrame(mat,columns = ['similarity_score','collaborative_score','popularity_score','match'])
    
    cnt = 0
    
    for i in range(m):
        for temp in temp_list:
            df.loc[cnt,'similarity_score'] = similarity_df.loc[i,temp]
            df.loc[cnt,'collaborative_score'] = collaborative_df.loc[i,temp]
            df.loc[cnt,'popularity_score'] = popularity_df.loc[i,temp]
            if (not np.isnan(case.TemplateId[i])) and case.TemplateId[i] == temp:
                df.loc[cnt,'match'] = 1
            cnt += 1
    return df

In [128]:
def show_result(templates,cases,case_test,Corp, model):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    res_df = classification_df_test(templates,cases,case_test,Corp)
    X = res_df[['similarity_score','collaborative_score','popularity_score']]
    Y = res_df['match']
    
    model_logistic = model
    
    m = len(case_test)
    n = len(temp)
    temp_list = list(temp.TemplateId)
    
    probabilities = model_logistic.predict_proba(X)
    
    for i in range(m-1):
        if not np.isnan(case_test.loc[i,'TemplateId']):
            print('The true template for this case is',case_test.loc[i,'TemplateId'])
        cnt = 0
        print('The message for this case is '+ case_test.loc[i, 'cleaned_message'])
        prob_list = []
        for j in range(n*i,n*i+n):
            prob_list.append([probabilities[j][1],cnt])
            cnt += 1
        sorted_list = sorted(prob_list,key = lambda X:X[0],reverse =True)
        if n >=5:
            rec_list = sorted_list[:5]
        else:
            rec_list = sorted_list
        
        print('Recommended templates are:')
        for j in range(len(rec_list)):
            if rec_list[j][0]>=0.5:
                print('Template id :' , temp_list[rec_list[j][1]])
                print('Message body of this template is :', temp.loc[rec_list[j][1],'cleaned_MessageBody'])
        print()

In [130]:
def XGB_SHOW(templates,cases,case_test,Corp, model):
    temp = templates[templates.MainCorpNo == Corp].reset_index(drop = True)
    res_df = classification_df_test(templates,cases,case_test,Corp)
    X = res_df[['similarity_score','collaborative_score','popularity_score']]
    Y = res_df['match']
    
    model_xgb = model
    
    m = len(case_test)
    n = len(temp)
    temp_list = list(temp.TemplateId)

    probabilities = model_xgb.predict_proba(X)
    
    
    for i in range(m-1):
        if not np.isnan(case_test.loc[i,'TemplateId']):
            print('The true template for this case is: ',case_test.loc[i,'TemplateId'])
            print()
        cnt = 0
        print('The description for this case is: ', case_test.loc[i,'cleaned_description'])
        print()
        print('The message for this case is '+ case_test.loc[i, 'cleaned_message'])
        prob_list = []
        for j in range(n*i,n*i+n):
            prob_list.append([probabilities[j][1],cnt])
            cnt += 1
        sorted_list = sorted(prob_list,key = lambda X:X[0],reverse =True)
        if n >=5:
            rec_list = sorted_list[:5]
        else:
            rec_list = sorted_list
        print()
        print('Recommended templates are:')
        for j in range(len(rec_list)):
            if rec_list[j][0]>=0.5:
                print('Template id :' , temp_list[rec_list[j][1]])
                print('Message body of this template is :', temp.loc[rec_list[j][1],'cleaned_MessageBody'])
        print()

#### matched test

In [41]:
#XGB_SHOW(template_matched, case_matched, case_test_44070, 44070, XGB)

In [131]:
import joblib
xgb = joblib.load('xgb_general_model.joblib')

In [58]:
case_test.head(1)

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
0,725114.0,44070.0,"Hi,The dining plan that my son wanted to enrol...",1241.0,Candi-I found Will Eichstaedt to be a freshman...,"[0.013114568, -0.40650102, -0.070455566, 0.207...","[-0.0005268336, 0.020311702, 0.018932767, -0.0..."


In [150]:
case_train_44070= case_train[case_train['CorpNo'] == 44070]

In [162]:
case_train_44070['cleaned_message'].iloc[0]

"You've just received your first Splash Card and now you're official! Know that we're always here for you. You can find out about Splash Cards, dining, service/repair requests, parking, shuttles and more on our website, website. J.C. Paciera Tulane University Campus Services "

In [163]:
case_train_44070.sample(10)

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
1582,826467.0,44070.0,Preferred first name: AllyStudent ID: 36100946...,1244.0,You've just received your first Splash Card an...,"[-0.024660012, 0.0066581448, -0.28968897, -0.1...","[0.026709707, 0.074973024, -0.022041367, 0.024..."
16334,1663263.0,44070.0,"External Sender. Be aware of links, attachmen...",3699.0,We have processed your request. You will recei...,"[0.05878132, 0.120848715, -0.13129868, 0.14687...","[0.0020468635, 0.062818274, -0.0073692505, 0.0..."
2578,846089.0,44070.0,Preferred first name: FionaStudent ID: 4170082...,1242.0,"Replacing your Splash Card is never fun, but I...","[-0.009737069, 0.038736753, -0.30248937, -0.12...","[0.030020628, 0.0827849, -0.029796092, 0.02042..."
1648,826780.0,44070.0,Preferred first name: AnnaStudent ID: 64300275...,1238.0,"Anna,I hope we were able to answer all of your...","[0.06652494, 0.055299245, -0.21983418, -0.1023...","[0.03236253, 0.076892324, -0.022873305, 0.0164..."
31777,3539195.0,44070.0,Preferred first name: EmilyStudent ID: 9820065...,5887.0,Thank you for reaching out to us. Your request...,"[0.08311005, -0.14021543, -0.2719212, -0.09266...","[0.0448784, 0.0528311, -0.025845824, 0.0149871..."
1461,825836.0,44070.0,Preferred first name: CindyStudent ID: 7020098...,1244.0,You've just received your first Splash Card an...,"[0.15432195, 0.08601062, -0.28310993, -0.14506...","[0.040771257, 0.08666819, -0.032497272, 0.0030..."
33765,3688596.0,44070.0,Preferred first name: Student ID: 692007591Bui...,6866.0,Thank you for reaching out to us. Your request...,"[-0.0074237385, -0.08530132, -0.24353506, -0.0...","[0.02532082, 0.0485581, -0.018456843, 0.038095..."
1756,827959.0,44070.0,Preferred first name: GraceStudent ID: 7210062...,1244.0,You've just received your first Splash Card an...,"[0.1331232, 0.07188691, -0.22165138, -0.100756...","[0.03335717, 0.07120075, -0.02222371, 0.032460..."
35399,3890409.0,44070.0,Preferred first name: FanyuStudent ID: 6160087...,6866.0,Thank you for reaching out to us. Your request...,"[0.19498648, -0.05748049, -0.22221345, -0.0294...","[0.055917647, 0.06294778, -0.024261052, 0.0393..."
17763,1714119.0,44070.0,"External Sender. Be aware of links, attachment...",3699.0,We have processed your request. You will recei...,"[0.28523684, -0.1044273, -0.12594585, 0.191631...","[0.061587535, 0.023655597, -0.014516805, 0.069..."


In [151]:
sampled_df = case_train_44070.drop_duplicates(subset='TemplateId').sample(n=10).reset_index(drop = True)

print(type(sampled_df))

ValueError: Cannot take a larger sample than population when 'replace=False'

In [152]:
sampled_df

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
0,3547504.0,44070.0,"External Sender. Be aware of links, attachment...",6708.0,"Mr. Veal, thank you for sharing this informati...","[0.1725361, -0.45340288, -0.12364495, 0.497997...","[0.017161429, -0.00061248854, 0.00482166, 0.01..."
1,861061.0,44070.0,Please see the attached files for bulletin po...,1544.0,Meredith-Here is the returned item.PSIf this i...,"[0.28978765, 0.19522053, -0.27031404, -0.06790...","[0.046535607, 0.029665003, -0.019016046, -0.02..."
2,802594.0,44070.0,Preferred first name: AlexanderStudent ID: 757...,1240.0,Thank you for purchasing your parking pass fro...,"[-0.033015944, -0.010021959, -0.26270372, -0.1...","[0.0052503073, 0.07761569, -0.005307954, 0.018..."
3,3431536.0,44070.0,"Hello,I am trying to add Jacob and Caroline, c...",6370.0,"Good Afternoon Dr. Fusco,Please have Caroline ...","[0.24274571, -0.06987252, -0.09810126, 0.25783...","[0.05732021, 0.005416873, 0.029247763, 0.03458..."
4,2943582.0,44070.0,Preferred first name: DiegoStudent ID: 6900008...,5731.0,Thank you for reaching out to us. Your request...,"[-0.1903193, 0.28985512, -0.22970034, -0.31545...","[0.012159014, 0.07850253, -0.020122932, -0.003..."
5,729462.0,44070.0,"From: Zion, Abigail M Sent: Monday, July 8, 2...",1236.0,"Good morning Abigail,Please see attached websi...","[0.0013764423, -0.2987635, -0.1479643, 0.13562...","[0.032569185, 0.005998171, 0.011326579, 0.0203..."
6,1163977.0,44070.0,"Hello, During this time of quarantine, have t...",2194.0,"We are thankful for your patience, and hope yo...","[0.20786773, 0.09958944, -0.20773877, 0.118577...","[0.01608505, 0.09157507, -0.012007482, 0.04348..."
7,3087862.0,44070.0,Preferred first name: DavidStudent ID: 6600038...,5920.0,"Thank you for reaching out, however we are una...","[-0.058427304, -0.042480763, -0.23311795, -0.0...","[0.020503402, 0.046240035, -0.022864243, 0.023..."
8,3608763.0,44070.0,"To whom it may concern,I am writing to see if ...",6546.0,"Good Morning,Depending on the purpose of your ...","[0.11071318, 0.2979945, -0.1006366, -0.0918096...","[0.035488095, 0.008190261, 0.007380249, -0.033..."
9,3107932.0,44070.0,Preferred first name: Student ID: 184003455Bui...,6161.0,Thank you for reaching out to us. Three swipes...,"[-0.049000986, -0.17651719, -0.23717113, -0.02...","[0.031227551, 0.029107166, -0.018912284, 0.036..."


In [177]:
#XGB_SHOW(template_matched,case_matched,sampled_df,44070,xgb)

In [178]:
#sampled_df.head()

In [95]:
comm[comm['CaseID'] == 1485587.0]['cleaned_message'].iloc[0]

'We are in the process of developing guidelines for band and band camps. The band directors at your school will share this information with you after they get it. We are still offering marching band next year in the traditional and mySchoolOnline model. We will have some social distancing requirements, cleaning procedures and masks (when necessary) for all extra-curricular activities and will be sharing additional details and specific course information soon. Melissa Musselwhite Director Pasco County Schools '

In [98]:
templates[templates['TemplateId'] == 3030]['cleaned_MessageBody'].iloc[0]

'We are in the process of developing guidelines for band and band camps. The band directors at your school will share this information with you after they get it. We are still offering marching band next year in the traditional and mySchoolOnline model. We will have some social distancing requirements, cleaning procedures and masks (when necessary) for all extra-curricular activities and will be sharing additional details and specific course information soon.'

#### Unmatched test

In [179]:
case_unmatched_45671 = case_unmatched[case_unmatched.CorpNo == 45671.0]
case_unmatched_45671.shape

(56830, 7)

In [180]:
case_unmatched_45671.head()

Unnamed: 0,CaseID,CorpNo,cleaned_description,TemplateId,cleaned_message,qa_embeddings,qq_embeddings
44,725142.0,45671.0,"Hi,We just received notification that our son,...",,"Good Morning Alice, I will answer your questio...","[0.3852101, -0.30660906, -0.20506588, 0.474652...","[0.04176847, -0.04396613, -0.015927736, 0.0357..."
164,725428.0,45671.0,"Hello,Allow me to introduce myself my name is ...",,"Hi Edgar,Thank you for reaching out to the Sup...","[0.01324001, 0.09639303, -0.19524464, 0.172282...","[3.7132206e-05, 0.058761757, -0.03216051, 0.04..."
182,725479.0,45671.0,"Hello,I have tried leaving messages and sent e...",,"Hello Ms. Small,Thank you for reaching out. So...","[-0.13388388, 0.1499397, -0.2746919, -0.036416...","[0.02632579, 0.0837156, -0.021776514, -0.04621..."
281,725754.0,45671.0,I wanted to make you aware - though I hope you...,,Thank you for contacting the Superintendent's ...,"[0.107691124, 0.035338853, -0.20305988, 0.1992...","[0.04758859, 0.069294, 0.012852099, 0.03252831..."
299,725791.0,45671.0,"Hi,I have been trying to call enrollment regar...",,"Hello Nghia, I sent your email to Enrollment f...","[-0.1405944, -0.5490448, -0.24499018, 0.015286...","[-0.024651533, -0.066694155, 0.006801235, -0.0..."


In [181]:
sample = case_unmatched_45671.sample(10).reset_index(drop = True)

In [182]:
XGB_SHOW(template_matched,case_matched,sample,45671,xgb)

The description for this case is:   From: Menzel, Heather Sent: Tuesday, August 23, 2022 9:27 AMTo: Sheridan, Rebecca Z HR Re: Clock hours Hi Rebecca, I have forwarded your question to the SPS HR department who may be able to help you with your request. They may have more details about the process of requesting your transcripts as a former employee. Cheers, Heather Menzel (nee Murdoch)Professional Development Program SpecialistProfessional Growth & Educator Support WebsiteFrom: Sheridan, Rebecca Z 

The message for this case is Hi Rebecca,Attached is an official transcript of all your SPS clock hours that we have recorded.Thanks, Mary Swanson HR Analyst, Compensation Seattle Public Schools 

Recommended templates are:
Template id : 5718
Message body of this template is : Hello, Thank you for reaching out! This is SPS Customer Service. We wanted to let you know that we have received your message regarding your student's enrollment, and Admissions will be responding to you as soon as pos

In [None]:
#show_result(template_matched, case_matched,, 44070, best_LR_05)

In [32]:
def load_model(filename):
    return load(filename)

In [35]:
best_LR_03 = load_model('best_logistic_model_44070.joblib')

In [None]:
model_general

In [69]:
case_unmatched_44070_sample = case_unmatched_44070.head(5)

In [70]:
case_unmatched_44070_sample = case_unmatched_44070_sample.reset_index(drop = True)

In [16]:
#XGB_SHOW(template_matched, case_matched, case_unmatched_44070_sample, 44070, XGB)

In [65]:
case_unmatched_44070_sample = case_unmatched_44070_sample.reset_index(drop = True)

In [17]:
#show_result(template_unmatched, case_unmatched, case_unmatched_44070_sample, 44070, best_LR_03)