In [105]:
import pandas as pd
from tqdm import tqdm

In [106]:
# load data
rating_data = pd.read_csv('data/rating_BETA_Jan.csv', encoding='utf-8-sig')
user_data = pd.read_csv('data/user_feature_BETA_Jan.csv', encoding='utf-8-sig')
mat_data = pd.read_csv('data/material_feature_BETA_Jan.csv', encoding='utf-8-sig')
review_data = pd.read_csv('data/review_BETA_Jan.csv', encoding='utf-8-sig')

In [107]:
# only materialpointsCNT == 1 in rating_data
# w/o repeat mat score in rating_data
rating_data = rating_data[rating_data['materialpointsCNT'] == 1]
uid_list = list(set(rating_data['client_sn']))
rating_data_wo_repeat = list()
for uid in tqdm(uid_list):
    dat = rating_data[rating_data['client_sn'] == uid]
    if len(set(dat['material_points'])) > 1:
        rating_data_wo_repeat.append(dat)
rating_data = pd.concat(rating_data_wo_repeat).reset_index(drop=True) 

100%|██████████| 35288/35288 [00:30<00:00, 1160.06it/s]


In [108]:
rating_data.head(1)

Unnamed: 0,client_sn,MaterialID,con_sn,session_sn,MaterialType,PurchaseBrandID,attend_level,attend_date,sestime,week,materialpointsCNT,material_points,consultantpointsCNT,consultant_points
0,10354714,118145,28851.0,2021010422472454,Adult,1,11,2021-01-04,22,1,1,10,1,9


In [109]:
# add label feature to rating_data
import numpy as np
rating_data['label'] = [np.nan for _ in range(rating_data.shape[0])]
uid_list = list(set(rating_data['client_sn']))
for uid in tqdm(uid_list):
    dat = rating_data[rating_data['client_sn'] == uid]
    index = dat.index
    score_list = list(dat['material_points'])
    max_score = max(score_list)
    label_list = []
    for score in score_list:
        if score == max_score:
            label_list.append(1)
        else:
            label_list.append(0)
    rating_data.loc[index, 'label']   = label_list

100%|██████████| 17907/17907 [00:37<00:00, 480.30it/s]


In [110]:
# select 'client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','attend_date','label' as feature in rating_data
rating_data = rating_data[['client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','attend_date','label']]

In [111]:
rating_data.head(1)

Unnamed: 0,client_sn,MaterialID,session_sn,PurchaseBrandID,attend_level,attend_date,label
0,10354714,118145,2021010422472454,1,11,2021-01-04,1.0


In [112]:
# build rating_review_data by merging rating_data, review_data. where key = ['client_sn','MaterialID','session_sn'] (left join)
rating_review_data = pd.merge(rating_data, review_data, on=['client_sn','MaterialID','session_sn'], how='left')

In [113]:
rating_review_data.head(1)

Unnamed: 0,client_sn,MaterialID,session_sn,PurchaseBrandID,attend_level,attend_date,label,con_sn,compliment_INT,compliment_PRA,compliment_COR,complaint_DFG,complaint_DFV,complaint_EAG,complaint_EAV,complaint_BOR,complaint_OFA,complaint_ECA,complaint_ECR,complaint_ECV,complaint_ICA,complaint_ICR,complaint_ICV
0,10354714,118145,2021010422472454,1,11,2021-01-04,1.0,28851.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
# collect complain, compliment mat feature
mat_individual_col = list(set(rating_review_data.columns)-{'client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','material_points','con_sn','label','attend_date'})
mat_individual_dat = rating_review_data.groupby(['MaterialID']).mean()[mat_individual_col]

In [115]:
mat_individual_dat.head(1)

Unnamed: 0_level_0,complaint_ECV,complaint_DFG,complaint_EAG,complaint_DFV,complaint_OFA,complaint_ECR,complaint_ICR,complaint_EAV,compliment_PRA,complaint_ECA,complaint_ICV,compliment_INT,compliment_COR,complaint_ICA,complaint_BOR
MaterialID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100059,0.0,0.0,0.036585,0.012195,0.012195,0.02439,0.036585,0.012195,0.341463,0.0,0.02439,0.54878,0.195122,0.02439,0.036585


In [116]:
# build rating_matF_data by merging rating_data, mat_individual_dat
rating_matF_data = pd.merge(rating_data, mat_individual_dat, on=['MaterialID'], how='left')

In [117]:
# sperate adult, jr data by PurchaseBrandID
rating_matF_data_AD = rating_matF_data[rating_matF_data['PurchaseBrandID']==1]
rating_matF_data_Jr = rating_matF_data[rating_matF_data['PurchaseBrandID']!=1]
rating_matF_data_AD = rating_matF_data_AD[list(set(rating_matF_data_AD.columns)-{'session_sn','PurchaseBrandID'})]
rating_matF_data_Jr = rating_matF_data_Jr[list(set(rating_matF_data_Jr.columns)-{'session_sn','PurchaseBrandID'})]

In [118]:
# sperate train, test data by attend_date
start_date = '2021-01-01'
train_date = '2021-04-01'
end_date = '2021-05-01'
train_data_AD = rating_matF_data_AD[(rating_matF_data_AD['attend_date'] >= start_date) & (rating_matF_data_AD['attend_date'] < train_date)]
test_data_AD = rating_matF_data_AD[(rating_matF_data_AD['attend_date'] >= train_date) & (rating_matF_data_AD['attend_date'] < end_date)]
train_data_Jr = rating_matF_data_Jr[(rating_matF_data_Jr['attend_date'] >= start_date) & (rating_matF_data_Jr['attend_date'] < train_date)]
test_data_Jr = rating_matF_data_Jr[(rating_matF_data_Jr['attend_date'] >= train_date) & (rating_matF_data_Jr['attend_date'] < end_date)]

In [92]:
train_data_AD = train_data_AD[list(set(train_data_AD.columns)-{'attend_date'})]
test_data_AD = test_data_AD[list(set(test_data_AD.columns)-{'attend_date'})]
train_data_Jr = train_data_Jr[list(set(train_data_Jr.columns)-{'attend_date'})]
test_data_Jr = test_data_Jr[list(set(test_data_Jr.columns)-{'attend_date'})]

In [93]:
label_AD = np.array(train_data_AD['label'])
label_Jr = np.array(train_data_Jr['label'])
train_data_AD = train_data_AD[list(set(train_data_AD.columns)-{'label'})]
train_data_Jr = train_data_Jr[list(set(train_data_Jr.columns)-{'label'})]
ground_truth_AD = np.array(test_data_AD['label'])
ground_truth_Jr = np.array(test_data_Jr['label'])
test_data_AD = test_data_AD[list(set(test_data_AD.columns)-{'label'})]
test_data_Jr = test_data_Jr[list(set(test_data_Jr.columns)-{'label'})]

In [94]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
def train_model(train_data,label):
    train_data = np.array(train_data)
    rf = RandomForestRegressor()
    rf.fit(train_data, label)
    return rf

In [95]:
model_AD = train_model(train_data=train_data_AD, label=label_AD)
model_Jr = train_model(train_data=train_data_Jr, label=label_Jr)



In [96]:
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
def predict_score(pred_prob, Y_test_array, binary_threshold=0.5):
    pred_one_hot = list()
    for i in range(pred_prob.shape[0]):
        if pred_prob[i] >= binary_threshold:
            pred_one_hot.append(1)
        else:
            pred_one_hot.append(0)
    print(metrics.classification_report(list(Y_test_array), pred_one_hot))
    print('---------------------------------------')
    print('Confusion Matrix')
    print(np.transpose(confusion_matrix(list(Y_test_array), pred_one_hot).T))
    print('---------------------------------------')
    print('positive label : 1 | negative label : 0')

In [98]:
test_data_AD_array = np.array(test_data_AD)
y_pred_AD = model_AD.predict(test_data_AD_array)
predict_score(y_pred_AD,ground_truth_AD)

              precision    recall  f1-score   support

         0.0       0.43      0.32      0.36     11923
         1.0       0.65      0.75      0.70     20223

    accuracy                           0.59     32146
   macro avg       0.54      0.53      0.53     32146
weighted avg       0.57      0.59      0.57     32146

---------------------------------------
Confusion Matrix
[[ 3762  8161]
 [ 4977 15246]]
---------------------------------------
positive label : 1 | negative label : 0


In [101]:
ground_truth_AD_list = list(ground_truth_AD)
pos,neg = 0,0
for val in ground_truth_AD_list:
    if int(val) == 1:
        pos +=1
    else:
        neg +=1
import random
random_pred = np.array([random.sample([1,0],1)[0] for _ in range(len(ground_truth_AD))])
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
predict_score(random_pred,ground_truth_AD_list)

              precision    recall  f1-score   support

         0.0       0.37      0.50      0.43     11923
         1.0       0.63      0.50      0.56     20223

    accuracy                           0.50     32146
   macro avg       0.50      0.50      0.49     32146
weighted avg       0.53      0.50      0.51     32146

---------------------------------------
Confusion Matrix
[[ 5960  5963]
 [10105 10118]]
---------------------------------------
positive label : 1 | negative label : 0


In [100]:
test_data_Jr_array = np.array(test_data_Jr)
y_pred_Jr = model_Jr.predict(test_data_Jr_array)
predict_score(y_pred_Jr,ground_truth_Jr)

              precision    recall  f1-score   support

         0.0       0.37      0.27      0.31     12373
         1.0       0.67      0.76      0.71     23560

    accuracy                           0.59     35933
   macro avg       0.52      0.52      0.51     35933
weighted avg       0.57      0.59      0.57     35933

---------------------------------------
Confusion Matrix
[[ 3337  9036]
 [ 5570 17990]]
---------------------------------------
positive label : 1 | negative label : 0


In [102]:
ground_truth_Jr_list = list(ground_truth_Jr)
pos,neg = 0,0
for val in ground_truth_Jr_list:
    if int(val) == 1:
        pos +=1
    else:
        neg +=1
import random
random_pred = np.array([random.sample([1,0],1)[0] for _ in range(len(ground_truth_Jr_list))])
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
predict_score(random_pred,ground_truth_Jr_list)

              precision    recall  f1-score   support

         0.0       0.35      0.50      0.41     12373
         1.0       0.66      0.50      0.57     23560

    accuracy                           0.50     35933
   macro avg       0.50      0.50      0.49     35933
weighted avg       0.55      0.50      0.51     35933

---------------------------------------
Confusion Matrix
[[ 6208  6165]
 [11709 11851]]
---------------------------------------
positive label : 1 | negative label : 0
