In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# load data
rating_data = pd.read_csv('data/rating_BETA_Jan.csv', encoding='utf-8-sig')
user_data = pd.read_csv('data/user_feature_BETA_Jan.csv', encoding='utf-8-sig')
mat_data = pd.read_csv('data/material_feature_BETA_Jan.csv', encoding='utf-8-sig')
review_data = pd.read_csv('data/review_BETA_Jan.csv', encoding='utf-8-sig')

In [3]:
# only materialpointsCNT == 1 in rating_data
# w/o repeat mat score in rating_data
rating_data = rating_data[rating_data['materialpointsCNT'] == 1]
uid_list = list(set(rating_data['client_sn']))
rating_data_wo_repeat = list()
for uid in tqdm(uid_list):
    dat = rating_data[rating_data['client_sn'] == uid]
    if len(set(dat['material_points'])) > 1:
        rating_data_wo_repeat.append(dat)
rating_data = pd.concat(rating_data_wo_repeat).reset_index(drop=True) 

100%|██████████| 35288/35288 [00:30<00:00, 1143.62it/s]


In [4]:
# add label feature to rating_data
import numpy as np
rating_data['label'] = [np.nan for _ in range(rating_data.shape[0])]
uid_list = list(set(rating_data['client_sn']))
for uid in tqdm(uid_list):
    dat = rating_data[rating_data['client_sn'] == uid]
    index = dat.index
    score_list = list(dat['material_points'])
    max_score = max(score_list)
    label_list = []
    for score in score_list:
        if score == max_score:
            label_list.append(1)
        else:
            label_list.append(0)
    rating_data.loc[index, 'label']   = label_list

100%|██████████| 17907/17907 [00:38<00:00, 470.92it/s]


In [5]:
# select 'client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','attend_date','label' as feature in rating_data
rating_data = rating_data[['client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','attend_date','label']]

In [6]:
# build rating_review_data by merging rating_data, review_data. where key = ['client_sn','MaterialID','session_sn'] (left join)
rating_review_data = pd.merge(rating_data, review_data, on=['client_sn','MaterialID','session_sn'], how='left')

In [7]:
def transform_date_to_age(date_str, categorical=True):
    if date_str != 'None':
        age_val = 2021 - pd.to_datetime(date_str, format='%Y-%m-%d %H:%M:%S').year
        if categorical is False:
            return age_val
        else:
            if age_val <= 30:
                return '0~30'
            elif age_val > 30 and age_val < 50:
                return '30~50'
            else:
                return '50~'   
    else:
        return 'None'

In [8]:
user_data = user_data.fillna('None')
user_data['Client_Sex'].replace('N','None')
user_data['birthday'] = user_data['birthday'].apply(lambda x: transform_date_to_age(x))
user_data['JobClassName'].replace('Undefined','None')
user_data['IndustryClassName'].replace('Undefined','None')
user_data = user_data[['client_sn','Client_Sex','birthday','education','JobClassName','IndustryClassName']]

In [9]:
rating_review_data_with_UF = pd.merge(rating_review_data, user_data, on=['client_sn'], how='left')

In [10]:
rating_data_with_UF = pd.merge(rating_data, user_data, on=['client_sn'], how='left')

In [14]:
mat_individual_col = list(set(rating_review_data_with_UF.columns)-{'client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','material_points','con_sn','label','attend_date','Client_Sex','birthday','education','JobClassName','IndustryClassName'})
mat_individual_dat = rating_review_data_with_UF.groupby(['MaterialID','Client_Sex','birthday','education','JobClassName','IndustryClassName']).mean()[mat_individual_col]

In [16]:
mat_individual_dat.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,complaint_EAV,complaint_EAG,compliment_INT,complaint_DFG,compliment_COR,compliment_PRA,complaint_ICR,complaint_ICV,complaint_ICA,complaint_BOR,complaint_OFA,complaint_ECV,complaint_ECA,complaint_ECR,complaint_DFV
MaterialID,Client_Sex,birthday,education,JobClassName,IndustryClassName,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100059,F,0~30,,Undefined,Undefined,0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0
100059,F,30~50,,General Staff,Marketing and Sales,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,30~50,,Undefined,Undefined,0.0,0.0,0.428571,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,30~50,other-code:0,Undefined,Undefined,0.0,0.0,0.75,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,30~50,大专/**/專科/大學,General Staff,Manufacturing and retailing,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,30~50,大专/**/專科/大學,Management,Professional and technical services,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,30~50,硕士/**/碩/博士,Management,Professional and technical services,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,50~,,Undefined,Undefined,0.0,0.0,1.0,0.0,1.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,50~,other-code:0,Undefined,Undefined,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100059,F,50~,中专/**/高中/職,Undefined,Undefined,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# build rating_matF_data by merging rating_data, mat_individual_dat
rating_matF_data = pd.merge(rating_data_with_UF, mat_individual_dat, on=['MaterialID','Client_Sex','birthday','education','JobClassName','IndustryClassName'], how='left')

In [18]:
# sperate adult, jr data by PurchaseBrandID
rating_matF_data_AD = rating_matF_data[rating_matF_data['PurchaseBrandID']==1]
rating_matF_data_Jr = rating_matF_data[rating_matF_data['PurchaseBrandID']!=1]
rating_matF_data_AD = rating_matF_data_AD[list(set(rating_matF_data_AD.columns)-{'session_sn','PurchaseBrandID','Client_Sex','birthday','education','JobClassName','IndustryClassName'})]
rating_matF_data_Jr = rating_matF_data_Jr[list(set(rating_matF_data_Jr.columns)-{'session_sn','PurchaseBrandID','Client_Sex','birthday','education','JobClassName','IndustryClassName'})]

In [19]:
# sperate train, test data by attend_date
start_date = '2021-01-01'
train_date = '2021-04-01'
end_date = '2021-05-01'
train_data_AD = rating_matF_data_AD[(rating_matF_data_AD['attend_date'] >= start_date) & (rating_matF_data_AD['attend_date'] < train_date)]
test_data_AD = rating_matF_data_AD[(rating_matF_data_AD['attend_date'] >= train_date) & (rating_matF_data_AD['attend_date'] < end_date)]
train_data_Jr = rating_matF_data_Jr[(rating_matF_data_Jr['attend_date'] >= start_date) & (rating_matF_data_Jr['attend_date'] < train_date)]
test_data_Jr = rating_matF_data_Jr[(rating_matF_data_Jr['attend_date'] >= train_date) & (rating_matF_data_Jr['attend_date'] < end_date)]

In [20]:
train_data_AD = train_data_AD[list(set(train_data_AD.columns)-{'attend_date'})]
test_data_AD = test_data_AD[list(set(test_data_AD.columns)-{'attend_date'})]
train_data_Jr = train_data_Jr[list(set(train_data_Jr.columns)-{'attend_date'})]
test_data_Jr = test_data_Jr[list(set(test_data_Jr.columns)-{'attend_date'})]

In [21]:
train_data_AD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93158 entries, 0 to 325981
Data columns (total 19 columns):
complaint_EAG     93156 non-null float64
label             93158 non-null float64
complaint_DFG     93156 non-null float64
complaint_ICV     93156 non-null float64
complaint_ECV     93156 non-null float64
complaint_ECR     93156 non-null float64
MaterialID        93158 non-null int64
complaint_EAV     93156 non-null float64
compliment_INT    93156 non-null float64
attend_level      93158 non-null int64
compliment_COR    93156 non-null float64
compliment_PRA    93156 non-null float64
complaint_ICR     93156 non-null float64
client_sn         93158 non-null int64
complaint_ICA     93156 non-null float64
complaint_BOR     93156 non-null float64
complaint_OFA     93156 non-null float64
complaint_ECA     93156 non-null float64
complaint_DFV     93156 non-null float64
dtypes: float64(16), int64(3)
memory usage: 14.2 MB


In [22]:
test_data_AD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32146 entries, 13 to 325996
Data columns (total 19 columns):
complaint_EAG     32146 non-null float64
label             32146 non-null float64
complaint_DFG     32146 non-null float64
complaint_ICV     32146 non-null float64
complaint_ECV     32146 non-null float64
complaint_ECR     32146 non-null float64
MaterialID        32146 non-null int64
complaint_EAV     32146 non-null float64
compliment_INT    32146 non-null float64
attend_level      32146 non-null int64
compliment_COR    32146 non-null float64
compliment_PRA    32146 non-null float64
complaint_ICR     32146 non-null float64
client_sn         32146 non-null int64
complaint_ICA     32146 non-null float64
complaint_BOR     32146 non-null float64
complaint_OFA     32146 non-null float64
complaint_ECA     32146 non-null float64
complaint_DFV     32146 non-null float64
dtypes: float64(16), int64(3)
memory usage: 4.9 MB


In [23]:
train_data_Jr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117187 entries, 62 to 326201
Data columns (total 19 columns):
complaint_EAG     117187 non-null float64
label             117187 non-null float64
complaint_DFG     117187 non-null float64
complaint_ICV     117187 non-null float64
complaint_ECV     117187 non-null float64
complaint_ECR     117187 non-null float64
MaterialID        117187 non-null int64
complaint_EAV     117187 non-null float64
compliment_INT    117187 non-null float64
attend_level      117187 non-null int64
compliment_COR    117187 non-null float64
compliment_PRA    117187 non-null float64
complaint_ICR     117187 non-null float64
client_sn         117187 non-null int64
complaint_ICA     117187 non-null float64
complaint_BOR     117187 non-null float64
complaint_OFA     117187 non-null float64
complaint_ECA     117187 non-null float64
complaint_DFV     117187 non-null float64
dtypes: float64(16), int64(3)
memory usage: 17.9 MB


In [24]:
test_data_Jr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35933 entries, 66 to 326205
Data columns (total 19 columns):
complaint_EAG     35933 non-null float64
label             35933 non-null float64
complaint_DFG     35933 non-null float64
complaint_ICV     35933 non-null float64
complaint_ECV     35933 non-null float64
complaint_ECR     35933 non-null float64
MaterialID        35933 non-null int64
complaint_EAV     35933 non-null float64
compliment_INT    35933 non-null float64
attend_level      35933 non-null int64
compliment_COR    35933 non-null float64
compliment_PRA    35933 non-null float64
complaint_ICR     35933 non-null float64
client_sn         35933 non-null int64
complaint_ICA     35933 non-null float64
complaint_BOR     35933 non-null float64
complaint_OFA     35933 non-null float64
complaint_ECA     35933 non-null float64
complaint_DFV     35933 non-null float64
dtypes: float64(16), int64(3)
memory usage: 5.5 MB


In [25]:
train_data_AD = train_data_AD.dropna()
test_data_AD = test_data_AD.dropna()
train_data_Jr = train_data_Jr.dropna()
test_data_Jr = test_data_Jr.dropna()

In [26]:
label_AD = np.array(train_data_AD['label'])
label_Jr = np.array(train_data_Jr['label'])
train_data_AD = train_data_AD[list(set(train_data_AD.columns)-{'label'})]
train_data_Jr = train_data_Jr[list(set(train_data_Jr.columns)-{'label'})]
ground_truth_AD = np.array(test_data_AD['label'])
ground_truth_Jr = np.array(test_data_Jr['label'])
test_data_AD = test_data_AD[list(set(test_data_AD.columns)-{'label'})]
test_data_Jr = test_data_Jr[list(set(test_data_Jr.columns)-{'label'})]

In [27]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
def train_model(train_data,label):
    train_data = np.array(train_data)
    rf = RandomForestRegressor()
    rf.fit(train_data, label)
    return rf

In [28]:
model_AD = train_model(train_data=train_data_AD, label=label_AD)
model_Jr = train_model(train_data=train_data_Jr, label=label_Jr)



In [29]:
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
def predict_score(pred_prob, Y_test_array, binary_threshold=0.5):
    pred_one_hot = list()
    for i in range(pred_prob.shape[0]):
        if pred_prob[i] >= binary_threshold:
            pred_one_hot.append(1)
        else:
            pred_one_hot.append(0)
    print(metrics.classification_report(list(Y_test_array), pred_one_hot))
    print('---------------------------------------')
    print('Confusion Matrix')
    print(np.transpose(confusion_matrix(list(Y_test_array), pred_one_hot).T))
    print('---------------------------------------')
    print('positive label : 1 | negative label : 0')

In [30]:
test_data_AD_array = np.array(test_data_AD)
y_pred_AD = model_AD.predict(test_data_AD_array)
predict_score(y_pred_AD,ground_truth_AD)

              precision    recall  f1-score   support

         0.0       0.58      0.45      0.51     11923
         1.0       0.71      0.80      0.76     20223

    accuracy                           0.67     32146
   macro avg       0.64      0.63      0.63     32146
weighted avg       0.66      0.67      0.66     32146

---------------------------------------
Confusion Matrix
[[ 5379  6544]
 [ 3972 16251]]
---------------------------------------
positive label : 1 | negative label : 0


In [31]:
ground_truth_AD_list = list(ground_truth_AD)
pos,neg = 0,0
for val in ground_truth_AD_list:
    if int(val) == 1:
        pos +=1
    else:
        neg +=1
import random
random_pred = np.array([random.sample([1,0],1)[0] for _ in range(len(ground_truth_AD))])
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
predict_score(random_pred,ground_truth_AD_list)

              precision    recall  f1-score   support

         0.0       0.37      0.50      0.43     11923
         1.0       0.63      0.51      0.56     20223

    accuracy                           0.50     32146
   macro avg       0.50      0.50      0.50     32146
weighted avg       0.54      0.50      0.51     32146

---------------------------------------
Confusion Matrix
[[ 5992  5931]
 [10009 10214]]
---------------------------------------
positive label : 1 | negative label : 0


In [32]:
test_data_Jr_array = np.array(test_data_Jr)
y_pred_Jr = model_Jr.predict(test_data_Jr_array)
predict_score(y_pred_Jr,ground_truth_Jr)

              precision    recall  f1-score   support

         0.0       0.42      0.30      0.35     12373
         1.0       0.68      0.78      0.73     23560

    accuracy                           0.61     35933
   macro avg       0.55      0.54      0.54     35933
weighted avg       0.59      0.61      0.60     35933

---------------------------------------
Confusion Matrix
[[ 3651  8722]
 [ 5119 18441]]
---------------------------------------
positive label : 1 | negative label : 0


In [33]:
ground_truth_Jr_list = list(ground_truth_Jr)
pos,neg = 0,0
for val in ground_truth_Jr_list:
    if int(val) == 1:
        pos +=1
    else:
        neg +=1
import random
random_pred = np.array([random.sample([1,0],1)[0] for _ in range(len(ground_truth_Jr_list))])
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
predict_score(random_pred,ground_truth_Jr_list)

              precision    recall  f1-score   support

         0.0       0.34      0.50      0.41     12373
         1.0       0.66      0.50      0.57     23560

    accuracy                           0.50     35933
   macro avg       0.50      0.50      0.49     35933
weighted avg       0.55      0.50      0.51     35933

---------------------------------------
Confusion Matrix
[[ 6195  6178]
 [11801 11759]]
---------------------------------------
positive label : 1 | negative label : 0
