In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# load data
rating_data = pd.read_csv('data/rating_BETA_Jan.csv', encoding='utf-8-sig')
user_data = pd.read_csv('data/user_feature_BETA_Jan.csv', encoding='utf-8-sig')
mat_data = pd.read_csv('data/material_feature_BETA_Jan.csv', encoding='utf-8-sig')
review_data = pd.read_csv('data/review_BETA_Jan.csv', encoding='utf-8-sig')

In [3]:
# only materialpointsCNT == 1 in rating_data
# w/o repeat mat score in rating_data
rating_data = rating_data[rating_data['materialpointsCNT'] == 1]
uid_list = list(set(rating_data['client_sn']))
rating_data_wo_repeat = list()
for uid in tqdm(uid_list):
    dat = rating_data[rating_data['client_sn'] == uid]
    if len(set(dat['material_points'])) > 1:
        rating_data_wo_repeat.append(dat)
rating_data = pd.concat(rating_data_wo_repeat).reset_index(drop=True) 

100%|██████████| 36787/36787 [00:34<00:00, 1073.01it/s]


In [4]:
# add label feature to rating_data
import numpy as np
rating_data['label'] = [np.nan for _ in range(rating_data.shape[0])]
uid_list = list(set(rating_data['client_sn']))
for uid in tqdm(uid_list):
    dat = rating_data[rating_data['client_sn'] == uid]
    index = dat.index
    score_list = list(dat['material_points'])
    max_score = max(score_list)
    label_list = []
    for score in score_list:
        if score == max_score:
            label_list.append(1)
        else:
            label_list.append(0)
    rating_data.loc[index, 'label']   = label_list

100%|██████████| 19086/19086 [00:44<00:00, 428.96it/s]


In [5]:
# select 'client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','attend_date','label' as feature in rating_data
rating_data = rating_data[['client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','attend_date','label']]

In [6]:
# build rating_review_data by merging rating_data, review_data. where key = ['client_sn','MaterialID','session_sn'] (left join)
rating_review_data = pd.merge(rating_data, review_data, on=['client_sn','MaterialID','session_sn'], how='left')

In [7]:
def transform_date_to_age(date_str, categorical=True):
    if date_str != 'None':
        age_val = 2021 - pd.to_datetime(date_str, format='%Y-%m-%d %H:%M:%S').year
        if categorical is False:
            return age_val
        else:
            if age_val <= 30:
                return '0~30'
            elif age_val > 30 and age_val < 50:
                return '30~50'
            else:
                return '50~'   
    else:
        return 'None'

In [8]:
user_data = user_data.fillna('None')
user_data['Client_Sex'].replace('N','None')
user_data['birthday'] = user_data['birthday'].apply(lambda x: transform_date_to_age(x))
user_data['JobClassName'].replace('Undefined','None')
user_data['IndustryClassName'].replace('Undefined','None')
user_data_with_it = user_data[['client_sn','Client_Sex','birthday','education','JobClassName','IndustryClassName','user_interest_tag_list']]
user_data = user_data[['client_sn','Client_Sex','birthday','education','JobClassName','IndustryClassName']]

In [9]:
rating_review_data_with_UF = pd.merge(rating_review_data, user_data, on=['client_sn'], how='left')

In [10]:
rating_data_with_UF = pd.merge(rating_data, user_data_with_it, on=['client_sn'], how='left')
mat_data = mat_data[['MaterialID', 'MDCGSID_ENname']]
rating_data_with_UF = pd.merge(rating_data_with_UF, mat_data, on=['MaterialID'], how='left')

In [11]:
rating_data_with_UF.head(1)

Unnamed: 0,client_sn,MaterialID,session_sn,PurchaseBrandID,attend_level,attend_date,label,Client_Sex,birthday,education,JobClassName,IndustryClassName,user_interest_tag_list,MDCGSID_ENname
0,10354714,118145,2021010422472454,1,11,2021-01-04,1.0,,,,,,"Management/**/Training, Education and School/*...","Health, Fitness and Medicine/**/Health & Medicine"


In [12]:
from object_orient_for_JL.interest_tag_overlap_num import overlap_num_func_main

In [13]:
rating_data_with_UF = overlap_num_func_main(rating_data_with_UF)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_data_with_UF['overlap_num'] = overlap_num


In [14]:
rating_data_with_UF.head(1)

Unnamed: 0,JobClassName,education,client_sn,attend_date,label,MaterialID,Client_Sex,IndustryClassName,attend_level,PurchaseBrandID,birthday,session_sn,overlap_num
0,,,10354714,2021-01-04,1.0,118145,,,11,1,,2021010422472454,0


In [15]:
mat_individual_col = list(set(rating_review_data_with_UF.columns)-{'client_sn','MaterialID','session_sn','PurchaseBrandID','attend_level','material_points','con_sn','label','attend_date','Client_Sex','birthday','education','JobClassName','IndustryClassName'})
mat_individual_dat = rating_review_data_with_UF.groupby(['MaterialID','Client_Sex','birthday','education','JobClassName','IndustryClassName']).mean()[mat_individual_col]

In [16]:
mat_individual_dat.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,complaint_EAV,compliment_INT,count_materials_points,count_overall_points,M_Point,complaint_EAG,complaint_ECA,complaint_ICV,complaint_ECV,compliment_COR,...,complaint_DFV,complaint_BOR,complaint_OFA,complaint_DFG,C_Point,compliment_PRA,T_Point,complaint_ICR,count_consultant_points,complaint_ECR
MaterialID,Client_Sex,birthday,education,JobClassName,IndustryClassName,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
100059,F,0~30,,Undefined,Undefined,0.333333,0.333333,1.0,1.0,8.666667,0.333333,0.0,0.333333,0.0,0.0,...,0.0,0.333333,0.0,0.0,9.0,0.0,9.666667,0.333333,1.0,0.0
100059,F,30~50,,General Staff,Marketing and Sales,0.0,0.0,1.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,6.0,0.0,1.0,0.0
100059,F,30~50,,Undefined,Undefined,0.0,0.375,1.0,1.0,9.375,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.625,0.375,9.625,0.0,1.0,0.0
100059,F,30~50,other-code:0,Undefined,Undefined,0.0,0.75,1.0,1.0,9.75,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,9.75,0.5,10.0,0.0,1.0,0.0
100059,F,30~50,大专/**/專科/大學,General Staff,Manufacturing and retailing,0.0,1.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,8.0,0.0,1.0,0.0
100059,F,30~50,大专/**/專科/大學,Management,Professional and technical services,0.0,1.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,10.0,1.0,10.0,0.0,1.0,0.0
100059,F,30~50,硕士/**/碩/博士,Management,Professional and technical services,0.0,0.0,1.0,1.0,8.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,0.0,10.0,1.0,1.0,0.0
100059,F,50~,,Undefined,Undefined,0.0,1.0,1.0,1.0,9.666667,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,9.333333,0.666667,9.333333,0.0,1.0,0.0
100059,F,50~,other-code:0,Undefined,Undefined,0.0,1.0,1.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,10.0,0.0,1.0,0.0
100059,F,50~,中专/**/高中/職,Undefined,Undefined,0.0,0.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,1.0,10.0,0.0,1.0,0.0


In [17]:
# build rating_matF_data by merging rating_data, mat_individual_dat
rating_matF_data = pd.merge(rating_data_with_UF, mat_individual_dat, on=['MaterialID','Client_Sex','birthday','education','JobClassName','IndustryClassName'], how='left')

In [18]:
rating_matF_data.columns

Index(['JobClassName', 'education', 'client_sn', 'attend_date', 'label',
       'MaterialID', 'Client_Sex', 'IndustryClassName', 'attend_level',
       'PurchaseBrandID', 'birthday', 'session_sn', 'overlap_num',
       'complaint_EAV', 'compliment_INT', 'count_materials_points',
       'count_overall_points', 'M_Point', 'complaint_EAG', 'complaint_ECA',
       'complaint_ICV', 'complaint_ECV', 'compliment_COR', 'complaint_ICA',
       'complaint_DFV', 'complaint_BOR', 'complaint_OFA', 'complaint_DFG',
       'C_Point', 'compliment_PRA', 'T_Point', 'complaint_ICR',
       'count_consultant_points', 'complaint_ECR'],
      dtype='object')

In [19]:
# sperate adult, jr data by PurchaseBrandID
rating_matF_data_AD = rating_matF_data[rating_matF_data['PurchaseBrandID']==1]
rating_matF_data_Jr = rating_matF_data[rating_matF_data['PurchaseBrandID']!=1]
rating_matF_data_AD = rating_matF_data_AD[list(set(rating_matF_data_AD.columns)-{'session_sn','PurchaseBrandID','Client_Sex','birthday','education','JobClassName','IndustryClassName'})]
rating_matF_data_Jr = rating_matF_data_Jr[list(set(rating_matF_data_Jr.columns)-{'session_sn','PurchaseBrandID','Client_Sex','birthday','education','JobClassName','IndustryClassName'})]

In [20]:
# sperate train, test data by attend_date
start_date = '2021-01-01'
train_date = '2021-04-01'
end_date = '2021-05-01'
train_data_AD = rating_matF_data_AD[(rating_matF_data_AD['attend_date'] >= start_date) & (rating_matF_data_AD['attend_date'] < train_date)]
test_data_AD = rating_matF_data_AD[(rating_matF_data_AD['attend_date'] >= train_date) & (rating_matF_data_AD['attend_date'] < end_date)]
train_data_Jr = rating_matF_data_Jr[(rating_matF_data_Jr['attend_date'] >= start_date) & (rating_matF_data_Jr['attend_date'] < train_date)]
test_data_Jr = rating_matF_data_Jr[(rating_matF_data_Jr['attend_date'] >= train_date) & (rating_matF_data_Jr['attend_date'] < end_date)]

In [21]:
train_data_AD = train_data_AD[list(set(train_data_AD.columns)-{'attend_date'})]
test_data_AD = test_data_AD[list(set(test_data_AD.columns)-{'attend_date'})]
train_data_Jr = train_data_Jr[list(set(train_data_Jr.columns)-{'attend_date'})]
test_data_Jr = test_data_Jr[list(set(test_data_Jr.columns)-{'attend_date'})]

In [22]:
train_data_AD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94212 entries, 0 to 374861
Data columns (total 26 columns):
complaint_ECV              94210 non-null float64
compliment_COR             94210 non-null float64
complaint_ICA              94210 non-null float64
overlap_num                94212 non-null int64
complaint_EAV              94210 non-null float64
MaterialID                 94212 non-null int64
compliment_INT             94210 non-null float64
complaint_DFV              94210 non-null float64
count_materials_points     94210 non-null float64
attend_level               94212 non-null int64
count_overall_points       94210 non-null float64
M_Point                    94210 non-null float64
complaint_BOR              94210 non-null float64
complaint_OFA              94210 non-null float64
complaint_DFG              94210 non-null float64
complaint_EAG              94210 non-null float64
C_Point                    94210 non-null float64
compliment_PRA             94210 non-null floa

In [23]:
test_data_AD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32514 entries, 13 to 374878
Data columns (total 26 columns):
complaint_ECV              32514 non-null float64
compliment_COR             32514 non-null float64
complaint_ICA              32514 non-null float64
overlap_num                32514 non-null int64
complaint_EAV              32514 non-null float64
MaterialID                 32514 non-null int64
compliment_INT             32514 non-null float64
complaint_DFV              32514 non-null float64
count_materials_points     32514 non-null float64
attend_level               32514 non-null int64
count_overall_points       32514 non-null float64
M_Point                    32514 non-null float64
complaint_BOR              32514 non-null float64
complaint_OFA              32514 non-null float64
complaint_DFG              32514 non-null float64
complaint_EAG              32514 non-null float64
C_Point                    32514 non-null float64
compliment_PRA             32514 non-null flo

In [24]:
train_data_Jr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120282 entries, 66 to 375129
Data columns (total 26 columns):
complaint_ECV              120282 non-null float64
compliment_COR             120282 non-null float64
complaint_ICA              120282 non-null float64
overlap_num                120282 non-null int64
complaint_EAV              120282 non-null float64
MaterialID                 120282 non-null int64
compliment_INT             120282 non-null float64
complaint_DFV              120282 non-null float64
count_materials_points     120282 non-null float64
attend_level               120282 non-null int64
count_overall_points       120282 non-null float64
M_Point                    120282 non-null float64
complaint_BOR              120282 non-null float64
complaint_OFA              120282 non-null float64
complaint_DFG              120282 non-null float64
complaint_EAG              120282 non-null float64
C_Point                    120282 non-null float64
compliment_PRA             

In [25]:
test_data_Jr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36852 entries, 70 to 375133
Data columns (total 26 columns):
complaint_ECV              36852 non-null float64
compliment_COR             36852 non-null float64
complaint_ICA              36852 non-null float64
overlap_num                36852 non-null int64
complaint_EAV              36852 non-null float64
MaterialID                 36852 non-null int64
compliment_INT             36852 non-null float64
complaint_DFV              36852 non-null float64
count_materials_points     36852 non-null float64
attend_level               36852 non-null int64
count_overall_points       36852 non-null float64
M_Point                    36852 non-null float64
complaint_BOR              36852 non-null float64
complaint_OFA              36852 non-null float64
complaint_DFG              36852 non-null float64
complaint_EAG              36852 non-null float64
C_Point                    36852 non-null float64
compliment_PRA             36852 non-null flo

In [26]:
train_data_AD = train_data_AD.dropna()
test_data_AD = test_data_AD.dropna()
train_data_Jr = train_data_Jr.dropna()
test_data_Jr = test_data_Jr.dropna()

In [27]:
label_AD = np.array(train_data_AD['label'])
label_Jr = np.array(train_data_Jr['label'])
train_data_AD = train_data_AD[list(set(train_data_AD.columns)-{'label'})]
train_data_Jr = train_data_Jr[list(set(train_data_Jr.columns)-{'label'})]
ground_truth_AD = np.array(test_data_AD['label'])
ground_truth_Jr = np.array(test_data_Jr['label'])
test_data_AD = test_data_AD[list(set(test_data_AD.columns)-{'label'})]
test_data_Jr = test_data_Jr[list(set(test_data_Jr.columns)-{'label'})]

In [28]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
def train_model(train_data,label):
    train_data = np.array(train_data)
    rf = RandomForestRegressor()
    rf.fit(train_data, label)
    return rf

In [29]:
model_AD = train_model(train_data=train_data_AD, label=label_AD)
model_Jr = train_model(train_data=train_data_Jr, label=label_Jr)



In [30]:
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
def predict_score(pred_prob, Y_test_array, binary_threshold=0.5):
    pred_one_hot = list()
    for i in range(pred_prob.shape[0]):
        if pred_prob[i] >= binary_threshold:
            pred_one_hot.append(1)
        else:
            pred_one_hot.append(0)
    print(metrics.classification_report(list(Y_test_array), pred_one_hot))
    print('---------------------------------------')
    print('Confusion Matrix')
    print(np.transpose(confusion_matrix(list(Y_test_array), pred_one_hot).T))
    print('---------------------------------------')
    print('positive label : 1 | negative label : 0')

In [31]:
test_data_AD_array = np.array(test_data_AD)
y_pred_AD = model_AD.predict(test_data_AD_array)
predict_score(y_pred_AD,ground_truth_AD)

              precision    recall  f1-score   support

         0.0       0.70      0.64      0.67     12074
         1.0       0.80      0.84      0.82     20440

    accuracy                           0.77     32514
   macro avg       0.75      0.74      0.74     32514
weighted avg       0.76      0.77      0.76     32514

---------------------------------------
Confusion Matrix
[[ 7713  4361]
 [ 3231 17209]]
---------------------------------------
positive label : 1 | negative label : 0


In [32]:
ground_truth_AD_list = list(ground_truth_AD)
pos,neg = 0,0
for val in ground_truth_AD_list:
    if int(val) == 1:
        pos +=1
    else:
        neg +=1
import random
random_pred = np.array([random.sample([1,0],1)[0] for _ in range(len(ground_truth_AD))])
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
predict_score(random_pred,ground_truth_AD_list)

              precision    recall  f1-score   support

         0.0       0.37      0.50      0.43     12074
         1.0       0.63      0.50      0.56     20440

    accuracy                           0.50     32514
   macro avg       0.50      0.50      0.49     32514
weighted avg       0.54      0.50      0.51     32514

---------------------------------------
Confusion Matrix
[[ 6094  5980]
 [10210 10230]]
---------------------------------------
positive label : 1 | negative label : 0


In [33]:
test_data_Jr_array = np.array(test_data_Jr)
y_pred_Jr = model_Jr.predict(test_data_Jr_array)
predict_score(y_pred_Jr,ground_truth_Jr)

              precision    recall  f1-score   support

         0.0       0.45      0.34      0.39     12546
         1.0       0.70      0.79      0.74     24306

    accuracy                           0.64     36852
   macro avg       0.58      0.56      0.56     36852
weighted avg       0.61      0.64      0.62     36852

---------------------------------------
Confusion Matrix
[[ 4297  8249]
 [ 5201 19105]]
---------------------------------------
positive label : 1 | negative label : 0


In [34]:
ground_truth_Jr_list = list(ground_truth_Jr)
pos,neg = 0,0
for val in ground_truth_Jr_list:
    if int(val) == 1:
        pos +=1
    else:
        neg +=1
import random
random_pred = np.array([random.sample([1,0],1)[0] for _ in range(len(ground_truth_Jr_list))])
import sklearn.metrics as metrics 
from sklearn.metrics import confusion_matrix
predict_score(random_pred,ground_truth_Jr_list)

              precision    recall  f1-score   support

         0.0       0.34      0.50      0.40     12546
         1.0       0.66      0.49      0.56     24306

    accuracy                           0.50     36852
   macro avg       0.50      0.50      0.48     36852
weighted avg       0.55      0.50      0.51     36852

---------------------------------------
Confusion Matrix
[[ 6265  6281]
 [12287 12019]]
---------------------------------------
positive label : 1 | negative label : 0
