In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import math
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform


## Training

In [2]:
def model_binary(X_train, y_train, X_test, y_test, group_size=27, n_estimators=100):
    clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
    num_class = [(y_train == 0).sum(), (y_train == 1).sum()]
    num_class = {i: len(y_train) / num for i, num in enumerate(num_class)}
    sample_weight = np.zeros(len(y_train))
    for i in range(2):
        sample_weight[np.where(y_train == i)] = num_class[i]
    
    clf.fit(X_train, y_train, sample_weight=sample_weight)
        
    predicted = clf.predict_proba(X_test)
    # 取出正類（index 1）的概率
    predicted = [predicted[i][1] for i in range(len(predicted))]

    y_pred = []
    num_groups = len(predicted) // group_size 
    for i in range(num_groups):
        now_group = np.array(predicted[i*group_size: (i+1)*group_size])
        pred_label = 0 if (now_group <= 0.5).sum() > (now_group > 0.5).sum() else 1 # 決定他是0還是1
        if pred_label == 0: 
            pos_mask = now_group <= 0.5
            y_pred.append(now_group[pos_mask].min())
        else: 
            pos_mask = now_group > 0.5
            y_pred.append(now_group[pos_mask].max())            
        
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro')
    print(f'ROC {auc_score:.4f}')
    print('==========================')

In [None]:
# 定義多類別分類評分函數 (例如 play years、level)
def model_multiary(X_train, y_train, X_test, y_test, group_size=27, n_estimators=100, classifier=None):
    y_train_binary = np.logical_or(y_train == 0, y_train == 2)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced', 
                                     criterion='entropy', oob_score=True)
    clf.fit(X_train, y_train_binary)
    
    y_test_binary = np.logical_or(y_test == 0, y_test == 2).astype(int)
    
    predicted = clf.predict_proba(X_test)
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        # 為了移除Outlier
        group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
        num_classes = len(np.unique(y_train_binary))
        group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
        label = np.bincount(group_label).argmax() # 算出最多投票
        pos_mask = group_label == label
        group_pred = group_pred[pos_mask, :].tolist()
        # 對每個類別計算該組內的總機率
        class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
        chosen_class = np.argmax(class_sums)
        candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
        best_instance = np.argmax(candidate_probs)
        y_pred.append(group_pred[best_instance])
    
    y_test_agg = [y_test_binary[i*group_size] for i in range(num_groups)]
    auc_score = roc_auc_score(y_test_agg, np.array(y_pred)[:, 1], average='micro') # , multi_class='ovr'
    class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=-1))
    print(f'Multiary AUC: {auc_score:.4f}')
    print('class report')
    print(class_report)
    print('==========================')
    
    y_pred_tmp = y_pred
    y_test_answer = np.zeros((len(y_test) // 27, 4))
    # ===================================================================================
    y_pred = y_pred_tmp # 恢復
    X_train_tmp = X_train[np.logical_or(y_train == 0, y_train == 2)]
    y_train_tmp = y_train[np.logical_or(y_train == 0, y_train == 2)]
    y_idx = np.repeat(np.arange(len(y_train) // 27), 27)[np.logical_or(y_train == 0, y_train == 2)]
    y_train_binary = y_train_tmp == 2
    clf = SVC(C=1, random_state=42, probability=True, class_weight='balanced', kernel='linear')
    rfe = RFE(clf, n_features_to_select=3)
    rfe.fit(X_train_tmp, y_train_binary)
    X_train_tmp = rfe.transform(X_train_tmp)
        
    clf.fit(X_train_tmp, y_train_binary)
        
    X_test_tmp = []
    y_test_tmp = []
    for i in range(0, len(y_pred)):
        if y_pred[i][1] >= y_pred[i][0]: # 預測為 1 時
            X_test_tmp.extend(X_test[i * 27 : (i + 1) * 27])
            y_test_tmp.extend(y_test[i * 27 : (i + 1) * 27])
    X_test_tmp = np.array(X_test_tmp); y_test_tmp = np.array(y_test_tmp)
    y_test_binary = y_test_tmp == 2
    X_test_tmp = rfe.transform(X_test_tmp)
        
    predicted = clf.predict_proba(X_test_tmp)
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        # 為了移除Outlier
        group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
        num_classes = len(np.unique(y_train_binary))
        group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
        label = np.bincount(group_label).argmax() # 算出最多投票
        pos_mask = group_label == label
        group_pred = group_pred[pos_mask, :].tolist()
        # 對每個類別計算該組內的總機率
        class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
        chosen_class = np.argmax(class_sums)
        candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
        best_instance = np.argmax(candidate_probs)
        y_pred.append(group_pred[best_instance])
        
    y_test_agg = [y_test_binary[i*group_size] for i in range(num_groups)]
    auc_score = roc_auc_score(y_test_agg, np.array(y_pred)[:, 1], average='micro') # , multi_class='ovr'
    class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=-1))
    print(f'Multiary AUC: {auc_score:.4f}')
    print('class report')
    print(class_report)
    print('==========================')
    
    for j, num in zip(y_idx, y_pred):
        y_test_answer[j][0] = num[0]
        y_test_answer[j][2] = num[1]
    # ===================================================================================
    y_pred = y_pred_tmp
    X_train_tmp = X_train[np.logical_or(y_train == 1, y_train == 3)]
    y_train_tmp = y_train[np.logical_or(y_train == 1, y_train == 3)]
    y_idx = np.repeat(np.arange(len(y_train) // 27), 27)[np.logical_or(y_train == 1, y_train == 3)]
    y_train_binary = y_train_tmp == 3
    clf = SVC(C=1, random_state=42, probability=True, class_weight='balanced', kernel='linear') # 0.7405
    rfe = RFE(clf, n_features_to_select=16)
    rfe.fit(X_train_tmp, y_train_binary)
    X_train_tmp = rfe.transform(X_train_tmp)
            
    clf.fit(X_train_tmp, y_train_binary)
            
    X_test_tmp = []
    y_test_tmp = []
    for i in range(0, len(y_pred)):
        if y_pred[i][1] < y_pred[i][0]: # 預測為 0 時
            X_test_tmp.extend(X_test[i * 27 : (i + 1) * 27])
            y_test_tmp.extend(y_test[i * 27 : (i + 1) * 27])
    X_test_tmp = np.array(X_test_tmp); y_test_tmp = np.array(y_test_tmp)
    y_test_binary = y_test_tmp == 3
    X_test_tmp = rfe.transform(X_test_tmp)
            
    predicted = clf.predict_proba(X_test_tmp)
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        # 為了移除Outlier
        group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
        num_classes = len(np.unique(y_train_binary))
        group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
        label = np.bincount(group_label).argmax() # 算出最多投票
        pos_mask = group_label == label
        group_pred = group_pred[pos_mask, :].tolist()
        # 對每個類別計算該組內的總機率
        class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
        chosen_class = np.argmax(class_sums)
        candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
        best_instance = np.argmax(candidate_probs)
        y_pred.append(group_pred[best_instance])
            
    y_test_agg = [y_test_binary[i*group_size] for i in range(num_groups)]
    auc_score = roc_auc_score(y_test_agg, np.array(y_pred)[:, 1], average='micro') # , multi_class='ovr'
    class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=-1))
    print(f'Multiary AUC: {auc_score:.4f}')
    print('class report')
    print(class_report)
    print('==========================')
    
    for j, num in zip(y_idx, y_pred):
        y_test_answer[j][1] = num[0]
        y_test_answer[j][3] = num[1]
        
    # ==================================
    num_groups = len(y_test) // group_size
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    y_test_answer_norm = y_test_answer / np.sum(y_test_answer, axis=1, keepdims=True)
    auc_score = roc_auc_score(y_test_agg, y_test_answer_norm, average='micro', multi_class='ovr') # , multi_class='ovr'
    class_report = classification_report(y_test_agg, np.argmax(y_test_answer, axis=-1))
    

In [4]:
def main():
    # 若尚未產生特徵，請先執行 data_generate() 生成特徵 CSV 檔案
    # data_generate()
    
    # 讀取訓練資訊，根據 player_id 將資料分成 80% 訓練、20% 測試
    info = pd.read_csv('train_info.csv')
    unique_players = info['player_id'].unique()
    train_players, test_players = train_test_split(unique_players, test_size=0.2, random_state=42)
    
    # 讀取特徵 CSV 檔（位於 "./tabular_data_train"）
    datapath = './tabular_data_train'
    datalist = list(Path(datapath).glob('**/*.csv'))
    target_mask = ['gender', 'hold racket handed', 'play years', 'level']
    
    # 根據 test_players 分組資料
    X_train = pd.DataFrame()
    y_train = pd.DataFrame(columns=target_mask)
    X_test = pd.DataFrame()
    y_test = pd.DataFrame(columns=target_mask)
    
    for file in datalist:
        unique_id = int(Path(file).stem)
        row = info[info['unique_id'] == unique_id]
        if row.empty:
            continue
        player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        # 後來有把mode加入考慮
        # label = 0 代表 mode 為 1 - 8
        # label = 1 代表 mode 為 9 - 10
        mode = info.loc[info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))

        target = row[target_mask]
        target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        if player_id in train_players:
            X_train = pd.concat([X_train, data], ignore_index=True)
            y_train = pd.concat([y_train, target_repeated], ignore_index=True)
        elif player_id in test_players:
            X_test = pd.concat([X_test, data], ignore_index=True)
            y_test = pd.concat([y_test, target_repeated], ignore_index=True)
    print('train shape', X_train.shape)
    print('test shape', X_test.shape)
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)

    # 標準化特徵
    le = LabelEncoder()
    def normalize(name, bound=10, mode=0):
        # 這邊的bound是搭配mode
        # 當 mode == 1 時，會選取前bound筆正相關資料
        # 當 mode == 2 時，會移除前bound筆負相關資料
        # mode == 0 時不改變(後來都設成mode = 0了)
        X_corr = X_train.apply(lambda col: col.corr(y_train[name]))
        X_corr = X_corr.sort_values(ascending=False)
        
        if mode == 1: # pos
            columns = X_corr.head(bound).index.tolist()
        elif mode == 2: # neg
            columns = X_corr.tail(bound).index.tolist()
        
        if mode == 0:
            X_train_func = X_train
            X_test_func = X_test
        elif mode == 1:
            X_train_func = X_train[columns]
            X_test_func = X_test[columns]
        elif mode == 2:
            X_train_func = X_train.drop(columns=columns)
            X_test_func = X_test.drop(columns=columns)
            
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_func)
        X_test_scaled = scaler.transform(X_test_func)
        return X_train_scaled, X_test_scaled
    # =====================================================================================
    # 評分：針對各目標進行模型訓練與評分
    
    # gender、hand、play year 沒有訓練，專心提升 level 的準確度
    # 所以沒有程式碼
    
    X_train_scaled, X_test_scaled = normalize('level', 10, mode=0)
    y_train_le_level = le.fit_transform(y_train['level'])
    y_test_le_level = le.transform(y_test['level'])
    # 進行 Oversampling
    target_class = np.bincount(y_train_le_level).max() # label = 3 的數量(最多)
    print('target class num', target_class)
    # 想法 
    # 使label = 1 or 2 能夠與 label = 3 數量相當
    # label = 0 數量夠多了，所以不用 Oversampling
    weights = {i : target_class for i in range(1, 3)}
    print('weights', weights)
    strategy = SMOTE(sampling_strategy=weights, random_state=42)
    X_train_scaled, y_train_le_level = strategy.fit_resample(X_train_scaled, y_train_le_level)
    # 預測
    model_multiary(X_train_scaled, y_train_le_level, X_test_scaled, y_test_le_level, group_size=27, n_estimators=100)

    #AUC SCORE: 0.792(gender) + 0.998(hold) + 0.660(years) + 0.822(levels)


 - Test 好像有缺失值
### Start

In [5]:
# data_generate('./train_data', 'tabular_data_train')

In [6]:
main()

train shape (42039, 35)
test shape (10746, 35)
target class num 19656
weights {1: 19656, 2: 19656}
Multiary AUC: 0.9970
class report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       261
           1       0.99      1.00      1.00       137

    accuracy                           1.00       398
   macro avg       1.00      1.00      1.00       398
weighted avg       1.00      1.00      1.00       398

Multiary AUC: 0.7827
class report
              precision    recall  f1-score   support

       False       0.77      0.91      0.83       102
        True       0.47      0.22      0.30        36

    accuracy                           0.73       138
   macro avg       0.62      0.57      0.57       138
weighted avg       0.69      0.73      0.70       138

Multiary AUC: 0.8438
class report
              precision    recall  f1-score   support

       False       0.60      0.81      0.69        86
        True       0.89      0.74   

ValueError: Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes

## Predict Submit

In [None]:
def make_submit():   
    train_target_mask = ['gender', 'hold racket handed', 'play years', 'level']
    test_target_mask = ['gender', 'hold racket handed']
    
    # Train Data
    train_datapath = './tabular_data_train'
    train_datalist = list(Path(train_datapath).glob('**/*.csv'))
    
    train_info = pd.read_csv('train_info.csv')
    train_players = train_info['player_id'].unique()
    
    # Test Data
    test_datapath = './tabular_data_test'
    test_datalist = list(Path(test_datapath).glob('**/*.csv'))
    
    test_info = pd.read_csv('test_info.csv')
    test_players = test_info['unique_id']
    
    # 根據 test_players 分組資料
    X_train = pd.DataFrame()
    y_train = pd.DataFrame(columns=train_target_mask)
    X_test = pd.DataFrame()
    y_test = pd.DataFrame(columns=['unique_id'] + test_target_mask)
    y_test['unique_id'] = test_players
    
    # Make Train DF
    for file in train_datalist:
        unique_id = int(Path(file).stem)
        row = train_info[train_info['unique_id'] == unique_id]
        if row.empty:
            continue
        # player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        
        mode = train_info.loc[train_info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        
        target = row[train_target_mask]
        target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_train = pd.concat([X_train, data], ignore_index=True)
        y_train = pd.concat([y_train, target_repeated], ignore_index=True)

    # Make Test DF
    for file in test_datalist:
        unique_id = int(Path(file).stem)
        row = test_info[test_info['unique_id'] == unique_id]
        if row.empty:
            continue
        # player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        if data.empty:
            print(file)
            
        mode = test_info.loc[test_info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        
        # target = row[target_mask]
        # target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_test = pd.concat([X_test, data], ignore_index=True)
        # y_test = pd.concat([y_test, target_repeated], ignore_index=True)
    print(X_train.shape)
    print(X_test.shape)
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    
    le = LabelEncoder()
    def normalize(name, bound=10, mode=0):
        X_corr = X_train.apply(lambda col: col.corr(y_train[name]))
        X_corr = X_corr.sort_values(ascending=False)
        
        if mode == 1: # pos
            columns = X_corr.head(bound).index.tolist()
        elif mode == 2: # neg
            columns = X_corr.tail(bound).index.tolist()
        
        if mode == 0:
            X_train_func = X_train
            X_test_func = X_test
        elif mode == 1:
            X_train_func = X_train[columns]
            X_test_func = X_test[columns]
        elif mode == 2:
            X_train_func = X_train.drop(columns=columns)
            X_test_func = X_test.drop(columns=columns)
            
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_func)
        X_test_scaled = scaler.transform(X_test_func)
        return X_train_scaled, X_test_scaled
    # =====================================================================================
    def model_binary_pred(X_train, y_train, X_test, y_test, name, group_size=27, n_estimators=100):
        # clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced')
        clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
        num_class = [(y_train == 0).sum(), (y_train == 1).sum()]
        num_class = {i: len(y_train) / num for i, num in enumerate(num_class)}
        sample_weight = np.zeros(len(y_train))
        for i in range(2):
            sample_weight[np.where(y_train == i)] = num_class[i]
        
        clf.fit(X_train, y_train, sample_weight=sample_weight)
        
        predicted = clf.predict_proba(X_test)
        # 取出正類（index 1）的概率
        predicted = [1 - predicted[i][1] for i in range(len(predicted))]

        y_pred = []
        num_groups = len(predicted) // group_size 
        for i in range(num_groups):
            now_group = np.array(predicted[i*group_size: (i+1)*group_size])
            pred_label = 0 if (now_group <= 0.5).sum() > (now_group > 0.5).sum() else 1 # 決定他是0還是1
            if pred_label == 0: 
                pos_mask = now_group <= 0.5
                y_pred.append(now_group[pos_mask].min())
            else: 
                pos_mask = now_group > 0.5
                y_pred.append(now_group[pos_mask].max())  
            
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        y_test[name] = y_pred
        
    # 定義多類別分類評分函數 (例如 play years、level)
    def model_multiary_pred(X_train, y_train, X_test, y_test, name:list, group_size=27, n_estimators=100, classifier=None):
        y_train_binary = np.logical_or(y_train == 0, y_train == 2)
        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced', 
                                        criterion='entropy', oob_score=True)
        clf.fit(X_train, y_train_binary)
        
        # y_test_binary = np.logical_or(y_test == 0, y_test == 2).astype(int)
        
        predicted = clf.predict_proba(X_test)
        num_groups = len(predicted) // group_size
        y_pred = []
        for i in range(num_groups):
            # 為了移除Outlier
            group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
            num_classes = len(np.unique(y_train_binary))
            group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
            label = np.bincount(group_label).argmax() # 算出最多投票
            pos_mask = group_label == label
            group_pred = group_pred[pos_mask, :].tolist()
            # 對每個類別計算該組內的總機率
            class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
            chosen_class = np.argmax(class_sums)
            candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
            best_instance = np.argmax(candidate_probs)
            y_pred.append(group_pred[best_instance])
        
        # y_test_agg = [y_test_binary[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, np.array(y_pred)[:, 1], average='micro') # , multi_class='ovr'
        # class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=-1))
        # print(f'Multiary AUC: {auc_score:.4f}')
        # print('class report')
        # print(class_report)
        # print('==========================')
        
        y_pred_tmp = y_pred
        y_test_answer = np.zeros((len(X_test) // 27, 4))
        # ===================================================================================
        print('predict 0 2')
        y_pred = y_pred_tmp # 恢復
        X_train_tmp = X_train[np.logical_or(y_train == 0, y_train == 2)]
        y_train_tmp = y_train[np.logical_or(y_train == 0, y_train == 2)]
        y_idx = np.repeat(np.arange(len(y_train) // 27), 27)[np.logical_or(y_train == 0, y_train == 2)]
        y_train_binary = y_train_tmp == 2
        clf = SVC(C=1, random_state=42, probability=True, class_weight='balanced', kernel='linear')
        rfe = RFE(clf, n_features_to_select=3)
        rfe.fit(X_train_tmp, y_train_binary)
        X_train_tmp = rfe.transform(X_train_tmp)
            
        clf.fit(X_train_tmp, y_train_binary)
            
        X_test_tmp = []
        # y_test_tmp = []
        for i in range(0, len(y_pred)):
            if y_pred[i][1] >= y_pred[i][0]: # 預測為 1 時
                X_test_tmp.extend(X_test[i * 27 : (i + 1) * 27])
                # y_test_tmp.extend(y_test[i * 27 : (i + 1) * 27])
        X_test_tmp = np.array(X_test_tmp); 
        # y_test_tmp = np.array(y_test_tmp)
        # y_test_binary = y_test_tmp == 2
        X_test_tmp = rfe.transform(X_test_tmp)
            
        predicted = clf.predict_proba(X_test_tmp)
        num_groups = len(predicted) // group_size
        y_pred = []
        for i in range(num_groups):
            # 為了移除Outlier
            group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
            num_classes = len(np.unique(y_train_binary))
            group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
            label = np.bincount(group_label).argmax() # 算出最多投票
            pos_mask = group_label == label
            group_pred = group_pred[pos_mask, :].tolist()
            # 對每個類別計算該組內的總機率
            class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
            chosen_class = np.argmax(class_sums)
            candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
            best_instance = np.argmax(candidate_probs)
            y_pred.append(group_pred[best_instance])
            
        # y_test_agg = [y_test_binary[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, np.array(y_pred)[:, 1], average='micro') # , multi_class='ovr'
        # class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=-1))
        # print(f'Multiary AUC: {auc_score:.4f}')
        # print('class report')
        # print(class_report)
        # print('==========================')
        
        for j, num in zip(y_idx, y_pred):
            y_test_answer[j][0] = num[0]
            y_test_answer[j][2] = num[1]
        # ===================================================================================
        print('predict 1 3')
        y_pred = y_pred_tmp
        X_train_tmp = X_train[np.logical_or(y_train == 1, y_train == 3)]
        y_train_tmp = y_train[np.logical_or(y_train == 1, y_train == 3)]
        y_idx = np.repeat(np.arange(len(y_train) // 27), 27)[np.logical_or(y_train == 1, y_train == 3)]
        y_train_binary = y_train_tmp == 3
        clf = SVC(C=1, random_state=42, probability=True, class_weight='balanced', kernel='linear') # 0.7405
        rfe = RFE(clf, n_features_to_select=16)
        rfe.fit(X_train_tmp, y_train_binary)
        X_train_tmp = rfe.transform(X_train_tmp)
                
        clf.fit(X_train_tmp, y_train_binary)
                
        X_test_tmp = []
        # y_test_tmp = []
        for i in range(0, len(y_pred)):
            if y_pred[i][1] < y_pred[i][0]: # 預測為 0 時
                X_test_tmp.extend(X_test[i * 27 : (i + 1) * 27])
                # y_test_tmp.extend(y_test[i * 27 : (i + 1) * 27])
        X_test_tmp = np.array(X_test_tmp); 
        # y_test_tmp = np.array(y_test_tmp)
        # y_test_binary = y_test_tmp == 3
        X_test_tmp = rfe.transform(X_test_tmp)
                
        predicted = clf.predict_proba(X_test_tmp)
        num_groups = len(predicted) // group_size
        y_pred = []
        for i in range(num_groups):
            # 為了移除Outlier
            group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
            num_classes = len(np.unique(y_train_binary))
            group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
            label = np.bincount(group_label).argmax() # 算出最多投票
            pos_mask = group_label == label
            group_pred = group_pred[pos_mask, :].tolist()
            # 對每個類別計算該組內的總機率
            class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
            chosen_class = np.argmax(class_sums)
            candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
            best_instance = np.argmax(candidate_probs)
            y_pred.append(group_pred[best_instance])
                
        # y_test_agg = [y_test_binary[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, np.array(y_pred)[:, 1], average='micro') # , multi_class='ovr'
        # class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=-1))
        # print(f'Multiary AUC: {auc_score:.4f}')
        # print('class report')
        # print(class_report)
        # print('==========================')
        
        for j, num in zip(y_idx, y_pred):
            y_test_answer[j][1] = num[0]
            y_test_answer[j][3] = num[1]
            
        # ==================================
        # num_groups = len(y_test) // group_size
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, y_test_answer, average='micro', multi_class='ovr') # , multi_class='ovr'
        # class_report = classification_report(y_test_agg, np.argmax(y_test_answer, axis=-1))
        
        # print(y_pred.shape)
        for i, n in enumerate(name):
            y_test[n] = y_test_answer[:, i]
        
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
        # print('Multiary AUC:', auc_score)
        
    # =====================================================================================
    # 評分：針對各目標進行模型訓練與評分
    print('Start Prediction')
    X_train_scaled, X_test_scaled = normalize('gender', 10, 2)
    y_train_le_gender = le.fit_transform(y_train['gender'])
    # model_binary_pred(X_train_scaled, y_train_le_gender, X_test_scaled, y_test, 'gender', group_size=27)
    
    X_train_scaled, X_test_scaled = normalize('hold racket handed', mode=0)
    y_train_le_hold = le.fit_transform(y_train['hold racket handed'])
    # model_binary_pred(X_train_scaled, y_train_le_hold, X_test_scaled, y_test, 'hold racket handed', group_size=27)
    
    X_train_scaled, X_test_scaled = normalize('play years', 10, mode=0)
    y_train_le_years = le.fit_transform(y_train['play years'])
    labels = ['play years_0', 'play years_1', 'play years_2']
    # model_multiary_pred(X_train_scaled, y_train_le_years, X_test_scaled, y_test, labels, group_size=27, classifier='')
    
    X_train_scaled, X_test_scaled = normalize('level', 10, mode=0)
    y_train_le_level = le.fit_transform(y_train['level'])
    labels = ['level_2', 'level_3', 'level_4', 'level_5']
    target_class = np.bincount(y_train_le_level).max()
    print('target class num', target_class)
    weights = {i : target_class for i in range(1, 3)}
    print('weights', weights)
    strategy = SMOTE(sampling_strategy=weights)
    X_train_scaled, y_train_le_level = strategy.fit_resample(X_train_scaled, y_train_le_level)
    model_multiary_pred(X_train_scaled, y_train_le_level, X_test_scaled, y_test, labels, group_size=27, n_estimators=100)
    
    y_test.to_csv('submit.csv', index=False)
    print('End')

### Start

In [None]:
make_submit()

(52785, 35)
(38610, 35)
Start Prediction
target class num 24381
weights {1: 24381, 2: 24381}
predict 0 2
predict 1 3
End
