In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import math
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report

## Feature Engineer

In [2]:
def FFT(xreal, ximag):    
    n = 2
    while(n*2 <= len(xreal)):
        n *= 2
    
    p = int(math.log(n, 2))
    
    for i in range(0, n):
        a = i
        b = 0
        for j in range(0, p):
            b = int(b*2 + a%2)
            a = a//2
        if(b > i):
            xreal[i], xreal[b] = xreal[b], xreal[i]
            ximag[i], ximag[b] = ximag[b], ximag[i]
            
    wreal = []
    wimag = []
        
    arg = float(-2 * math.pi / n)
    treal = float(math.cos(arg))
    timag = float(math.sin(arg))
    
    wreal.append(float(1.0))
    wimag.append(float(0.0))
    
    for j in range(1, int(n/2)):
        wreal.append(wreal[-1] * treal - wimag[-1] * timag)
        wimag.append(wreal[-1] * timag + wimag[-1] * treal)
        
    m = 2
    while(m < n + 1):
        for k in range(0, n, m):
            for j in range(0, int(m/2), 1):
                index1 = k + j
                index2 = int(index1 + m / 2)
                t = int(n * j / m)
                treal = wreal[t] * xreal[index2] - wimag[t] * ximag[index2]
                timag = wreal[t] * ximag[index2] + wimag[t] * xreal[index2]
                ureal = xreal[index1]
                uimag = ximag[index1]
                xreal[index1] = ureal + treal
                ximag[index1] = uimag + timag
                xreal[index2] = ureal - treal
                ximag[index2] = uimag - timag
        m *= 2
        
    return n, xreal, ximag   

In [3]:
def FFT_data(input_data, swinging_times):   
    txtlength = swinging_times[-1] - swinging_times[0]
    a_mean = [0] * txtlength
    g_mean = [0] * txtlength
       
    for num in range(len(swinging_times)-1):
        a = []
        g = []
        for swing in range(swinging_times[num], swinging_times[num+1]):
            a.append(math.sqrt(math.pow((input_data[swing][0] + input_data[swing][1] + input_data[swing][2]), 2)))
            g.append(math.sqrt(math.pow((input_data[swing][3] + input_data[swing][4] + input_data[swing][5]), 2)))

        a_mean[num] = (sum(a) / len(a))
        g_mean[num] = (sum(g) / len(g)) # flag
    
    return a_mean, g_mean

In [4]:
def feature(input_data, swinging_now, swinging_times, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer):
    allsum = []
    mean = []
    var = []
    rms = []
    XYZmean_a = 0
    a = []
    g = []
    a_s1 = 0
    a_s2 = 0
    g_s1 = 0
    g_s2 = 0
    a_k1 = 0
    a_k2 = 0
    g_k1 = 0
    g_k2 = 0
    
    for i in range(len(input_data)):
        if i==0:
            allsum = input_data[i]
            a.append(math.sqrt(math.pow((input_data[i][0] + input_data[i][1] + input_data[i][2]), 2)))
            g.append(math.sqrt(math.pow((input_data[i][3] + input_data[i][4] + input_data[i][5]), 2)))
            continue
        
        a.append(math.sqrt(math.pow((input_data[i][0] + input_data[i][1] + input_data[i][2]), 2)))
        g.append(math.sqrt(math.pow((input_data[i][3] + input_data[i][4] + input_data[i][5]), 2)))
       
        allsum = [allsum[feature_index] + input_data[i][feature_index] for feature_index in range(len(input_data[i]))]
        
    mean = [allsum[feature_index] / len(input_data) for feature_index in range(len(input_data[i]))]
    
    for i in range(len(input_data)):
        if i==0:
            var = input_data[i]
            rms = input_data[i]
            continue

        var = [var[feature_index] + math.pow((input_data[i][feature_index] - mean[feature_index]), 2) for feature_index in range(len(input_data[i]))]
        rms = [rms[feature_index] + math.pow(input_data[i][feature_index], 2) for feature_index in range(len(input_data[i]))]
    
    # flag
    var = [var[feature_index] if var[feature_index] > 0 else 0 for feature_index in range(len(input_data[i]))]
    
    var = [math.sqrt((var[feature_index] / len(input_data))) for feature_index in range(len(input_data[i]))]
    rms = [math.sqrt((rms[feature_index] / len(input_data))) for feature_index in range(len(input_data[i]))]
    
    a_max = [max(a)]
    a_min = [min(a)]
    a_mean = [sum(a) / len(a)]
    g_max = [max(g)]
    g_min = [min(g)]
    g_mean = [sum(g) / len(g)]
    
    a_var = math.sqrt(math.pow((var[0] + var[1] + var[2]), 2))
    
    for i in range(len(input_data)):
        a_s1 = a_s1 + math.pow((a[i] - a_mean[0]), 4)
        a_s2 = a_s2 + math.pow((a[i] - a_mean[0]), 2)
        g_s1 = g_s1 + math.pow((g[i] - g_mean[0]), 4)
        g_s2 = g_s2 + math.pow((g[i] - g_mean[0]), 2)
        a_k1 = a_k1 + math.pow((a[i] - a_mean[0]), 3)
        g_k1 = g_k1 + math.pow((g[i] - g_mean[0]), 3)
    
    a_s1 = a_s1 / len(input_data)
    a_s2 = a_s2 / len(input_data)
    g_s1 = g_s1 / len(input_data)
    g_s2 = g_s2 / len(input_data)
    a_k2 = math.pow(a_s2, 1.5)
    g_k2 = math.pow(g_s2, 1.5)
    a_s2 = a_s2 * a_s2
    g_s2 = g_s2 * g_s2
    
    a_kurtosis = [a_s1 / a_s2]
    g_kurtosis = [g_s1 / g_s2]
    a_skewness = [a_k1 / a_k2]
    g_skewness = [g_k1 / g_k2]
    
    a_fft_mean = 0
    g_fft_mean = 0
    cut = int(n_fft / swinging_times)
    a_psd = []
    g_psd = []
    entropy_a = []
    entropy_g = []
    e1 = []
    e3 = []
    e2 = 0
    e4 = 0
    
    for i in range(cut * swinging_now, cut * (swinging_now + 1)):
        a_fft_mean += a_fft[i]
        g_fft_mean += g_fft[i]
        a_psd.append(math.pow(a_fft[i], 2) + math.pow(a_fft_imag[i], 2))
        g_psd.append(math.pow(g_fft[i], 2) + math.pow(g_fft_imag[i], 2))
        e1.append(math.pow(a_psd[-1], 0.5))
        e3.append(math.pow(g_psd[-1], 0.5))
        
    a_fft_mean = a_fft_mean / cut
    g_fft_mean = g_fft_mean / cut
    
    a_psd_mean = sum(a_psd) / len(a_psd)
    g_psd_mean = sum(g_psd) / len(g_psd)
    
    for i in range(cut):
        e2 += math.pow(a_psd[i], 0.5)
        e4 += math.pow(g_psd[i], 0.5)
    
    for i in range(cut):
        entropy_a.append((e1[i] / e2) * math.log(e1[i] / e2))
        entropy_g.append((e3[i] / e4) * math.log(e3[i] / e4))
    
    a_entropy_mean = sum(entropy_a) / len(entropy_a)
    g_entropy_mean = sum(entropy_g) / len(entropy_g)       
        
    
    output = mean + var + rms + a_max + a_mean + a_min + g_max + g_mean + g_min + [a_fft_mean] + [g_fft_mean] + [a_psd_mean] + [g_psd_mean] + a_kurtosis + g_kurtosis + a_skewness + g_skewness + [a_entropy_mean] + [g_entropy_mean]
    writer.writerow(output)

In [5]:
def data_generate(datapath = './train_data', tar_dir = 'tabular_data_train'):
    pathlist_txt = Path(datapath).glob('**/*.txt')

    
    for file in pathlist_txt:
        f = open(file)

        All_data = []

        count = 0
        for line in f.readlines():
            if line == '\n' or count == 0:
                count += 1
                continue
            num = line.split(' ')
            if len(num) > 5:
                tmp_list = []
                for i in range(6):
                    tmp_list.append(int(num[i]))
                All_data.append(tmp_list)
        
        f.close()

        swing_index = np.linspace(0, len(All_data), 28, dtype = int)
        # filename.append(int(Path(file).stem))
        # all_swing.append([swing_index])

        headerList = ['ax_mean', 'ay_mean', 'az_mean', 'gx_mean', 'gy_mean', 'gz_mean', 'ax_var', 'ay_var', 'az_var', 'gx_var', 'gy_var', 'gz_var', 'ax_rms', 'ay_rms', 'az_rms', 'gx_rms', 'gy_rms', 'gz_rms', 'a_max', 'a_mean', 'a_min', 'g_max', 'g_mean', 'g_min', 'a_fft', 'g_fft', 'a_psd', 'g_psd', 'a_kurt', 'g_kurt', 'a_skewn', 'g_skewn', 'a_entropy', 'g_entropy']                
        

        with open('./{dir}/{fname}.csv'.format(dir = tar_dir, fname = Path(file).stem), 'w', newline = '') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headerList)
            # try:
            #     a_fft, g_fft = FFT_data(All_data, swing_index)
            #     a_fft_imag = [0] * len(a_fft)
            #     g_fft_imag = [0] * len(g_fft)
            #     n_fft, a_fft, a_fft_imag = FFT(a_fft, a_fft_imag)
            #     n_fft, g_fft, g_fft_imag = FFT(g_fft, g_fft_imag)
            #     for i in range(len(swing_index)):
            #         if i==0:
            #             continue
            #         feature(All_data[swing_index[i-1]: swing_index[i]], i - 1, len(swing_index) - 1, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer)
            # except:
            #     print(Path(file).stem)
            #     continue
            a_fft, g_fft = FFT_data(All_data, swing_index)
            a_fft_imag = [0] * len(a_fft)
            g_fft_imag = [0] * len(g_fft)
            n_fft, a_fft, a_fft_imag = FFT(a_fft, a_fft_imag)
            n_fft, g_fft, g_fft_imag = FFT(g_fft, g_fft_imag)
            for i in range(len(swing_index)):
                if i==0:
                    continue
                feature(All_data[swing_index[i-1]: swing_index[i]], i - 1, len(swing_index) - 1, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer)
            

## Training

In [6]:
def display_corr(X_data, y_data, name=''):
    corr_gender = X_data.apply(lambda col: col.corr(y_data[name]))
    corr_gender = corr_gender.sort_values(ascending=False)
    print(name, 'corr')
    print(corr_gender.head(10))  # Top 10 正相關
    print(corr_gender.tail(10))  # Top 10 負相關

    # display(corr_gender)

In [7]:
# Hyper-parameter

In [8]:
def model_binary(X_train, y_train, X_test, y_test, group_size=27, n_estimators=100):
    # clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced')
    # clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=42)
    clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
    # clf = HistGradientBoostingClassifier(random_state=42)
    # clf = MLPClassifier()
    num_class = [(y_train == 0).sum(), (y_train == 1).sum()]
    num_class = {i: len(y_train) / num for i, num in enumerate(num_class)}
    sample_weight = np.zeros(len(y_train))
    for i in range(2):
        sample_weight[np.where(y_train == i)] = num_class[i]
    
    clf.fit(X_train, y_train, sample_weight=sample_weight)
        
    predicted = clf.predict_proba(X_test)
    # 取出正類（index 1）的概率
    predicted = [predicted[i][1] for i in range(len(predicted))]

    y_pred = []
    num_groups = len(predicted) // group_size 
    for i in range(num_groups):
        now_group = np.array(predicted[i*group_size: (i+1)*group_size])
        pred_label = 0 if (now_group <= 0.5).sum() > (now_group > 0.5).sum() else 1 # 決定他是0還是1
        if pred_label == 0: 
            pos_mask = now_group <= 0.5
            y_pred.append(now_group[pos_mask].min())
        else: 
            pos_mask = now_group > 0.5
            y_pred.append(now_group[pos_mask].max())            
        
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro')
    print(f'ROC {auc_score:.4f}')
    print('==========================')

In [9]:
# 定義多類別分類評分函數 (例如 play years、level)
def model_multiary(X_train, y_train, X_test, y_test, group_size=27, n_estimators=100, classifier=None):
    # clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    # clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=42)
    if classifier == 'gradient':
        clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
        num_class = []
        for i in range(max(y_train) + 1):
            num_class.append((y_train == i).sum())
        num_class = {i: len(y_train) / num for i, num in enumerate(num_class)}
        sample_weight = np.zeros(len(y_train))
        for i in range(max(y_train) + 1):
            sample_weight[np.where(y_train == i)] = num_class[i]
        print(num_class)
        clf.fit(X_train, y_train, sample_weight=sample_weight)
    else:
        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced_subsample')
        clf.fit(X_train, y_train)
    # clf = HistGradientBoostingClassifier(random_state=42)
    # clf = MLPClassifier()
    predicted = clf.predict_proba(X_test)
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
        num_classes = len(np.unique(y_train))
        group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
        label = np.bincount(group_label).argmax() # 算出最多投票
        pos_mask = group_label == label
        group_pred = group_pred[pos_mask, :].tolist()
        # 對每個類別計算該組內的總機率
        class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
        chosen_class = np.argmax(class_sums)
        candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
        best_instance = np.argmax(candidate_probs)
        y_pred.append(group_pred[best_instance])
        
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
    class_report = classification_report(y_test_agg, np.argmax(y_pred, axis=1))
    print(f'Multiary AUC: {auc_score:.4f}')
    print('class report')
    print(class_report)
    print('==========================')

In [10]:
def main():
    # 若尚未產生特徵，請先執行 data_generate() 生成特徵 CSV 檔案
    # data_generate()
    
    # 讀取訓練資訊，根據 player_id 將資料分成 80% 訓練、20% 測試
    info = pd.read_csv('train_info.csv')
    unique_players = info['player_id'].unique()
    train_players, test_players = train_test_split(unique_players, test_size=0.2, random_state=42)
    
    # 讀取特徵 CSV 檔（位於 "./tabular_data_train"）
    datapath = './tabular_data_train'
    datalist = list(Path(datapath).glob('**/*.csv'))
    target_mask = ['gender', 'hold racket handed', 'play years', 'level']
    
    # 根據 test_players 分組資料
    X_train = pd.DataFrame()
    y_train = pd.DataFrame(columns=target_mask)
    X_test = pd.DataFrame()
    y_test = pd.DataFrame(columns=target_mask)
    
    for file in datalist:
        unique_id = int(Path(file).stem)
        row = info[info['unique_id'] == unique_id]
        if row.empty:
            continue
        player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        mode = info.loc[info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        # mode_onehot[mode - 1] = 1 # mode : 1-10
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        # from GPT ==================================
        # 查看目前有幾列
        num_rows = len(data)

        # 如果不足 27 列，就補 0 到達 27 列
        if num_rows < 27:
            print('bug')
            num_missing = 27 - num_rows
            missing_rows = pd.DataFrame(0, index=range(num_missing), columns=data.columns)
            data = pd.concat([data, missing_rows], ignore_index=True)
        # from GPT ==================================

        target = row[target_mask]
        target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        if player_id in train_players:
            X_train = pd.concat([X_train, data], ignore_index=True)
            y_train = pd.concat([y_train, target_repeated], ignore_index=True)
        elif player_id in test_players:
            X_test = pd.concat([X_test, data], ignore_index=True)
            y_test = pd.concat([y_test, target_repeated], ignore_index=True)
    print('train shape', X_train.shape)
    print('test shape', X_test.shape)
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)

    # 標準化特徵
    le = LabelEncoder()
    def normalize(name, bound=10, mode=0):
        X_corr = X_train.apply(lambda col: col.corr(y_train[name]))
        X_corr = X_corr.sort_values(ascending=False)
        
        if mode == 1: # pos
            columns = X_corr.head(bound).index.tolist()
        elif mode == 2: # neg
            columns = X_corr.tail(bound).index.tolist()
        
        if mode == 0:
            X_train_func = X_train
            X_test_func = X_test
        elif mode == 1:
            X_train_func = X_train[columns]
            X_test_func = X_test[columns]
        elif mode == 2:
            X_train_func = X_train.drop(columns=columns)
            X_test_func = X_test.drop(columns=columns)
            
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_func)
        X_test_scaled = scaler.transform(X_test_func)
        return X_train_scaled, X_test_scaled
    # =====================================================================================
    # display_corr(X_train, y_train, 'level')

    # 評分：針對各目標進行模型訓練與評分
    X_train_scaled, X_test_scaled = normalize('gender', 10, 2)
    y_train_le_gender = le.fit_transform(y_train['gender'])
    y_test_le_gender = le.transform(y_test['gender'])
    model_binary(X_train_scaled, y_train_le_gender, X_test_scaled, y_test_le_gender, group_size=27)
    
    X_train_scaled, X_test_scaled = normalize('hold racket handed', mode=0)
    y_train_le_hold = le.fit_transform(y_train['hold racket handed'])
    y_test_le_hold = le.transform(y_test['hold racket handed'])
    model_binary(X_train_scaled, y_train_le_hold, X_test_scaled, y_test_le_hold, group_size=27)
    
    X_train_scaled, X_test_scaled = normalize('play years', 10, mode=0)
    y_train_le_years = le.fit_transform(y_train['play years'])
    y_test_le_years = le.transform(y_test['play years'])
    model_multiary(X_train_scaled, y_train_le_years, X_test_scaled, y_test_le_years, group_size=27, classifier='gradient')
    
    X_train_scaled, X_test_scaled = normalize('level', 10, mode=0)
    y_train_le_level = le.fit_transform(y_train['level'])
    y_test_le_level = le.transform(y_test['level'])
    model_multiary(X_train_scaled, y_train_le_level, X_test_scaled, y_test_le_level, group_size=27, n_estimators=100)

    #AUC SCORE: 0.792(gender) + 0.998(hold) + 0.660(years) + 0.822(levels)


 - Test 好像有缺失值
### Start

In [11]:
# data_generate('./train_data', 'tabular_data_train')

In [12]:
main()

train shape (42039, 35)
test shape (10746, 35)
ROC 0.9359
ROC 1.0000
{0: 5.332191780821918, 1: 2.1837307152875174, 2: 2.8206521739130435}
Multiary AUC: 0.6575
class report
              precision    recall  f1-score   support

           0       0.55      0.56      0.55        95
           1       0.49      0.54      0.51       155
           2       0.75      0.68      0.71       148

    accuracy                           0.59       398
   macro avg       0.60      0.59      0.59       398
weighted avg       0.60      0.59      0.60       398

Multiary AUC: 0.8226
class report
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       101
           1       0.00      0.00      0.00        86
           2       0.00      0.00      0.00        36
           3       0.68      0.86      0.76       175

    accuracy                           0.63       398
   macro avg       0.33      0.47      0.39       398
weighted avg       0.47      0.63

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Predict Submit

In [13]:
def make_submit():   
    train_target_mask = ['gender', 'hold racket handed', 'play years', 'level']
    test_target_mask = ['gender', 'hold racket handed']
    
    # Train Data
    train_datapath = './tabular_data_train'
    train_datalist = list(Path(train_datapath).glob('**/*.csv'))
    
    train_info = pd.read_csv('train_info.csv')
    train_players = train_info['player_id'].unique()
    
    # Test Data
    test_datapath = './tabular_data_test'
    test_datalist = list(Path(test_datapath).glob('**/*.csv'))
    
    test_info = pd.read_csv('test_info.csv')
    test_players = test_info['unique_id']
    
    # 根據 test_players 分組資料
    X_train = pd.DataFrame()
    y_train = pd.DataFrame(columns=train_target_mask)
    X_test = pd.DataFrame()
    y_test = pd.DataFrame(columns=['unique_id'] + test_target_mask)
    y_test['unique_id'] = test_players
    
    # Make Train DF
    for file in train_datalist:
        unique_id = int(Path(file).stem)
        row = train_info[train_info['unique_id'] == unique_id]
        if row.empty:
            continue
        # player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        
        mode = train_info.loc[train_info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        
        target = row[train_target_mask]
        target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_train = pd.concat([X_train, data], ignore_index=True)
        y_train = pd.concat([y_train, target_repeated], ignore_index=True)

    # Make Test DF
    for file in test_datalist:
        unique_id = int(Path(file).stem)
        row = test_info[test_info['unique_id'] == unique_id]
        if row.empty:
            continue
        # player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        if data.empty:
            print(file)
            
        mode = test_info.loc[test_info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        
        # target = row[target_mask]
        # target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_test = pd.concat([X_test, data], ignore_index=True)
        # y_test = pd.concat([y_test, target_repeated], ignore_index=True)
    print(X_train.shape)
    print(X_test.shape)
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    
    le = LabelEncoder()
    def normalize(name, bound=10, mode=0):
        X_corr = X_train.apply(lambda col: col.corr(y_train[name]))
        X_corr = X_corr.sort_values(ascending=False)
        
        if mode == 1: # pos
            columns = X_corr.head(bound).index.tolist()
        elif mode == 2: # neg
            columns = X_corr.tail(bound).index.tolist()
        
        if mode == 0:
            X_train_func = X_train
            X_test_func = X_test
        elif mode == 1:
            X_train_func = X_train[columns]
            X_test_func = X_test[columns]
        elif mode == 2:
            X_train_func = X_train.drop(columns=columns)
            X_test_func = X_test.drop(columns=columns)
            
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_func)
        X_test_scaled = scaler.transform(X_test_func)
        return X_train_scaled, X_test_scaled
    # =====================================================================================
    def model_binary_pred(X_train, y_train, X_test, y_test, name, group_size=27, n_estimators=100):
        # clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced')
        clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
        num_class = [(y_train == 0).sum(), (y_train == 1).sum()]
        num_class = {i: len(y_train) / num for i, num in enumerate(num_class)}
        sample_weight = np.zeros(len(y_train))
        for i in range(2):
            sample_weight[np.where(y_train == i)] = num_class[i]
        
        clf.fit(X_train, y_train, sample_weight=sample_weight)
        
        predicted = clf.predict_proba(X_test)
        # 取出正類（index 1）的概率
        predicted = [1 - predicted[i][1] for i in range(len(predicted))]

        y_pred = []
        num_groups = len(predicted) // group_size 
        for i in range(num_groups):
            now_group = np.array(predicted[i*group_size: (i+1)*group_size])
            pred_label = 0 if (now_group <= 0.5).sum() > (now_group > 0.5).sum() else 1 # 決定他是0還是1
            if pred_label == 0: 
                pos_mask = now_group <= 0.5
                y_pred.append(now_group[pos_mask].min())
            else: 
                pos_mask = now_group > 0.5
                y_pred.append(now_group[pos_mask].max())  
            
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        y_test[name] = y_pred
        
    # 定義多類別分類評分函數 (例如 play years、level)
    def model_multiary_pred(X_train, y_train, X_test, y_test, name:list, group_size=27, n_estimators=100, classifier=None):
        if classifier == 'gradient':
            clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
            num_class = []
            for i in range(max(y_train) + 1):
                num_class.append((y_train == i).sum())
            num_class = {i: len(y_train) / num for i, num in enumerate(num_class)}
            print(num_class)
            sample_weight = np.zeros(len(y_train))
            for i in range(max(y_train) + 1):
                sample_weight[np.where(y_train == i)] = num_class[i]
            
            clf.fit(X_train, y_train, sample_weight=sample_weight)
        else:
            clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42, class_weight='balanced')
            clf.fit(X_train, y_train)
        predicted = clf.predict_proba(X_test)
        num_groups = len(predicted) // group_size
        y_pred = []
        for i in range(num_groups):
            group_pred = np.array(predicted[i*group_size: (i+1)*group_size])
            num_classes = len(np.unique(y_train))
            group_label = np.argmax(group_pred, axis=1) # 算出每筆最大類
            label = np.bincount(group_label).argmax() # 算出最多投票
            pos_mask = group_label == label
            group_pred = group_pred[pos_mask, :].tolist()
            # 對每個類別計算該組內的總機率
            class_sums = [sum([group_pred[k][j] for k in range(len(group_pred))]) for j in range(num_classes)]
            chosen_class = np.argmax(class_sums)
            candidate_probs = [group_pred[k][chosen_class] for k in range(len(group_pred))]
            best_instance = np.argmax(candidate_probs)
            y_pred.append(group_pred[best_instance])
        y_pred = np.array(y_pred)
        # print(y_pred.shape)
        for i, n in enumerate(name):
            y_test[n] = y_pred[:, i]
        
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
        # print('Multiary AUC:', auc_score)
        
    # =====================================================================================
    # 評分：針對各目標進行模型訓練與評分
    print('Start Prediction')
    X_train_scaled, X_test_scaled = normalize('gender', 10, 2)
    y_train_le_gender = le.fit_transform(y_train['gender'])
    model_binary_pred(X_train_scaled, y_train_le_gender, X_test_scaled, y_test, 'gender', group_size=27)
    
    X_train_scaled, X_test_scaled = normalize('hold racket handed', mode=0)
    y_train_le_hold = le.fit_transform(y_train['hold racket handed'])
    model_binary_pred(X_train_scaled, y_train_le_hold, X_test_scaled, y_test, 'hold racket handed', group_size=27)
    
    X_train_scaled, X_test_scaled = normalize('play years', 10, mode=0)
    y_train_le_years = le.fit_transform(y_train['play years'])
    labels = ['play years_0', 'play years_1', 'play years_2']
    model_multiary_pred(X_train_scaled, y_train_le_years, X_test_scaled, y_test, labels, group_size=27, classifier='')
    
    X_train_scaled, X_test_scaled = normalize('level', 10, mode=0)
    y_train_le_level = le.fit_transform(y_train['level'])
    labels = ['level_2', 'level_3', 'level_4', 'level_5']
    model_multiary_pred(X_train_scaled, y_train_le_level, X_test_scaled, y_test, labels, group_size=27, n_estimators=1400)
    
    y_test.to_csv('submit.csv', index=False)
    print('End')

### Start

In [None]:
# make_submit()

: 