In [20]:
from pathlib import Path
import numpy as np
import pandas as pd
import math
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from tensorflow.keras.metrics import AUC
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import L2, L1
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import KFold

## Feature Engineer

In [21]:
def FFT(xreal, ximag):    
    n = 2
    while(n*2 <= len(xreal)):
        n *= 2
    
    p = int(math.log(n, 2))
    
    for i in range(0, n):
        a = i
        b = 0
        for j in range(0, p):
            b = int(b*2 + a%2)
            a = a//2
        if(b > i):
            xreal[i], xreal[b] = xreal[b], xreal[i]
            ximag[i], ximag[b] = ximag[b], ximag[i]
            
    wreal = []
    wimag = []
        
    arg = float(-2 * math.pi / n)
    treal = float(math.cos(arg))
    timag = float(math.sin(arg))
    
    wreal.append(float(1.0))
    wimag.append(float(0.0))
    
    for j in range(1, int(n/2)):
        wreal.append(wreal[-1] * treal - wimag[-1] * timag)
        wimag.append(wreal[-1] * timag + wimag[-1] * treal)
        
    m = 2
    while(m < n + 1):
        for k in range(0, n, m):
            for j in range(0, int(m/2), 1):
                index1 = k + j
                index2 = int(index1 + m / 2)
                t = int(n * j / m)
                treal = wreal[t] * xreal[index2] - wimag[t] * ximag[index2]
                timag = wreal[t] * ximag[index2] + wimag[t] * xreal[index2]
                ureal = xreal[index1]
                uimag = ximag[index1]
                xreal[index1] = ureal + treal
                ximag[index1] = uimag + timag
                xreal[index2] = ureal - treal
                ximag[index2] = uimag - timag
        m *= 2
        
    return n, xreal, ximag   

In [22]:
def FFT_data(input_data, swinging_times):   
    txtlength = swinging_times[-1] - swinging_times[0]
    a_mean = [0] * txtlength
    g_mean = [0] * txtlength
       
    for num in range(len(swinging_times)-1):
        a = []
        g = []
        for swing in range(swinging_times[num], swinging_times[num+1]):
            a.append(math.sqrt(math.pow((input_data[swing][0] + input_data[swing][1] + input_data[swing][2]), 2)))
            g.append(math.sqrt(math.pow((input_data[swing][3] + input_data[swing][4] + input_data[swing][5]), 2)))

        a_mean[num] = (sum(a) / len(a))
        g_mean[num] = (sum(g) / len(g)) # flag
    
    return a_mean, g_mean

In [23]:
def feature(input_data, swinging_now, swinging_times, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer):
    allsum = []
    mean = []
    var = []
    rms = []
    XYZmean_a = 0
    a = []
    g = []
    a_s1 = 0
    a_s2 = 0
    g_s1 = 0
    g_s2 = 0
    a_k1 = 0
    a_k2 = 0
    g_k1 = 0
    g_k2 = 0
    
    for i in range(len(input_data)):
        if i==0:
            allsum = input_data[i]
            a.append(math.sqrt(math.pow((input_data[i][0] + input_data[i][1] + input_data[i][2]), 2)))
            g.append(math.sqrt(math.pow((input_data[i][3] + input_data[i][4] + input_data[i][5]), 2)))
            continue
        
        a.append(math.sqrt(math.pow((input_data[i][0] + input_data[i][1] + input_data[i][2]), 2)))
        g.append(math.sqrt(math.pow((input_data[i][3] + input_data[i][4] + input_data[i][5]), 2)))
       
        allsum = [allsum[feature_index] + input_data[i][feature_index] for feature_index in range(len(input_data[i]))]
        
    mean = [allsum[feature_index] / len(input_data) for feature_index in range(len(input_data[i]))]
    
    for i in range(len(input_data)):
        if i==0:
            var = input_data[i]
            rms = input_data[i]
            continue

        var = [var[feature_index] + math.pow((input_data[i][feature_index] - mean[feature_index]), 2) for feature_index in range(len(input_data[i]))]
        rms = [rms[feature_index] + math.pow(input_data[i][feature_index], 2) for feature_index in range(len(input_data[i]))]
    
    # flag
    var = [var[feature_index] if var[feature_index] > 0 else 0 for feature_index in range(len(input_data[i]))]
    
    var = [math.sqrt((var[feature_index] / len(input_data))) for feature_index in range(len(input_data[i]))]
    rms = [math.sqrt((rms[feature_index] / len(input_data))) for feature_index in range(len(input_data[i]))]
    
    a_max = [max(a)]
    a_min = [min(a)]
    a_mean = [sum(a) / len(a)]
    g_max = [max(g)]
    g_min = [min(g)]
    g_mean = [sum(g) / len(g)]
    
    a_var = math.sqrt(math.pow((var[0] + var[1] + var[2]), 2))
    
    for i in range(len(input_data)):
        a_s1 = a_s1 + math.pow((a[i] - a_mean[0]), 4)
        a_s2 = a_s2 + math.pow((a[i] - a_mean[0]), 2)
        g_s1 = g_s1 + math.pow((g[i] - g_mean[0]), 4)
        g_s2 = g_s2 + math.pow((g[i] - g_mean[0]), 2)
        a_k1 = a_k1 + math.pow((a[i] - a_mean[0]), 3)
        g_k1 = g_k1 + math.pow((g[i] - g_mean[0]), 3)
    
    a_s1 = a_s1 / len(input_data)
    a_s2 = a_s2 / len(input_data)
    g_s1 = g_s1 / len(input_data)
    g_s2 = g_s2 / len(input_data)
    a_k2 = math.pow(a_s2, 1.5)
    g_k2 = math.pow(g_s2, 1.5)
    a_s2 = a_s2 * a_s2
    g_s2 = g_s2 * g_s2
    
    a_kurtosis = [a_s1 / a_s2]
    g_kurtosis = [g_s1 / g_s2]
    a_skewness = [a_k1 / a_k2]
    g_skewness = [g_k1 / g_k2]
    
    a_fft_mean = 0
    g_fft_mean = 0
    cut = int(n_fft / swinging_times)
    a_psd = []
    g_psd = []
    entropy_a = []
    entropy_g = []
    e1 = []
    e3 = []
    e2 = 0
    e4 = 0
    
    for i in range(cut * swinging_now, cut * (swinging_now + 1)):
        a_fft_mean += a_fft[i]
        g_fft_mean += g_fft[i]
        a_psd.append(math.pow(a_fft[i], 2) + math.pow(a_fft_imag[i], 2))
        g_psd.append(math.pow(g_fft[i], 2) + math.pow(g_fft_imag[i], 2))
        e1.append(math.pow(a_psd[-1], 0.5))
        e3.append(math.pow(g_psd[-1], 0.5))
        
    a_fft_mean = a_fft_mean / cut
    g_fft_mean = g_fft_mean / cut
    
    a_psd_mean = sum(a_psd) / len(a_psd)
    g_psd_mean = sum(g_psd) / len(g_psd)
    
    for i in range(cut):
        e2 += math.pow(a_psd[i], 0.5)
        e4 += math.pow(g_psd[i], 0.5)
    
    for i in range(cut):
        entropy_a.append((e1[i] / e2) * math.log(e1[i] / e2))
        entropy_g.append((e3[i] / e4) * math.log(e3[i] / e4))
    
    a_entropy_mean = sum(entropy_a) / len(entropy_a)
    g_entropy_mean = sum(entropy_g) / len(entropy_g)       
        
    
    output = mean + var + rms + a_max + a_mean + a_min + g_max + g_mean + g_min + [a_fft_mean] + [g_fft_mean] + [a_psd_mean] + [g_psd_mean] + a_kurtosis + g_kurtosis + a_skewness + g_skewness + [a_entropy_mean] + [g_entropy_mean]
    writer.writerow(output)

In [24]:
def data_generate(datapath = './train_data', tar_dir = 'tabular_data_train'):
    pathlist_txt = Path(datapath).glob('**/*.txt')

    
    for file in pathlist_txt:
        f = open(file)

        All_data = []

        count = 0
        for line in f.readlines():
            if line == '\n' or count == 0:
                count += 1
                continue
            num = line.split(' ')
            if len(num) > 5:
                tmp_list = []
                for i in range(6):
                    tmp_list.append(int(num[i]))
                All_data.append(tmp_list)
        
        f.close()

        swing_index = np.linspace(0, len(All_data), 28, dtype = int)
        # filename.append(int(Path(file).stem))
        # all_swing.append([swing_index])

        headerList = ['ax_mean', 'ay_mean', 'az_mean', 'gx_mean', 'gy_mean', 'gz_mean', 'ax_var', 'ay_var', 'az_var', 'gx_var', 'gy_var', 'gz_var', 'ax_rms', 'ay_rms', 'az_rms', 'gx_rms', 'gy_rms', 'gz_rms', 'a_max', 'a_mean', 'a_min', 'g_max', 'g_mean', 'g_min', 'a_fft', 'g_fft', 'a_psd', 'g_psd', 'a_kurt', 'g_kurt', 'a_skewn', 'g_skewn', 'a_entropy', 'g_entropy']                
        

        with open('./{dir}/{fname}.csv'.format(dir = tar_dir, fname = Path(file).stem), 'w', newline = '') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headerList)
            # try:
            #     a_fft, g_fft = FFT_data(All_data, swing_index)
            #     a_fft_imag = [0] * len(a_fft)
            #     g_fft_imag = [0] * len(g_fft)
            #     n_fft, a_fft, a_fft_imag = FFT(a_fft, a_fft_imag)
            #     n_fft, g_fft, g_fft_imag = FFT(g_fft, g_fft_imag)
            #     for i in range(len(swing_index)):
            #         if i==0:
            #             continue
            #         feature(All_data[swing_index[i-1]: swing_index[i]], i - 1, len(swing_index) - 1, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer)
            # except:
            #     print(Path(file).stem)
            #     continue
            a_fft, g_fft = FFT_data(All_data, swing_index)
            a_fft_imag = [0] * len(a_fft)
            g_fft_imag = [0] * len(g_fft)
            n_fft, a_fft, a_fft_imag = FFT(a_fft, a_fft_imag)
            n_fft, g_fft, g_fft_imag = FFT(g_fft, g_fft_imag)
            for i in range(len(swing_index)):
                if i==0:
                    continue
                feature(All_data[swing_index[i-1]: swing_index[i]], i - 1, len(swing_index) - 1, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer)
            

## Training

In [25]:
def roc(multi_label=False):
    return AUC(name='auc', curve='ROC', multi_label=multi_label)

In [26]:
gender_model = Sequential([
    # layers.Dense(200, activation='relu'), 
    # layers.Dropout(0.5), 
    layers.Dense(25, activation='relu'), 
    layers.Dropout(0.75), 
    layers.Dense(2, activation='sigmoid'), 
])
gender_model.compile(optimizer=RMSprop(0.01), loss='categorical_crossentropy', metrics=['acc', roc()])

hand_model = Sequential([
    layers.Dense(120, activation='relu'),
    layers.Dropout(0.5), 
    layers.Dense(50, activation='relu'), 
    layers.Dropout(0.5), 
    layers.Dense(2, activation='sigmoid'), 
])
hand_model.compile(loss='categorical_crossentropy', metrics=['acc', roc()])

In [27]:
reg = L2(0.0005)
level_model = Sequential([
    layers.Dense(120, activation='relu', kernel_regularizer=reg),
    layers.Dropout(0.5), 
    layers.Dense(60, activation='relu', kernel_regularizer=reg), 
    layers.Dropout(0.5), 
    layers.Dense(10, activation='relu', kernel_regularizer=reg), 
    layers.Dropout(0.5), 
    layers.Dense(4, activation='softmax'), 
])
level_model.compile(optimizer=RMSprop(0.001), loss='categorical_crossentropy', metrics=['acc', roc(True)])

year_model = Sequential([
    layers.Input((35)),
    layers.Dense(25, activation='relu'), 
    layers.Dropout(0.5), 
    layers.Dense(3, activation='softmax'), 
])
year_model.compile(loss='categorical_crossentropy', metrics=['acc', roc(True)])

### Setting Callbacks

In [28]:
def early(patience):
    return EarlyStopping(monitor='val_auc', patience=patience, mode='max', restore_best_weights=True)
scheduler = ReduceLROnPlateau(monitor='val_auc', patience=2, mode='max', factor=math.exp(-0.1))

### Functions

In [29]:
def display_corr(X_data, y_data, name=''):
    corr_gender = X_data.apply(lambda col: col.corr(y_data[name]))
    corr_gender = corr_gender.sort_values(ascending=False)
    print(name, 'corr')
    print(corr_gender.head(10))  # Top 10 正相關
    print(corr_gender.tail(10))  # Top 10 負相關

    # display(corr_gender)

In [30]:
# Hyper-parameter

In [31]:
def model_binary(X, y, group_size=27, n_splits=5, n_estimators=100, 
                        model=None, epochs=1, batch_size=32, class_weight=None, patience=5):

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = model  # 注意：你如果是要每折重新初始化模型，這裡要改

        clf.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test),
                class_weight=class_weight, verbose=0,
                callbacks=[early(patience), scheduler])

        predicted = clf.predict(X_test)
        predicted = [predicted[i][1] for i in range(len(predicted))]

        num_groups = len(predicted) // group_size
        if sum(predicted[:group_size]) / group_size > 0.5:
            y_pred = [max(predicted[i*group_size: (i+1)*group_size]) for i in range(num_groups)]
        else:
            y_pred = [min(predicted[i*group_size: (i+1)*group_size]) for i in range(num_groups)]

        y_test_agg = [y_test[i*group_size][1] for i in range(num_groups)]

        auc_score = roc_auc_score(y_test_agg, y_pred, average='micro')
        print(f'Fold ROC {auc_score:.4f}')
        auc_scores.append(auc_score)

    print(f'Mean ROC AUC over {n_splits} folds: {sum(auc_scores)/len(auc_scores):.4f}')

In [32]:
def model_multiary(X, y, group_size=27, n_splits=5, n_estimators=100,
                   model=None, epochs=1, batch_size=32, class_weight=None, patience=5):

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = model  # 同樣地：如果想每折重建模型可以另外處理

        clf.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test),
                class_weight=class_weight, verbose=0,
                callbacks=[early(patience), scheduler])

        predicted = clf.predict(X_test)

        num_groups = len(predicted) // group_size
        y_pred = []
        for i in range(num_groups):
            group_pred = predicted[i*group_size: (i+1)*group_size]
            num_classes = len(np.unique(y_train))
            class_sums = [sum([group_pred[k][j] for k in range(group_size)]) for j in range(num_classes)]
            chosen_class = np.argmax(class_sums)
            candidate_probs = [group_pred[k][chosen_class] for k in range(group_size)]
            best_instance = np.argmax(candidate_probs)
            y_pred.append(group_pred[best_instance])

        y_test_agg = [y_test[i*group_size] for i in range(num_groups)]

        auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
        print(f'Fold Multiary AUC: {auc_score:.4f}')
        auc_scores.append(auc_score)

    print(f'Mean Multiary AUC over {n_splits} folds: {sum(auc_scores)/len(auc_scores):.4f}')


In [33]:
def main():
    global year_model, gender_model, level_model, hand_model
    # 若尚未產生特徵，請先執行 data_generate() 生成特徵 CSV 檔案
    # data_generate()
    
    # 讀取訓練資訊，根據 player_id 將資料分成 80% 訓練、20% 測試
    info = pd.read_csv('train_info.csv')
    unique_players = info['player_id'].unique()
    # train_players, test_players = train_test_split(unique_players, test_size=0.2, random_state=42)
    
    # 讀取特徵 CSV 檔（位於 "./tabular_data_train"）
    datapath = './tabular_data_train'
    datalist = list(Path(datapath).glob('**/*.csv'))
    target_mask = ['gender', 'hold racket handed', 'play years', 'level']
    
    # 根據 test_players 分組資料
    X_train = pd.DataFrame()
    y_train = pd.DataFrame(columns=target_mask)
    # X_test = pd.DataFrame()
    # y_test = pd.DataFrame(columns=target_mask)
    
    for file in datalist:
        unique_id = int(Path(file).stem)
        row = info[info['unique_id'] == unique_id]
        if row.empty:
            continue
        player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        mode = info.loc[info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        # mode_onehot[mode - 1] = 1 # mode : 1-10
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        # from GPT ==================================
        # 查看目前有幾列
        num_rows = len(data)

        # 如果不足 27 列，就補 0 到達 27 列
        if num_rows < 27:
            print('bug')
            num_missing = 27 - num_rows
            missing_rows = pd.DataFrame(0, index=range(num_missing), columns=data.columns)
            data = pd.concat([data, missing_rows], ignore_index=True)
        # from GPT ==================================

        target = row[target_mask]
        target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_train = pd.concat([X_train, data], ignore_index=True)
        y_train = pd.concat([y_train, target_repeated], ignore_index=True)
    print('train shape', X_train.shape)
    X_train.columns = X_train.columns.astype(str)
    
    # 標準化特徵
    le = LabelEncoder()
    def normalize(name, bound=10, mode=0):
        X_corr = X_train.apply(lambda col: col.corr(y_train[name]))
        X_corr = X_corr.sort_values(ascending=False)
        
        if mode == 1: # pos
            columns = X_corr.head(bound).index.tolist()
        elif mode == 2: # neg
            columns = X_corr.tail(bound).index.tolist()
        
        if mode == 0:
            X_train_func = X_train
        elif mode == 1:
            X_train_func = X_train[columns]
        elif mode == 2:
            X_train_func = X_train.drop(columns=columns)
            
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_func)
        return X_train_scaled
    # =====================================================================================
    # display_corr(X_train, y_train, 'level')
    total_num = len(y_train)

    # 評分：針對各目標進行模型訓練與評分
    X_train_scaled = normalize('gender', 10, 2)
    y_train_le_gender = to_categorical(le.fit_transform(y_train['gender']), 2)
    class_weight = [(y_train['gender'] == 1).sum(), (y_train['gender'] == 2).sum()]
    class_weight = {idx: total_num / num for idx, num in enumerate(class_weight)}
    model_binary(X_train_scaled, y_train_le_gender, 
                 model=gender_model, class_weight=class_weight, epochs=200, patience=7)
    
    X_train_scaled = normalize('hold racket handed', mode=0)
    y_train_le_hold = to_categorical(le.fit_transform(y_train['hold racket handed']), 2)
    class_weight = [(y_train['hold racket handed'] == 1).sum(), (y_train['hold racket handed'] == 2).sum()]
    class_weight = {idx: total_num / num for idx, num in enumerate(class_weight)}
    model_binary(X_train_scaled, y_train_le_hold, 
                 model=hand_model, class_weight=class_weight, epochs=20, patience=2)
    
    X_train_scaled = normalize('play years', 15, mode=0)
    y_train_le_years = to_categorical(y_train['play years'], 3)
    class_weight = [(y_train['play years'] == 0).sum(), (y_train['play years'] == 1).sum(), (y_train['play years'] == 2).sum()]
    class_weight = {idx: total_num / num for idx, num in enumerate(class_weight)}
    model_multiary(X_train_scaled, y_train_le_years,
                   model=year_model, class_weight=class_weight, epochs=20, patience=7)
    
    X_train_scaled = normalize('level', 15, mode=0)
    y_train_le_level = to_categorical(le.fit_transform(y_train['level']), 4)
    class_weight = [(y_train['level'] == 2).sum(), (y_train['level'] == 3).sum(), (y_train['level'] == 4).sum(), (y_train['level'] == 5).sum()]
    class_weight = {idx: total_num / num for idx, num in enumerate(class_weight)}
    model_multiary(X_train_scaled, y_train_le_level,
                   model=level_model, class_weight=class_weight, epochs=40)

    #AUC SCORE: 0.792(gender) + 0.998(hold) + 0.660(years) + 0.822(levels)


 - Test 好像有缺失值
### Start

In [34]:
# data_generate('./train_data', 'tabular_data_train')

In [35]:
main()

train shape (52785, 35)
Fold ROC 0.6783
Fold ROC 0.6000
Fold ROC 0.6445
Fold ROC 0.6376
Fold ROC 0.6271
Mean ROC AUC over 5 folds: 0.6375
Fold ROC 0.8965
Fold ROC 0.8884
Fold ROC 0.8410
Fold ROC 0.8383
Fold ROC 0.8529
Mean ROC AUC over 5 folds: 0.8634
Fold Multiary AUC: 0.7033
Fold Multiary AUC: 0.7700
Fold Multiary AUC: 0.7557
Fold Multiary AUC: 0.7625
Fold Multiary AUC: 0.7451
Mean Multiary AUC over 5 folds: 0.7473
Fold Multiary AUC: 0.8736
Fold Multiary AUC: 0.8793
Fold Multiary AUC: 0.8746
Fold Multiary AUC: 0.8690
Fold Multiary AUC: 0.8804
Mean Multiary AUC over 5 folds: 0.8754


## Predict Submit

In [36]:
def make_submit():   
    train_target_mask = ['gender', 'hold racket handed', 'play years', 'level']
    test_target_mask = ['gender', 'hold racket handed']
    
    # Train Data
    train_datapath = './tabular_data_train'
    train_datalist = list(Path(train_datapath).glob('**/*.csv'))
    
    train_info = pd.read_csv('train_info.csv')
    train_players = train_info['player_id'].unique()
    
    # Test Data
    test_datapath = './tabular_data_test'
    test_datalist = list(Path(test_datapath).glob('**/*.csv'))
    
    test_info = pd.read_csv('test_info.csv')
    test_players = test_info['unique_id']
    
    # 根據 test_players 分組資料
    X_train = pd.DataFrame()
    y_train = pd.DataFrame(columns=train_target_mask)
    X_test = pd.DataFrame()
    y_test = pd.DataFrame(columns=['unique_id'] + test_target_mask)
    y_test['unique_id'] = test_players
    
    # Make Train DF
    for file in train_datalist:
        unique_id = int(Path(file).stem)
        row = train_info[train_info['unique_id'] == unique_id]
        if row.empty:
            continue
        # player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        
        mode = train_info.loc[train_info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        
        target = row[train_target_mask]
        target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_train = pd.concat([X_train, data], ignore_index=True)
        y_train = pd.concat([y_train, target_repeated], ignore_index=True)

    # Make Test DF
    for file in test_datalist:
        unique_id = int(Path(file).stem)
        row = test_info[test_info['unique_id'] == unique_id]
        if row.empty:
            continue
        # player_id = row['player_id'].iloc[0]
        data = pd.read_csv(file)
        if data.empty:
            print(file)
            
        mode = test_info.loc[test_info['unique_id'] == unique_id, 'mode'].values[0] # a scalar
        mode_onehot = np.zeros((1))
        mode_onehot[0] = 1 if mode >= 9 else 0
        mode_onehot = pd.DataFrame([mode_onehot] * len(data))
        
        # target = row[target_mask]
        # target_repeated = pd.concat([target] * len(data))
        data = pd.concat([data, mode_onehot], axis=1)
        X_test = pd.concat([X_test, data], ignore_index=True)
        # y_test = pd.concat([y_test, target_repeated], ignore_index=True)
    print(X_train.shape)
    print(X_test.shape)
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    
    le = LabelEncoder()
    def normalize(name, bound=10, mode=0):
        X_corr = X_train.apply(lambda col: col.corr(y_train[name]))
        X_corr = X_corr.sort_values(ascending=False)
        
        if mode == 1: # pos
            columns = X_corr.head(bound).index.tolist()
        elif mode == 2: # neg
            columns = X_corr.tail(bound).index.tolist()
        
        if mode == 0:
            X_train_func = X_train
            X_test_func = X_test
        elif mode == 1:
            X_train_func = X_train[columns]
            X_test_func = X_test[columns]
        elif mode == 2:
            X_train_func = X_train.drop(columns=columns)
            X_test_func = X_test.drop(columns=columns)
            
        scaler = MinMaxScaler()
        scaler.fit(X_train_func)
        X_test_scaled = scaler.transform(X_test_func)
        return X_test_scaled
    # =====================================================================================
    def model_binary_pred(X_test, y_test, name, group_size=27, model=None):
        clf = model        
        predicted = clf.predict(X_test)
        # 取出正類（index 1）的概率
        predicted = [1 - predicted[i][1] for i in range(len(predicted))]

        
        num_groups = len(predicted) // group_size 
        # print('num group', num_groups)
        if sum(predicted[:group_size]) / group_size > 0.5:
            y_pred = [max(predicted[i*group_size: (i+1)*group_size]) for i in range(num_groups)]
        else:
            y_pred = [min(predicted[i*group_size: (i+1)*group_size]) for i in range(num_groups)]
            
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        y_pred = [round(y_pred[i], 3) for i in range(len(y_pred))]
        y_test[name] = y_pred
        
    # 定義多類別分類評分函數 (例如 play years、level)
    def model_multiary_pred(X_test, y_test, name:list, group_size=27, model=None):
        clf = model
        predicted = clf.predict(X_test)
        num_groups = len(predicted) // group_size
        y_pred = []
        for i in range(num_groups):
            group_pred = predicted[i*group_size: (i+1)*group_size]
            num_classes = len(name)
            # 對每個類別計算該組內的總機率
            class_sums = [sum([group_pred[k][j] for k in range(group_size)]) for j in range(num_classes)]
            chosen_class = np.argmax(class_sums)
            candidate_probs = [group_pred[k][chosen_class] for k in range(group_size)]
            best_instance = np.argmax(candidate_probs)
            y_pred.append(group_pred[best_instance])
        y_pred = np.array(y_pred)
        # print(y_pred.shape)
        for i, n in enumerate(name):
            answer = [round(y_pred[j, i], 3) for j in range(len(y_pred))]
            y_test[n] = answer
        
        # y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
        # auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
        # print('Multiary AUC:', auc_score)
        
    # =====================================================================================
    # 評分：針對各目標進行模型訓練與評分
    print('Start Prediction')
    X_test_scaled = normalize('gender', 10, 2)
    model_binary_pred(X_test_scaled, y_test, 'gender', model=gender_model)
    
    X_test_scaled = normalize('hold racket handed', mode=0)
    model_binary_pred(X_test_scaled, y_test, 'hold racket handed', model=hand_model)
    
    X_test_scaled = normalize('play years', 10, mode=0)
    labels = ['play years_0', 'play years_1', 'play years_2']
    model_multiary_pred(X_test_scaled, y_test, labels, model=year_model)
    
    X_test_scaled = normalize('level', 10, mode=0)
    labels = ['level_2', 'level_3', 'level_4', 'level_5']
    model_multiary_pred(X_test_scaled, y_test, labels, model=level_model)
    
    y_test.to_csv('submit.csv', index=False)
    print('End')

### Start

In [37]:
make_submit()

(52785, 35)
(38610, 35)
Start Prediction
End
