In [5]:
# Name of the player to process; every input/output path below is built from it.
target_player = '孫穎莎(RSH)'

#合併多場比賽之數據

In [6]:
import pandas as pd
import os

def merge_csv_files():
    """Merge all CSV files of the target player into one header-less CSV.

    Reads every ``*.csv`` file found in ``{target_player}_data_file``
    (``target_player`` is a module-level global), concatenates them and writes
    the result to ``merge_data/{target_player}_merged_data.csv`` under the
    directory two levels above the input directory.  Because the input
    directory is a bare relative name, that parent resolves to ``''`` and the
    output lands in ``merge_data/`` below the current working directory.

    For each input file the first line is skipped and the next line is
    consumed by pandas as the header; the output is written without a header,
    so neither of those two lines reaches the merged file.

    Prints the output path on success, or a notice when no CSV file is found.
    """
    input_directory = f"{target_player}_data_file"

    upper_two_levels = os.path.dirname(os.path.dirname(input_directory))

    # Path of the merged output file.
    output_file = os.path.join(upper_two_levels, f"merge_data/{target_player}_merged_data.csv")

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the merged file reproducible from run to run.
    all_data = [
        pd.read_csv(os.path.join(input_directory, filename), skiprows=1)
        for filename in sorted(os.listdir(input_directory))
        if filename.endswith('.csv')
    ]

    if not all_data:
        print("未找到CSV檔案")
        return

    merged_data = pd.concat(all_data, ignore_index=True)

    # Create the output directory on first use instead of failing in to_csv.
    output_directory = os.path.dirname(output_file)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Write the merged rows without index or header row.
    merged_data.to_csv(output_file, index=False, header=False, encoding="utf-8-sig")
    print(f"檔案已合併並儲存至: {output_file}")

# Run the merge when this code is executed as the main program.
if __name__ == "__main__":
    merge_csv_files()

檔案已合併並儲存至: merge_data/孫穎莎(RSH)_merged_data.csv


#提取指定選手的資料(包含變線處理、階段、得分狀態、遺失值處理) -> 輸出為{target_player}_data

In [7]:
def get_area(point):
    """Return the table-area label for a landing-point code.

    Codes 3 and 6 map to "左手區", 2 and 5 to "中路區", 1 and 4 to "反手區".
    Any other value maps to "其他".
    """
    # NOTE(review): "左手區" may be intended as "正手區" -- confirm with the
    # data owner before changing, since the label is written to the output.
    area_table = (
        ((3, 6), "左手區"),
        ((2, 5), "中路區"),
        ((1, 4), "反手區"),
    )
    for codes, label in area_table:
        if point in codes:
            return label
    return "其他"

def _parse_code(value):
    """Return ``value`` as an int when its text (ignoring dots) is all digits.

    Anything else (letter codes, NaN, negative numbers) is returned unchanged.
    Text that passes the digit check but is not a valid number, such as
    ``'1.2.3'``, raises ``ValueError``.
    """
    if str(value).replace('.', '').isdigit():
        return int(float(value))
    return value


def extract_player_data(file_path, target_player):
    """Collect the strokes played by ``target_player`` from a merged rally CSV.

    The file is read without a header row.  Each row is one rally: the first
    seven columns are rally-level fields (stored under 局數, 球序, 發球者姓名,
    比分, 勝負, 板數, 發球者代碼) and the remaining columns come in groups of
    seven, one group per stroke (hand marker, two landing codes, technique,
    speed, spin, length).  The server (column 2) is treated as playing the odd
    strokes, so the target player's strokes are the odd ones when serving and
    the even ones otherwise.

    Reading of a rally stops at the first stroke whose hand marker is missing
    or starts with ``"S"``.  A stroke whose codes cannot be parsed is skipped,
    but it still counts as a stroke so later strokes keep the correct number
    and owner.

    The 變線 column is "未變線" for the first stroke.  For later strokes it
    compares the area (``get_area``) of the landing code remembered from the
    most recent non-collected stroke with the area of this stroke's 對方落點:
    ``"<previous>-><current>"`` when they differ and the current area is not
    "其他", otherwise "未變線".

    Args:
        file_path: Path of the header-less merged CSV file.
        target_player: Name compared against the server-name column.

    Returns:
        A DataFrame with one row per collected stroke; it has no columns when
        nothing was collected.
    """
    df = pd.read_csv(file_path, header=None)
    player_data = []

    for _, row in df.iterrows():
        basic_info = row[:7].values
        is_server = row[2] == target_player
        current_data = row[7:].values
        prev_opponent_landing = None

        # rally_count is the 1-based stroke number.  It is driven by
        # enumerate so that it advances for every stroke, including the
        # ones skipped because their codes are malformed.
        for rally_count, i in enumerate(range(0, len(current_data), 7), start=1):
            marker = current_data[i]
            # Every prefix previously listed ('S', 'SG', 'SM') starts with 'S'.
            if pd.isna(marker) or str(marker).startswith('S'):
                break

            should_collect = (is_server and rally_count % 2 == 1) or (not is_server and rally_count % 2 == 0)
            if should_collect and i + 6 < len(current_data):
                try:
                    (landing_point, opponent_landing, technique,
                     speed, spin, length) = (
                        _parse_code(value) for value in current_data[i + 1:i + 7]
                    )
                except (ValueError, OverflowError):
                    # Malformed numeric text: drop this stroke only.
                    continue

                line_change = "未變線"
                if rally_count != 1 and isinstance(prev_opponent_landing, (int, float)):
                    current_area = get_area(opponent_landing)
                    if current_area != "其他":
                        prev_area = get_area(prev_opponent_landing)
                        if prev_area != current_area:
                            line_change = f"{prev_area}->{current_area}"

                player_data.append({
                    '局數': basic_info[0],
                    '球序': basic_info[1],
                    '發球者姓名': basic_info[2],
                    '比分': basic_info[3],
                    '勝負': basic_info[4],
                    '板數': basic_info[5],
                    '發球者代碼': basic_info[6],
                    '板序': rally_count,
                    '正反手': marker,
                    '己方落點': landing_point,
                    '對方落點': opponent_landing,
                    '技術': technique,
                    '速度': speed,
                    '旋轉': spin,
                    '長短': length,
                    '變線': line_change,
                })
            else:
                # Remember this stroke's first landing code for the next
                # line-change comparison; non-numeric or missing -> None.
                try:
                    candidate = current_data[i + 1]
                    if str(candidate).replace('.', '').isdigit():
                        prev_opponent_landing = int(float(candidate))
                    else:
                        prev_opponent_landing = None
                except (IndexError, ValueError, OverflowError):
                    prev_opponent_landing = None

    result_df = pd.DataFrame(player_data)
    return result_df


output_file = f"all_player_data/{target_player}_data.csv"
df = extract_player_data(f"merge_data/{target_player}_merged_data.csv", target_player)

# Row count before filtering.
before_count = len(df)
# Drop every stroke that has the code 9 in any stroke-attribute column
# (presumably the "missing value" code -- confirm against the coding sheet).
columns_to_check = ['正反手', '己方落點', '對方落點', '技術', '速度', '旋轉' , '長短']
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the original (avoids SettingWithCopyWarning).
df = df[~df[columns_to_check].isin([9]).any(axis=1)].copy()
# Row count after filtering.
after_count = len(df)

# Add the phase column; every stroke starts in the default phase.
df['階段'] = '相持段'

# Target player serving: strokes 1 and 3 of rallies shorter than 5 strokes.
mask_server = (df['發球者姓名'] == target_player) & (df['板序'].isin([1, 3])) & (df['板數'] < 5)
df.loc[mask_server, '階段'] = '發球搶攻段'

# Target player receiving: strokes 2 and 4 of rallies shorter than 5 strokes.
mask_receiver = (df['發球者姓名'] != target_player) & (df['板序'].isin([2, 4])) & (df['板數'] < 5)
df.loc[mask_receiver, '階段'] = '接發球搶攻段'

# Add the point-won column, 0 by default.
df['得分'] = 0

# Condition 1: the target player served and the result flag is 1.
mask_server_win = (df['發球者姓名'] == target_player) & (df['勝負'] == 1)
df.loc[mask_server_win, '得分'] = 1

# Condition 2: the target player did not serve and the result flag is 0.
mask_receiver_win = (df['發球者姓名'] != target_player) & (df['勝負'] == 0)
df.loc[mask_receiver_win, '得分'] = 1

# Report the filtering statistics.
print(f"目標選手: {target_player}")
print(f"篩選前資料筆數: {before_count}")
print(f"移除的資料筆數: {before_count - after_count}")
print(f"篩選後剩餘筆數: {after_count}")
# Save the result.
df.to_csv(output_file, index=False)

目標選手: 孫穎莎(RSH)
篩選前資料筆數: 1364
移除的資料筆數: 3
篩選後剩餘筆數: 1361


#編碼轉換回原名稱 -> 輸出為{target_player}_data_converted

In [10]:
import pandas as pd

# Technique labels for the serve (stroke 1).
_SERVE_TECHNIQUES = {
    1: '低拋式',
    2: '高拋式',
    3: '下蹲式',
}

# Technique labels for every stroke after the serve.
_STROKE_TECHNIQUES = {
    1: '拉球',
    2: '反拉',
    3: '殺球',
    4: '擰球',
    5: '快帶',
    6: '推擠',
    7: '挑撥',
    8: '搓球',
    'A': '擺短',
    'B': '擋球',
    'C': '放高',
    'D': '削球',
    'E': '側切',
}


def convert_technique(row):
    """Return the technique label for one stroke row.

    The code in ``row['技術']`` is converted to ``int`` when possible and left
    unchanged otherwise (letter codes, missing values).  Rows with
    ``row['板序'] == 1`` use the serve table, all other rows the stroke table.

    Args:
        row: Mapping-like row providing the '技術' and '板序' fields.

    Returns:
        The matching label, or '未知技術' when the code is not in the table.
    """
    tech = row['技術']

    try:
        tech = int(tech)
    except (TypeError, ValueError, OverflowError):
        # Not an integer code; look it up as-is.
        pass

    tech_map = _SERVE_TECHNIQUES if row['板序'] == 1 else _STROKE_TECHNIQUES
    return tech_map.get(tech, '未知技術')

# Speed labels by code.
_SPEED_LABELS = {
    1: '弱',
    2: '普通',
    3: '強',
}


def convert_speed(speed):
    """Return the speed label for a speed code.

    The code is converted to ``int`` when possible; values that cannot be
    converted are looked up unchanged.

    Returns:
        '弱', '普通' or '強' for codes 1-3, otherwise '未知速度'.
    """
    try:
        speed = int(speed)
    except (TypeError, ValueError, OverflowError):
        # Not an integer code; look it up as-is.
        pass

    return _SPEED_LABELS.get(speed, '未知速度')

# Spin labels by code.
_SPIN_LABELS = {
    1: '上旋',
    2: '下旋',
    3: '不旋',
    4: '側上旋',
    5: '側下旋',
}


def convert_spin(spin):
    """Return the spin label for a spin code.

    The code is converted to ``int`` when possible; values that cannot be
    converted are looked up unchanged.

    Returns:
        The label for codes 1-5, otherwise '未知旋轉'.
    """
    try:
        spin = int(spin)
    except (TypeError, ValueError, OverflowError):
        # Not an integer code; look it up as-is.
        pass

    return _SPIN_LABELS.get(spin, '未知旋轉')

# Length / placement labels by code; 'O' and 'T' mark out and net.
_LENGTH_LABELS = {
    1: '正手短球',
    2: '中路短球',
    3: '反手短球',
    4: '正手半出台',
    5: '中路半出台',
    6: '反手半出台',
    7: '正手長球',
    8: '中路長球',
    'A': '反手長球',
    'O': '出界',
    'T': '掛網',
}


def convert_length(length):
    """Return the length label for a length code.

    The code is converted to ``int`` when possible; letter codes and other
    values that cannot be converted are looked up unchanged.

    Returns:
        The label for codes 1-8, 'A', 'O' and 'T', otherwise '未知長短'.
    """
    try:
        length = int(length)
    except (TypeError, ValueError, OverflowError):
        # Not an integer code; look it up as-is.
        pass

    return _LENGTH_LABELS.get(length, '未知長短')


def determine_hand(row):
    """Return the hand label for one stroke row.

    Looks at the marker in ``row['正反手']``: a marker containing ``'F'`` gives
    '正手' (forehand), otherwise one containing ``'B'`` gives '反手'
    (backhand).  Markers with neither letter, and non-string values such as
    NaN or numbers, give '未知'.

    Args:
        row: Mapping-like row providing the '正反手' field.
    """
    hand_marker = row['正反手']

    # A missing or numeric cell cannot be searched with ``in``; treat it as
    # unknown instead of raising TypeError.
    if not isinstance(hand_marker, str):
        return '未知'

    if 'F' in hand_marker:
        return '正手'  # forehand
    if 'B' in hand_marker:
        return '反手'  # backhand
    return '未知'  # cannot be determined

# Load the per-player stroke data from CSV.
file_path = f"all_player_data/{target_player}_data.csv"
df = pd.read_csv(file_path)

# Technique and hand are converted by row-wise functions; the remaining
# coded columns are converted value by value.
df['技術'] = df.apply(convert_technique, axis=1)
for column_name, converter in (
    ('速度', convert_speed),
    ('旋轉', convert_spin),
    ('長短', convert_length),
):
    df[column_name] = df[column_name].apply(converter)
df['正反手'] = df.apply(determine_hand, axis=1)

# Save the converted result.
output_file = f"converted_data/{target_player}_data_converted.csv"
df.to_csv(output_file, index=False)

print(f"目標選手: {target_player}")
print(f"轉換資料筆數: {len(df)}")

目標選手: 孫穎莎(RSH)
轉換資料筆數: 1361
