In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# set traditional Chinese display
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei']

### Pre-process
* 整理成以下形式: hit_player_x, hit_player_y, partner_x, partner_y, ball_type
* 同時統計兩方的站位，沒有擊球者，球種欄填"未擊球"
* 整理成XML的形式

In [8]:
# Load datasets
data = pd.read_csv('../dataset/doubles 2/doubles/convert_shot.csv', encoding='utf-8')
rally_data = pd.read_csv('../dataset/doubles 2/doubles/rally.csv', encoding='utf-8')
set_data = pd.read_csv('../dataset/doubles 2/doubles/set.csv', encoding='utf-8')
match_data = pd.read_csv('../dataset/doubles 2/doubles/match.csv', encoding='utf-8')

# Merge rally_data with set_data to include match_id
rally_set_merged = rally_data.merge(set_data[['set_id', 'match_id']], on='set_id', how='left')

# Merge rally_set_merged with match_data to include player info
rally_set_match_merged = rally_set_merged.merge(match_data[['match_id', 'win_A', 'win_B', 'lose_C', 'lose_D']], on='match_id', how='left')

# Merge with the main convert_shot data
data_merged = data.merge(rally_set_match_merged[['rally_id', 'win_A', 'win_B', 'lose_C', 'lose_D']], on='rally_id', how='left')
data_merged = data_merged[data_merged['rally_id']>=1748]
# Initialize columns for hit_player and partner positions
data_merged['hit_player_x'] = None
data_merged['hit_player_y'] = None
data_merged['partner_x'] = None
data_merged['partner_y'] = None

# Function to assign hit_player and partner coordinates based on player info
def assign_player_positions(row):
    if row['player'] == row['win_A']:
        row['hit_player_x'] = row['player_A_x']
        row['hit_player_y'] = row['player_A_y']
        row['partner_x'] = row['player_B_x']
        row['partner_y'] = row['player_B_y']
    elif row['player'] == row['win_B']:
        row['hit_player_x'] = row['player_B_x']
        row['hit_player_y'] = row['player_B_y']
        row['partner_x'] = row['player_A_x']
        row['partner_y'] = row['player_A_y']
    elif row['player'] == row['lose_C']:
        row['hit_player_x'] = row['player_C_x']
        row['hit_player_y'] = row['player_C_y']
        row['partner_x'] = row['player_D_x']
        row['partner_y'] = row['player_D_y']
    elif row['player'] == row['lose_D']:
        row['hit_player_x'] = row['player_D_x']
        row['hit_player_y'] = row['player_D_y']
        row['partner_x'] = row['player_C_x']
        row['partner_y'] = row['player_C_y']
    if row['hit_player_y'] < 67:
        row['hit_player_y'] = 134 - row['hit_player_y']
        row['hit_player_x'] = 61 - row['hit_player_x']
        row['partner_x'] = 61 - row['partner_x']
        row['partner_y'] = 134 - row['partner_y']
    return row

# Apply the function to each row
data_merged = data_merged.apply(assign_player_positions, axis=1)
data_merged.drop(['start_frame_num','end_frame_num','time','return_height'], axis=1, inplace=True)
# Save the updated data
data_merged.to_csv('../output/Week12/updated_convert_shot.csv', index=False, encoding='utf-8')

# Check the result
print(data_merged.head())


       shot_id  rally_id  shot_num  player ball_type  backhand  player_A_x  \
24405    24406      1748         1      52       發短球         1   21.872319   
24406    24407      1748         2      54        推球         1   23.457616   
24407    24408      1749         1      52       發短球         1   26.085880   
24408    24409      1749         2      55        推球         0   25.663693   
24409    24410      1749         3      53        平球         1   26.340867   

       player_A_y  player_B_x  player_B_y  ...  hit_height  flaw  win_A  \
24405   96.543075   35.871442  102.365643  ...           2     0     54   
24406   92.645605   35.871442  102.365643  ...           1     0     54   
24407  103.273571   44.364038   95.249722  ...           2     0     54   
24408  103.732400   42.034566   86.793324  ...           2     0     54   
24409  104.655055   41.686918   88.030965  ...           1     0     54   

       win_B  lose_C  lose_D  hit_player_x  hit_player_y  partner_x  \
24405    

In [11]:
def assign_player_positions(row, prev_hit_player, current_hit_player):
    hit = prev_hit_player[0]
    partner = prev_hit_player[1]
    c_hit = current_hit_player[0]
    c_partner = current_hit_player[1]
    return row[f'player_{c_hit}_x'], row[f'player_{c_hit}_y'], row[f'player_{c_partner}_x'], row[f'player_{c_partner}_y'], row[f'player_{hit}_x'], row[f'player_{hit}_y'], row[f'player_{partner}_x'], row[f'player_{partner}_y']


In [12]:
def player(row):
    if row['player'] == row['win_A']:
        return ["A", "B"]
    elif row['player'] == row['win_B']:
        return ["B", "A"]
    elif row['player'] == row['lose_C']:
        return ["C", "D"]
    elif row['player'] == row['lose_D']:
        return ["D", "C"]
    else:
        return None

In [15]:
# Load the datasets
# convert_shot.csv: The result after running transform.py
data = pd.read_csv('../output/Week12/updated_convert_shot.csv', encoding='utf-8')
rally_data = pd.read_csv('../dataset/doubles 2/doubles/rally.csv', encoding='utf-8')
set_data = pd.read_csv('../dataset/doubles 2/doubles/set.csv', encoding='utf-8')

# Filter players
players_filtered = data

# Get the maximum shot number for each rally
max_shots_per_rally = players_filtered.groupby('rally_id')['shot_num'].max().reset_index()
max_shots_per_rally.columns = ['rally_id', 'max_shot_num']

# Merge the max_shots_per_rally back to the players_filtered
players_filtered = players_filtered.merge(max_shots_per_rally, on='rally_id', how='left')

# Merge rally_data with set_data to include set_win information
rally_set_merged = rally_data.merge(set_data[['set_id', 'set_win']], on='set_id', how='left')

# Initialize the list to collect rows
shot_list = []


# Initialize previous shot and position variables
prev_hit_player = None

# Iterate over the filtered DataFrame rows
for idx, row in players_filtered.iterrows():
    if row['shot_num'] == 1 or row['max_shot_num'] <= 4:
        # Update previous shot position for the first valid shot
        prev_hit_player = player(row)
        continue
    
    # Check ball type and adjust as necessary
    if row['ball_type'] in ['擋小球', '勾球', '放小球', '小平球']:
        row['ball_type'] = '網前小球'
    elif row['ball_type'] == '防守回挑':
        row['ball_type'] = '挑球'
    elif row['ball_type'] in ['防守回抽', '後場抽平球']:
        row['ball_type'] = '平球'
    elif row['ball_type'] == '過度切球':
        row['ball_type'] = '切球'
    elif row['ball_type'] in ['推球', '撲球']:
        row['ball_type'] = '推撲球'
        
    hit_player_x, hit_player_y, partner_x, partner_y, oppo_hit_x, oppo_hit_y, oppo_partner_x, oppo_partner_y = assign_player_positions(row, prev_hit_player, player(row))
    
    # Prepare the new row
    newrow_hit = {
        "shot_id": row["shot_id"],
        "hit_player_x": hit_player_x,
        "hit_player_y": hit_player_y,
        "partner_x": partner_x,
        "partner_y": partner_y,
        "ball_type": row["ball_type"],
        "score": 0, 
        "score_by_error": 0
    }
    newrow_opponent = {
        "shot_id": row["shot_id"],
        "hit_player_x": oppo_hit_x,
        "hit_player_y": oppo_hit_y,
        "partner_x": oppo_partner_x,
        "partner_y": oppo_partner_y,
        "ball_type": "未擊球",
        "score": 0,
        "score_by_error": 0
    }

    # Adjust for upper court if necessary
    if newrow_hit["hit_player_y"] > 67:
        newrow_hit["hit_player_x"] = 61 - newrow_hit["hit_player_x"]
        newrow_hit["hit_player_y"] = 134 - newrow_hit["hit_player_y"]
        newrow_hit["partner_x"] = 61 - newrow_hit["partner_x"]
        newrow_hit["partner_y"] = 134 - newrow_hit["partner_y"]
        
    if newrow_opponent["hit_player_y"] > 67:
        newrow_opponent["hit_player_x"] = 61 - newrow_opponent["hit_player_x"]
        newrow_opponent["hit_player_y"] = 134 - newrow_opponent["hit_player_y"]
        newrow_opponent["partner_x"] = 61 - newrow_opponent["partner_x"]
        newrow_opponent["partner_y"] = 134 - newrow_opponent["partner_y"]

    # Check if the current shot is one of the last two shots in the rally
    if row['shot_num'] >= row['max_shot_num'] - 2:
        rally_info = rally_set_merged[rally_set_merged['rally_id'] == row['rally_id']].iloc[0]
        set_win = rally_info['set_win']
        score_team = rally_info['score_team']
        if score_team == "NULL":
            continue
            
        if "對手" in str(rally_info['score_reason']):
            newrow_hit["score_by_error"] = 1
            newrow_opponent["score_by_error"] = 1

        # Determine which team won the point
        if (row['shot_num'] == row['max_shot_num']) or (row['shot_num'] == row['max_shot_num'] - 2):
            if ("對手" not in str(rally_info['score_reason'])):
                newrow_hit["score"] = 1
                newrow_opponent["score"] = -1
            else:
                newrow_hit["score"] = -1
                newrow_opponent["score"] = 1
            shot_list.append(newrow_hit)
            shot_list.append(newrow_opponent)
        else:
            if ("對手" not in str(rally_info['score_reason'])):
                newrow_hit["score"] = -1
                newrow_opponent["score"] = 1
            else:
                newrow_hit["score"] = 1
                newrow_opponent["score"] = -1
            shot_list.append(newrow_hit)
            shot_list.append(newrow_opponent)
        

    # Update previous shot and position after each shot for the next iteration
    prev_hit_player = player(row)

# Create DataFrames for win and lose shots
total_df = pd.DataFrame(shot_list)

print("size of total_df: ", total_df.shape[0])

# Save to CSV
total_df.to_csv('../output/Week12/hit.csv', index=False, encoding='utf-8')


size of total_df:  2310


In [17]:
input_file = total_df
# Open a file to write the output in XML format
output_file = '../output/Week12/hit.txt'


with open(output_file, 'w') as f:
    # Write the header
    f.write('<data>\n')
    f.write('<meta>\n')
    f.write('<attributes>\n')
    
    # Write attributes - Adjust this part to match your attributes
    f.write('<attribute><type>Numerical</type><name>hit_player_x</name></attribute>\n')
    f.write('<attribute><type>Numerical</type><name>hit_player_y</name></attribute>\n')
    f.write('<attribute><type>Numerical</type><name>partner_x</name></attribute>\n')
    f.write('<attribute><type>Numerical</type><name>partner_y</name></attribute>\n')
    f.write('<attribute><type>Categorical</type><name>Categorical_1</name></attribute>\n')
    f.write('</attributes>\n')
    f.write('<hierarchy>\n')
    
    # Example hierarchy - Adjust as needed based on actual categorical values
    f.write('<Categorical_1 name="opponent_ball_type">\n')
    f.write('<![CDATA[<殺球></殺球><挑球></挑球><網前小球></網前小球><推撲球></推撲球><長球></長球><平球></平球><切球></切球><未擊球></未擊球>]]>\n')
    f.write('</Categorical_1>\n')
    f.write('</hierarchy>\n')
    f.write('</meta>\n')
    f.write('<class>T</class>\n')
    f.write('<dataset>\n')
    for index, row in data.iterrows():
        if row.isnull().values.any():
            input_file.drop(index, inplace=True)
            continue
        line = str(row['hit_player_x'])+'\t'+str(row['hit_player_y']) +'\t'+str(row['partner_x'])+'\t'+str(row['partner_y']) +'\t'+str(row['ball_type']) + '\n'
        f.write(line)
    # Close the dataset tag
    f.write('</dataset>\n')
    f.write('</data>\n')
total_df = input_file
print(f"Data has been successfully written to {output_file}")


Data has been successfully written to ../output/Week12/hit.txt


### Clustering
* 用ClicoT跑

In [None]:
# Runs on ClicoT and get result from "ClicoT.csv"
cluster_result = pd.read_csv('../output/Week12/ClicoT.csv', encoding='utf-8')
total_df.reset_index(drop=True, inplace=True)
total_df['cluster'] = cluster_result['cluster']

### Evaluation
* 畫出每一群的站位+球種分布
* 統計最後幾拍的站位變化

In [30]:
# plot the clustering result
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
ball_list = ['殺球', '挑球', '網前小球', '推撲球', '長球', '平球', '切球', '未擊球']
for cluster in range(0, total_df['cluster'].nunique()):
    plt.figure(figsize=(9.15, 10.05))
    cluster_total_df = total_df[total_df['cluster'] == cluster]
    ball_type_count = cluster_total_df['ball_type'].value_counts()
    ball_type_count = ball_type_count.to_dict()
    if len(cluster_total_df) > 60:
        sample_cluster_total_df = cluster_total_df.sample(n=60, random_state=1)
    else:
        sample_cluster_total_df = cluster_total_df

    plt.scatter(sample_cluster_total_df['hit_player_x'], sample_cluster_total_df['hit_player_y'], color='r', label=f'Cluster {cluster} - Hit player')
    plt.plot([sample_cluster_total_df['hit_player_x'], sample_cluster_total_df['partner_x']], [sample_cluster_total_df['hit_player_y'], 
        sample_cluster_total_df['partner_y']], color='y', label=f'Cluster {cluster} - Partner line') 
    plt.scatter(sample_cluster_total_df['partner_x'], sample_cluster_total_df['partner_y'], color='b', label=f'Cluster {cluster} - Partner')
    plt.plot([sample_cluster_total_df['hit_player_x'].mean(), sample_cluster_total_df['partner_x'].mean()], [sample_cluster_total_df['hit_player_y'].mean(), 
        sample_cluster_total_df['partner_y'].mean()], color='g', label=f'Cluster {cluster} - Mean line', linewidth=5)

    plt.xlabel('X Coordinate')
    plt.ylabel('Y Coordinate')
    plt.xlim(0, 61) 
    plt.ylim(0, 67)
    plt.savefig(f'../output/Week12/cluster_{cluster}_positions.jpg')
    plt.close()

    plt.bar(ball_list, [ball_type_count.get(ball, 0) for ball in ball_list])
    plt.xlabel('Ball Type')
    plt.ylabel('Count')
    plt.savefig(f'../output/Week12/cluster_{cluster}_ball_type.jpg')
    plt.close()

In [53]:
filter_last_hit_df = total_df[total_df['score_by_error'] == 0]
filter_last_hit_df = filter_last_hit_df.drop(filter_last_hit_df[(filter_last_hit_df['shot_id']>=27996) & (filter_last_hit_df['shot_id']<=27998)].index)
filter_last_hit_df = filter_last_hit_df.drop(filter_last_hit_df[(filter_last_hit_df['shot_id']>=25228) & (filter_last_hit_df['shot_id']<=25230)].index)
filter_last_hit_df.to_csv('../output/Week12/filter_hit.csv', index=False, encoding='utf-8')

In [45]:
win_position = []
lose_position = []
for idx in range(0, filter_last_hit_df.shape[0], 6):
    newrow_win = {
        'position': None,
        'shot_id': filter_last_hit_df.iloc[idx+1]['shot_id']
    }
    newrow_lose = {
        'position': None,
        'shot_id': int(filter_last_hit_df.iloc[idx+1]['shot_id'])
    }
    win = 'hit'+str(int(filter_last_hit_df.iloc[idx]['cluster']))+'_oppo'+str(int(filter_last_hit_df.iloc[idx+3]['cluster']))+'_hit'+str(int(filter_last_hit_df.iloc[idx+4]['cluster']))
    lose = 'oppo_'+str(int(filter_last_hit_df.iloc[idx+1]['cluster']))+'_hit'+str(int(filter_last_hit_df.iloc[idx+2]['cluster']))+'_oppo'+str(int(filter_last_hit_df.iloc[idx+5]['cluster']))
    newrow_win['position'] = win
    win_position.append(newrow_win)
    newrow_lose['position'] = lose
    lose_position.append(newrow_lose)
win_position_df = pd.DataFrame(win_position)
lose_position_df = pd.DataFrame(lose_position)


pd.set_option('display.max_rows', None)

print(win_position_df.shape[0], "unique win positions:", win_position_df['position'].nunique())
print(win_position_df['position'].value_counts())
print(lose_position_df.shape[0], "unique lose positions:", lose_position_df['position'].nunique())
print(lose_position_df['position'].value_counts())  

113 unique win positions: 72
hit6_oppo6_hit8       11
hit8_oppo8_hit8        5
hit1_oppo1_hit8        5
hit8_oppo12_hit12      4
hit10_oppo10_hit10     4
hit7_oppo12_hit12      3
hit6_oppo10_hit6       2
hit12_oppo12_hit12     2
hit0_oppo0_hit8        2
hit5_oppo5_hit4        2
hit0_oppo0_hit9        2
hit5_oppo5_hit11       2
hit10_oppo10_hit6      2
hit6_oppo6_hit6        2
hit7_oppo7_hit12       2
hit1_oppo1_hit1        2
hit4_oppo12_hit12      2
hit12_oppo12_hit10     2
hit10_oppo10_hit8      2
hit8_oppo8_hit12       2
hit0_oppo0_hit7        2
hit10_oppo6_hit0       1
hit8_oppo8_hit10       1
hit10_oppo10_hit12     1
hit9_oppo4_hit8        1
hit7_oppo8_hit9        1
hit6_oppo0_hit0        1
hit10_oppo5_hit11      1
hit6_oppo6_hit0        1
hit11_oppo8_hit12      1
hit12_oppo8_hit12      1
hit4_oppo10_hit8       1
hit6_oppo6_hit7        1
hit5_oppo5_hit5        1
hit5_oppo5_hit12       1
hit5_oppo8_hit12       1
hit6_oppo6_hit1        1
hit10_oppo4_hit4       1
hit4_oppo4_hit4      

In [54]:
win_position = []
lose_position = []

# 5 -> 0, 11 -> 9
filter_last_hit_df['cluster'] = filter_last_hit_df['cluster'].replace({5: 0, 11: 9})

for idx in range(0, filter_last_hit_df.shape[0], 6):
    newrow_win = {
        'position': None,
        'shot_id': filter_last_hit_df.iloc[idx + 1]['shot_id']
    }
    newrow_lose = {
        'position': None,
        'shot_id': int(filter_last_hit_df.iloc[idx + 1]['shot_id'])
    }
    
    win = 'hit' + str(int(filter_last_hit_df.iloc[idx]['cluster'])) + \
          '_oppo' + str(int(filter_last_hit_df.iloc[idx + 3]['cluster'])) + \
          '_hit' + str(int(filter_last_hit_df.iloc[idx + 4]['cluster']))
    
    lose = 'oppo_' + str(int(filter_last_hit_df.iloc[idx + 1]['cluster'])) + \
           '_hit' + str(int(filter_last_hit_df.iloc[idx + 2]['cluster'])) + \
           '_oppo' + str(int(filter_last_hit_df.iloc[idx + 5]['cluster']))
    
    newrow_win['position'] = win
    win_position.append(newrow_win)
    
    newrow_lose['position'] = lose
    lose_position.append(newrow_lose)

# Convert lists to DataFrames
win_position_df = pd.DataFrame(win_position)
lose_position_df = pd.DataFrame(lose_position)

# Display DataFrame settings
pd.set_option('display.max_rows', None)

# Output results
print(win_position_df.shape[0], "unique win positions:", win_position_df['position'].nunique())
print(win_position_df['position'].value_counts())
print(lose_position_df.shape[0], "unique lose positions:", lose_position_df['position'].nunique())
print(lose_position_df['position'].value_counts())


113 unique win positions: 67
hit6_oppo6_hit8       11
hit1_oppo1_hit8        5
hit8_oppo8_hit8        5
hit8_oppo12_hit12      4
hit0_oppo0_hit9        4
hit10_oppo10_hit10     4
hit0_oppo0_hit7        3
hit0_oppo0_hit12       3
hit7_oppo12_hit12      3
hit0_oppo0_hit8        2
hit7_oppo7_hit12       2
hit12_oppo12_hit12     2
hit0_oppo0_hit4        2
hit10_oppo10_hit8      2
hit6_oppo10_hit6       2
hit1_oppo1_hit1        2
hit4_oppo12_hit12      2
hit6_oppo6_hit6        2
hit0_oppo0_hit0        2
hit12_oppo12_hit10     2
hit8_oppo8_hit12       2
hit10_oppo10_hit6      2
hit7_oppo8_hit9        1
hit10_oppo6_hit0       1
hit9_oppo9_hit12       1
hit6_oppo6_hit0        1
hit4_oppo10_hit8       1
hit6_oppo0_hit0        1
hit12_oppo8_hit12      1
hit10_oppo10_hit0      1
hit10_oppo10_hit12     1
hit6_oppo6_hit7        1
hit10_oppo0_hit9       1
hit6_oppo6_hit1        1
hit10_oppo4_hit4       1
hit4_oppo4_hit4        1
hit9_oppo8_hit12       1
hit4_oppo9_hit4        1
hit8_oppo8_hit10     