In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Use a font with Traditional Chinese glyphs so Chinese labels render in plots.
plt.rcParams.update({'font.sans-serif': ['Microsoft JhengHei']})

In [None]:
# Load the shots that were already filtered in Week2.ipynb.
df = pd.read_csv("./filtered_shot.csv")

# True where player A and the hit position fall on the same side of 67.
# NOTE(review): this compares x coordinates against 67, whereas the mirroring
# step below treats y > 67 as the far half and mirrors x within 0-61.
# Confirm which axis was intended here.
same_sign = (df['player_A_x'] - 67) * (df['hit_x'] - 67) > 0

# Fine-grained shot labels merged into coarser categories.
# Labels that are not listed here are kept unchanged.
BALL_TYPE_GROUPS = {
    '擋小球': '網前小球',
    '勾球': '網前小球',
    '放小球': '網前小球',
    '小平球': '網前小球',
    '防守回挑': '挑球',
    '防守回抽': '平球',
    '後場抽平球': '平球',
    '過度切球': '切球',
    '推球': '推撲球',
    '撲球': '推撲球',
}

OUTPUT_COLUMNS = ['player1_x', 'player1_y', 'player2_x', 'player2_y', 'ball_type']


def normalize_shot(row, use_pair_ab):
    """Turn one raw shot row into a normalized record.

    Parameters
    ----------
    row : pandas.Series
        One row of the filtered shot table. Reads the ``player_<A|B|C|D>_<x|y>``
        columns of the selected pair plus ``player`` and ``ball_type``.
    use_pair_ab : bool
        When true the A/B pair is kept, otherwise the C/D pair.

    Returns
    -------
    dict
        Keys ``player1_x``, ``player1_y``, ``player2_x``, ``player2_y`` and
        ``ball_type``.
    """
    first, second = ('A', 'B') if use_pair_ab else ('C', 'D')
    positions = [
        (row[f'player_{first}_x'], row[f'player_{first}_y']),
        (row[f'player_{second}_x'], row[f'player_{second}_y']),
    ]

    # Each player is mirrored independently: a position with y > 67 is
    # reflected through the court centre (x -> 61 - x, y -> 134 - y) so that
    # every stored position ends up in the same half.
    positions = [(61 - x, 134 - y) if y > 67 else (x, y) for x, y in positions]

    # When the 'player' column equals 33 the two roles are swapped.
    # Presumably 33 marks the pair's second player as the hitter, so that
    # player1 is always the hitter; confirm against the dataset.
    if row['player'] == 33:
        positions.reverse()

    (p1_x, p1_y), (p2_x, p2_y) = positions
    ball_type = row['ball_type']
    return {
        'player1_x': p1_x,
        'player1_y': p1_y,
        'player2_x': p2_x,
        'player2_y': p2_y,
        'ball_type': BALL_TYPE_GROUPS.get(ball_type, ball_type),
    }


# Collect plain dicts and build the frame once. DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic in any version.
records = [
    normalize_shot(row, use_pair_ab)
    for (_, row), use_pair_ab in zip(df.iterrows(), same_sign)
]

# Explicit columns keep the header stable even when there are no rows.
new_df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)

# Save the processed data for the clustering cells.
new_df.to_csv('processed_shot.csv', index=False)

In [None]:
data = pd.read_csv('./processed_shot.csv')

# Select features: the x/y positions of both players.
position_columns = ['player1_x', 'player1_y', 'player2_x', 'player2_y']
features = data[position_columns]

# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Elbow method: record the within-cluster sum of squared errors (inertia)
# for k = 1..10 on the standardized features.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(features_scaled)
    sse.append(kmeans.inertia_)

# Plot SSE against k, save the figure, then display it.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 11), sse, marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('SSE')
ax.set_title('Elbow Method For Optimal k')
fig.savefig('elbow_plot.png')
plt.show()

In [None]:
# Final clustering with the number of clusters read off the elbow plot.
n = 3  # Best n from elbow method

# Fixed seed (the same one the elbow search uses) so that cluster labels, and
# the per-cluster output files named after them, are reproducible across runs.
kmeans = KMeans(n_clusters=n, random_state=42)

# Cluster in the standardized feature space that the elbow search evaluated,
# so the chosen n applies to the same data it was selected on.
data['cluster'] = kmeans.fit_predict(features_scaled)

In [None]:
# Draw one position figure per cluster and save each to its own JPEG.
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

for cluster in range(n):
    members = data.loc[data['cluster'] == cluster]
    hitter_x, hitter_y = members['player1_x'], members['player1_y']
    partner_x, partner_y = members['player2_x'], members['player2_y']

    fig, ax = plt.subplots(figsize=(9.15, 10.05))
    # player1 is drawn in red and player2 in blue; one green segment joins
    # the two players of every shot.
    ax.scatter(hitter_x, hitter_y, color='r', label=f'Cluster {cluster} - Hit player')
    ax.plot([hitter_x, partner_x], [hitter_y, partner_y], color='g', label=f'Cluster {cluster} - Partner line')
    ax.scatter(partner_x, partner_y, color='b', label=f'Cluster {cluster} - Partner')
    ax.set_xlabel('X Coordinate')
    ax.set_ylabel('Y Coordinate')
    ax.set_xlim(0, 61)
    ax.set_ylim(0, 67)
    fig.savefig(f'cluster_{cluster}_positions.jpg')
    plt.close(fig)

In [None]:
# Overlay every cluster in a single figure: one colour per cluster,
# dots for player1 and crosses for player2.
fig, ax = plt.subplots(figsize=(9.15, 10.05))
for cluster in range(n):
    members = data.loc[data['cluster'] == cluster]
    shade = colors[cluster % len(colors)]  # wrap around if n exceeds the palette
    ax.scatter(members['player1_x'], members['player1_y'], color=shade, label=f'Cluster {cluster} - Hit player')
    ax.scatter(members['player2_x'], members['player2_y'], color=shade, marker='x', label=f'Cluster {cluster} - Partner')
ax.set_xlabel('X Coordinate')
ax.set_ylabel('Y Coordinate')
ax.set_xlim(0, 61)
ax.set_ylim(0, 67)
ax.legend()
fig.savefig('clusters_positions.jpg')
plt.close(fig)

In [None]:
# Count how many shots of each ball type fall into each cluster
# (rows: cluster, columns: ball type, missing combinations filled with 0).
cluster_ball_type_counts = data.groupby(['cluster', 'ball_type']).size().unstack(fill_value=0)

# Save the clustered shots and the count table.
data.to_csv('clustered_shot.csv', index=False, encoding='utf-8')
cluster_ball_type_counts.to_csv('cluster_ball_type_counts.csv', encoding='utf-8')

# Order the columns by overall frequency across all clusters.
# A row-wise apply(sort_values, axis=1) cannot sort this table: pandas
# re-aligns the per-row results to one shared column order when it rebuilds
# the frame, so each row's ordering is discarded.
overall_order = (
    cluster_ball_type_counts.sum(axis=0)
    .sort_values(ascending=False, kind='stable')
    .index
)
sorted_cluster_ball_type_counts = cluster_ball_type_counts.reindex(columns=overall_order)

# Show the count table.
print("Cluster ball type counts sorted by frequency:")
print(sorted_cluster_ball_type_counts)

# Shared upper y-limit so the per-cluster charts are directly comparable.
# Bars taller than this value are clipped.
Y_AXIS_MAX = 850

# Plot the ball type distribution of each cluster and save one PNG per cluster.
for cluster in range(n):
    # A cluster is missing from the table when none of its rows has a
    # ball_type (groupby drops missing keys); skip it explicitly instead of
    # hiding every possible error behind a bare except.
    if cluster not in cluster_ball_type_counts.index:
        print(f"Cluster {cluster} has no ball type counts; skipping its chart.")
        continue

    # Sort this cluster's counts on their own so the bars really are in
    # descending order.
    sorted_counts = cluster_ball_type_counts.loc[cluster].sort_values(ascending=False)

    fig = plt.figure(figsize=(10, 6))
    try:
        sorted_counts.plot(kind='bar', color='skyblue')
        plt.title('Ball Type Distribution')
        plt.xlabel('Ball Type')
        plt.ylabel('Frequency')
        plt.ylim(0, Y_AXIS_MAX)  # set the y-axis range

        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'cluster_{cluster}_ball_type_distribution.png')
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)