In [42]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler


# Paths for datasets
datasets_folder = 'Datasets'
combined_dataset_filename = 'long_vid_entire_dataset.csv'
combined_dataset_path = os.path.join(datasets_folder, combined_dataset_filename)

# Step 1: Load the combined dataset
try:
    combined_df = pd.read_csv(combined_dataset_path)
    print(f"Successfully loaded dataset: {combined_dataset_path}")
except FileNotFoundError:
    raise FileNotFoundError(f"Combined dataset not found at: {combined_dataset_path}")
except pd.errors.EmptyDataError:
    print("Error: The file is empty.")
    exit()
except pd.errors.ParserError:
    print("Error: The file could not be parsed.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    exit()

# Step 2: Verify required columns exist
required_columns = ['shot_id', 'is_shot', 'frame']
missing_columns = [col for col in required_columns if col not in combined_df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns in the dataset: {missing_columns}")
else:
    print("All required columns are present.")

df = combined_df.copy()

Successfully loaded dataset: Datasets\long_vid_entire_dataset.csv
All required columns are present.


In [43]:
### Split ball_position column

# Ensure 'sports_ball_positions' column exists
if 'sports_ball_positions' in df.columns:
    # Split the 'sports_ball_positions' column into two new columns
    df[['ball_pos_x', 'ball_pos_y']] = df['sports_ball_positions'].str.split(',', expand=True)

    # Convert the new columns to float type for numerical operations
    df['ball_pos_x'] = pd.to_numeric(df['ball_pos_x'], errors='coerce')
    df['ball_pos_y'] = pd.to_numeric(df['ball_pos_y'], errors='coerce')

    # Drop the original 'sports_ball_positions' column
    df.drop(columns=['sports_ball_positions'], inplace=True)


print("Split the ball_postiion column into two -> ball_pos_x and ball_pos_y")

Split the ball_postiion column into two -> ball_pos_x and ball_pos_y


In [44]:
### INTERPOLATE VALUES

def optimized_interpolate_zeros(data, columns):
    
    df_interpolated = data.copy()
    df_interpolated[columns] = df_interpolated[columns].replace(0, np.nan)
    df_interpolated[columns] = df_interpolated[columns].interpolate(method='linear', limit_direction='both')
    return df_interpolated

# Apply the function to all columns in the DataFrame except specified columns
columns_to_exclude = ['frame', 'video', 'is_shot', 'shot_id', 'shot_invalid', 'make']
columns_to_interpolate = [col for col in df.columns if col not in columns_to_exclude]

# Check if columns_to_interpolate is not empty
if not columns_to_interpolate:
    print("No columns available for interpolation after excluding specified columns.")
    df = df.copy()
else:
    df = optimized_interpolate_zeros(df, columns_to_interpolate)
    print("Interpolation completed successfully.")


Interpolation completed successfully.


In [45]:
### Compute Start and End frames of shot motions


#Step 3: Identify all (start_frame, end_frame) pairs and compute shot_id_length for each
sequence_info = []  # List to store tuples of (shot_id, start_frame, end_frame, shot_id_length)

unique_shot_ids = df['shot_id'].unique()

for shot_id in unique_shot_ids:
    # Extract rows for the current shot_id
    shot_df = df[df['shot_id'] == shot_id].copy().reset_index(drop=True)

    # Identify the start and end indices of each 'is_shot' == True sequence
    shot_df['is_shot_shift'] = shot_df['is_shot'].shift(1, fill_value=0)
    shot_df['is_shot_next'] = shot_df['is_shot'].shift(-1, fill_value=0)

    # A sequence starts where 'is_shot' changes from 0 to 1
    start_indices = shot_df[(shot_df['is_shot'] == 1) & (shot_df['is_shot_shift'] == 0)].index.tolist()
    # A sequence ends where 'is_shot' changes from 1 to 0
    end_indices = shot_df[(shot_df['is_shot'] == 1) & (shot_df['is_shot_next'] == 0)].index.tolist()

    # Ensure that every start has a corresponding end
    if len(start_indices) != len(end_indices):
        # Handle cases where a sequence starts but doesn't end
        if len(start_indices) > len(end_indices):
            end_indices.append(len(shot_df) - 1)

    # Iterate through each sequence
    for start_idx, end_idx in zip(start_indices, end_indices):
        # Get the actual frame numbers
        start_frame = shot_df.loc[start_idx, 'frame']
        end_frame = shot_df.loc[end_idx, 'frame']

        # Calculate shot_id_length
        shot_id_length = end_frame - start_frame

        # Append the information
        sequence_info.append((shot_id, start_frame, end_frame, shot_id_length))

# Step 4: Determine max_length as the maximum shot_id_length across all sequences
shot_id_lengths = [info[3] for info in sequence_info]
print(shot_id_lengths)

# Compute Q1 (25th percentile) and Q3 (75th percentile)
Q1 = np.percentile(shot_id_lengths, 25)
Q3 = np.percentile(shot_id_lengths, 75)

# Calculate IQR
IQR = Q3 - Q1

# Determine bounds for non-outlier lengths
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter shot_id_lengths to exclude outliers
filtered_lengths = [length for length in shot_id_lengths if lower_bound <= length <= upper_bound]

# Calculate max_length as the maximum of the filtered lengths
max_length = max(filtered_lengths) if filtered_lengths else 0
# Step 5: Calculate the length of each subdataset based on the criteria
# Create a DataFrame to store the results
subdataset_info = []  # List to store tuples of (shot_id, adjusted_start_frame, end_frame, subdataset_length)

for (shot_id, start_frame, end_frame, shot_id_length) in sequence_info:
    # Calculate the adjusted start frame
    adjusted_start_frame = end_frame - max_length
    if adjusted_start_frame < 0:
        adjusted_start_frame = 0  # Adjust based on your frame numbering (0 or 1)

    # Calculate the length of the subdataset
    subdataset_length = end_frame - adjusted_start_frame + 1  # +1 if inclusive

    # Append the information
    subdataset_info.append((shot_id, adjusted_start_frame, end_frame, subdataset_length))


# Convert the list to a DataFrame for better visualization and analysis
subdataset_df = pd.DataFrame(subdataset_info, columns=['shot_id', 'adjusted_start_frame', 'end_frame', 'frames'])


# Optional: Display first few rows of the subdataset_info
print("\nSample Subdataset Information of First 30 Motions:")
#print(subdataset_df.head(30))



[110, 18, 40, 47, 45, 26, 40, 34, 32, 36, 34, 33, 35, 85, 39, 15, 35, 52, 37, 39, 41, 35, 38, 37, 41, 37]

Sample Subdataset Information of First 30 Motions:


In [46]:
### Create subdatasets

subdatasets = []
for row in subdataset_df.itertuples(index=False):
    start_frame = row.adjusted_start_frame
    end_frame = row.end_frame
    shot_id = row.shot_id

    sub_df = df[
        (df['frame'] >= start_frame) &  # Corrected variable name
        (df['frame'] <= end_frame)
    ].copy()

    # Append the subset to subdatasets list
    subdatasets.append(sub_df)

for df in subdatasets:
    #Drop Un-needed columns
    df.drop(columns=['video', 'is_shot', 'shot_invalid'], inplace=True)


print("Created Datasets of Each Shot Motion and Dropped Un-needed Columns")

Created Datasets of Each Shot Motion and Dropped Un-needed Columns


In [47]:
made = []
missed = []

for df in standardized_dfs:
    if (df["make"] == True).any():  # If at least one row is True
        # Make all values in the "make" column set to True
        df["make"] = 1
        
        # Find the maximum value in the "shot_id" column
        max_shot_id = df["shot_id"].max()

        # Set all values in the "shot_id" column to the maximum value
        df["shot_id"] = max_shot_id

        # Append the modified DataFrame to the 'made' list
        made.append(df)
        
    elif (df["make"] == False).any():  # If at least one row is False
        # Make all values in the "make" column set to False
        df["make"] = 0

        # Find the maximum value in the "shot_id" column
        max_shot_id = df["shot_id"].max()

        # Set all values in the "shot_id" column to the maximum value
        df["shot_id"] = max_shot_id

        # Append the modified DataFrame to the 'missed' list
        missed.append(df)

Now, there are two lists:

Made list: All shot motions with Good form.

Missed list: All shot motions with Bad form.

Goal: Combine these two lists into a single big dataframe: "data_df"

First, lets visualize:

In [48]:
show_plots = False

if show_plots:
    # -----------------------------------------------
    # Step 1: Prepare Labels and Colors
    # -----------------------------------------------
    
    # Generate labels for each DataFrame in 'made' and 'missed'
    made_labels = [f'Made {i+1}' for i in range(len(made))]
    missed_labels = [f'Missed {i+1}' for i in range(len(missed))]
    
    # Assign colors: green for 'made' and red for 'missed'
    made_colors = ['green'] * len(made)
    missed_colors = ['red'] * len(missed)
    
    # Combine the lists for easier processing
    all_dfs = made + missed
    all_labels = made_labels + missed_labels
    all_colors = made_colors + missed_colors
    
    # -----------------------------------------------
    # Step 2: Plotting All DataFrames
    # -----------------------------------------------
    
    # Check if there are any DataFrames to plot
    if not all_dfs:
        raise ValueError("Both 'made' and 'missed' lists are empty. Please check your data processing steps.")
    
    # Select only numeric columns from the first DataFrame (assuming all have the same columns)
    numeric_columns = all_dfs[0].select_dtypes(include=['number']).columns
    
    # Loop through each numeric feature and plot for all datasets
    for feature in numeric_columns:
        plt.figure(figsize=(24, 12))
        
        for df, label, color in zip(all_dfs, all_labels, all_colors):
            x = df.index
            y = df[feature]
            plt.plot(
                x, y,
                marker='o',
                linestyle='--',
                color=color,
                label=label
            )
        
        # Set the title, labels, and grid
        plt.title(f'{feature} Over Index', fontsize=16)
        plt.xlabel('Frame', fontsize=14)
        plt.ylabel(feature, fontsize=14)
        plt.grid(True, which='both', linestyle='--', linewidth=0.5)
        
        # Handle the legend to avoid duplicate labels
        handles, labels_ = plt.gca().get_legend_handles_labels()
        unique_labels = {}
        for handle, label in zip(handles, labels_):
            if label not in unique_labels:
                unique_labels[label] = handle
        plt.legend(unique_labels.values(), unique_labels.keys(), title='Datasets', fontsize=12, title_fontsize=12, loc='best')
        
        # Optimize layout
        plt.tight_layout()
        
        # Show the plot
        plt.show()

In [49]:
# Combine both lists into a single list
combined_list = made + missed

data_df = pd.concat(combined_list, ignore_index=True, sort=False)

In [50]:
# Set random seed for reproducibility
np.random.seed(18)

# Step 0: Sort data_df by 'shot_id' and 'frame' to ensure frame order within each shot_id
data_df = data_df.sort_values(by=['shot_id', 'frame']).reset_index(drop=True)

# Step 1: Identify Unique shot_ids
unique_shot_ids = data_df['shot_id'].unique()

# Step 2: Shuffle the list of unique shot_ids
shuffled_shot_ids = np.random.permutation(unique_shot_ids)

# Step 3: Create a list of DataFrames ordered by shuffled shot_ids
shuffled_dfs = [data_df[data_df['shot_id'] == shot_id] for shot_id in shuffled_shot_ids]

# Step 4: Concatenate the shuffled DataFrames into one big DataFrame
shuffled_data_df = pd.concat(shuffled_dfs, ignore_index=True)

# Replace the original data_df with shuffled_data_df
data_df = shuffled_data_df

Now, we have a big shuffled dataset, each shot motion is grouped together and in a random order based on the shot id.

In [51]:
# Extract unique shot_ids in the order they appear in data_df
unique_shot_ids = data_df['shot_id'].unique()
print("Unique shot_ids in shuffled order:", unique_shot_ids)

# Define split ratios
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

# Total number of unique shot_ids
total_shots = len(unique_shot_ids)

# Calculate the number of shot_ids for each split
train_end = int(train_ratio * total_shots)
validation_end = train_end + int(validation_ratio * total_shots)

print(f"\nTotal shot_ids: {total_shots}")
print(f"Training shot_ids: {train_end}")
print(f"Validation shot_ids: {validation_end - train_end}")
print(f"Testing shot_ids: {total_shots - validation_end}")


# Assign shot_ids to each split
train_shot_ids = unique_shot_ids[:train_end]
validation_shot_ids = unique_shot_ids[train_end:validation_end]
test_shot_ids = unique_shot_ids[validation_end:]

print("\nAssigned shot_ids:")
print(f"Training shot_ids: {train_shot_ids}")
print(f"Validation shot_ids: {validation_shot_ids}")
print(f"Testing shot_ids: {test_shot_ids}")



Unique shot_ids in shuffled order: [23  1 10 13  7  2  4 21 12  8 16  5 14 26 22 17 24 25  9  3 19 15 18  6
 20 11]

Total shot_ids: 26
Training shot_ids: 18
Validation shot_ids: 3
Testing shot_ids: 5

Assigned shot_ids:
Training shot_ids: [23  1 10 13  7  2  4 21 12  8 16  5 14 26 22 17 24 25]
Validation shot_ids: [ 9  3 19]
Testing shot_ids: [15 18  6 20 11]


In [52]:
# Create boolean masks for each split
train_mask = data_df['shot_id'].isin(train_shot_ids)
validation_mask = data_df['shot_id'].isin(validation_shot_ids)
test_mask = data_df['shot_id'].isin(test_shot_ids)

# Create the splits
train_df = data_df[train_mask].reset_index(drop=True)
validation_df = data_df[validation_mask].reset_index(drop=True)
test_df = data_df[test_mask].reset_index(drop=True)

# Display the number of shot_ids and rows in each split
print("\nSplit Sizes:")
print(f"Training set: {train_df['shot_id'].nunique()} shot_ids, {len(train_df)} rows")
print(f"Validation set: {validation_df['shot_id'].nunique()} shot_ids, {len(validation_df)} rows")
print(f"Testing set: {test_df['shot_id'].nunique()} shot_ids, {len(test_df)} rows")



Split Sizes:
Training set: 18 shot_ids, 864 rows
Validation set: 3 shot_ids, 144 rows
Testing set: 5 shot_ids, 240 rows


Now that I have Train, Test, and Valid Sets, I need to normalize the data.

In [53]:
# After splitting the data
# Assuming train_df, validation_df, and test_df have been created

columns_to_exclude = ['frame', 'make', 'shot_id']
numeric_columns = train_df.select_dtypes(include = ["float64", "int"]).columns.tolist()
columns_to_scale = [col for col in numeric_columns if col not in columns_to_exclude]


# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(train_df[columns_to_scale])

# Transform the training, validation, and test sets
train_df[columns_to_scale] = scaler.transform(train_df[columns_to_scale])
validation_df[columns_to_scale] = scaler.transform(validation_df[columns_to_scale])
test_df[columns_to_scale] = scaler.transform(test_df[columns_to_scale])

print("Normalization completed after splitting the data.")


Normalization completed after splitting the data.


Now, I have three dataframes: train, test, and valid. I would like to convert these into three separate .npy files. Each with the shape of (Shot Motion, Frame, Features).

This allow me to input these files into LSTM_Model.py and start training the model. 

In [54]:
def dataframe_to_tensor(df, feature_columns):
    grouped = df.groupby('shot_id')
    data_list = []
    labels_list = []
    
    for shot_id, group in grouped:
        # Sort the group by 'frame' to maintain sequential order
        sorted_group = group.sort_values('frame')
        
        # Extract feature values
        features = sorted_group[feature_columns].values  # Shape: (num_frames, num_features)
        
        # Extract label (assumes 'make' is consistent within a shot)
        label = sorted_group['make'].iloc[0]
        
        data_list.append(features)
        labels_list.append(label)
    
    # Convert lists to NumPy arrays
    data_tensor = np.array(data_list)      # Shape: (num_shots, num_frames, num_features)
    labels = np.array(labels_list)        # Shape: (num_shots,)
    
    return data_tensor, labels

In [55]:
# Define columns to exclude
columns_to_exclude = ['frame', 'make', 'shot_id']

# Assuming all DataFrames have the same columns, use one to determine feature columns
feature_columns = [col for col in train_df.columns if col not in columns_to_exclude]

output_folder = 'Datasets'  # Ensure this folder exists

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Convert training data
train_data_tensor, train_labels = dataframe_to_tensor(train_df, feature_columns)
print(f"Training data tensor shape: {train_data_tensor.shape}")
print(f"Training labels shape: {train_labels.shape}")

# Convert validation data
validation_data_tensor, validation_labels = dataframe_to_tensor(validation_df, feature_columns)
print(f"Validation data tensor shape: {validation_data_tensor.shape}")
print(f"Validation labels shape: {validation_labels.shape}")

# Convert testing data
test_data_tensor, test_labels = dataframe_to_tensor(test_df, feature_columns)
print(f"Testing data tensor shape: {test_data_tensor.shape}")
print(f"Testing labels shape: {test_labels.shape}")


Training data tensor shape: (18, 48, 40)
Training labels shape: (18,)
Validation data tensor shape: (3, 48, 40)
Validation labels shape: (3,)
Testing data tensor shape: (5, 48, 40)
Testing labels shape: (5,)


In [62]:
print(test_data_tensor[0])
print(test_data_tensor[1])
print(test_labels)

[[-9.87616941e-01 -1.14171804e+00 -1.63398057e-01 ... -8.85143742e-01
  -8.16187438e-01  1.56746645e+00]
 [-9.71618105e-01 -1.04110756e+00  3.95185975e-04 ... -8.93214041e-01
  -7.85031073e-01  1.55656012e+00]
 [-9.05337215e-01 -9.29730653e-01  8.46577581e-02 ... -9.10285827e-01
  -7.53874708e-01  1.54565379e+00]
 ...
 [-5.31078741e-01 -4.14426811e-01 -1.70468415e-01 ...  1.15865948e+00
  -7.07295013e-01 -1.23358511e+00]
 [-6.95885198e-02  3.07790581e-01  7.46475650e-01 ...  1.15089958e+00
  -3.93625365e-01 -1.45194634e+00]
 [ 8.54478152e-02  5.80587957e-02  2.33193152e-01 ...  1.14981319e+00
   6.72162985e-02 -1.70179522e+00]]
[[-0.82077194 -0.89656508  0.06174494 ... -0.90656107 -0.41790627
   1.35589481]
 [-0.6837343  -0.70425428  0.20310289 ... -0.9124586  -0.40427923
   1.35067863]
 [-0.57840863 -0.55587549  0.14999273 ... -0.92471925 -0.39065219
   1.34546245]
 ...
 [-0.60145457 -0.00567355 -0.20752195 ...  1.1571075  -0.54575267
  -1.22215105]
 [-0.16472445  0.80774916  0.704953

In [65]:
def save_tensor_and_labels(data_tensor, labels, dataset_type, output_dir='Datasets'):
    """
    Saves the data tensor and labels as .npy files.
    
    Parameters:
    - data_tensor (np.ndarray): 3D data tensor.
    - labels (np.ndarray): 1D labels array.
    - dataset_type (str): Type of dataset ('train', 'validation', 'test').
    - output_dir (str): Directory to save the .npy files.
    """
    data_path = os.path.join(output_dir, f"{dataset_type}_data.npy")
    labels_path = os.path.join(output_dir, f"{dataset_type}_labels.npy")
    
    np.save(data_path, data_tensor)
    np.save(labels_path, labels)
    
    print(f"Saved {dataset_type} data to {data_path}")
    print(f"Saved {dataset_type} labels to {labels_path}")

# Save all datasets
save_tensor_and_labels(train_data_tensor, train_labels, 'train', output_dir=output_folder)
save_tensor_and_labels(validation_data_tensor, validation_labels, 'validation', output_dir=output_folder)
save_tensor_and_labels(test_data_tensor, test_labels, 'test', output_dir=output_folder)


Saved train data to Datasets\train_data.npy
Saved train labels to Datasets\train_labels.npy
Saved validation data to Datasets\validation_data.npy
Saved validation labels to Datasets\validation_labels.npy
Saved test data to Datasets\test_data.npy
Saved test labels to Datasets\test_labels.npy
