In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

## **Column predicted: hit_type**

In [None]:
# Baseline: predict `hit_type` from the single feature `set_loc`,
# with the feature column cast to pandas' categorical dtype.
df = pd.read_csv("processed_data.csv")
X = df[["set_loc"]].astype("category")
y = df["hit_type"]

# Fixed seed so the 80/20 split -- and therefore every metric printed below --
# is reproducible on Restart & Run All. 42 matches the seed already used by the
# feature-search cells of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# train and predict
# random_state pins any internal randomness of the estimator as well.
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Classes that are never predicted have undefined precision. zero_division=0
# reports them as 0.00 (the same numbers the default produces) without
# emitting an UndefinedMetricWarning per metric.
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.6438
Classification Report:
              precision    recall  f1-score   support

     blocked       0.00      0.00      0.00        55
   free_ball       0.00      0.00      0.00        13
         hit       0.64      1.00      0.78       235
   off_speed       0.00      0.00      0.00        24
    overpass       0.00      0.00      0.00         3
   roll_shot       0.00      0.00      0.00        11
         tip       0.00      0.00      0.00        24

    accuracy                           0.64       365
   macro avg       0.09      0.14      0.11       365
weighted avg       0.41      0.64      0.50       365



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## **Column predicted: set_loc**

In [None]:
# Baseline: predict `set_loc` from the single feature `end_of_rally`,
# with the feature column cast to pandas' categorical dtype.
# NOTE(review): `end_of_rally` presumably is only known after the set has been
# played -- confirm it is a legitimate predictor and not leaked information.
df = pd.read_csv("processed_data.csv")

X = df[["end_of_rally"]].astype("category")
y = df["set_loc"]

# Fixed seed so the 80/20 split -- and therefore every metric printed below --
# is reproducible on Restart & Run All. 42 matches the seed already used by the
# feature-search cells of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# train and predict
# random_state pins any internal randomness of the estimator as well.
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Classes that are never predicted have undefined precision. zero_division=0
# reports them as 0.00 (the same numbers the default produces) without
# emitting an UndefinedMetricWarning per metric.
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.3973
Classification Report:
              precision    recall  f1-score   support

           1       0.40      1.00      0.57       145
           2       0.00      0.00      0.00        78
           3       0.00      0.00      0.00        84
           4       0.00      0.00      0.00        38
           5       0.00      0.00      0.00        20

    accuracy                           0.40       365
   macro avg       0.08      0.20      0.11       365
weighted avg       0.16      0.40      0.23       365



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## **Column: hit_type (using 10 features with the highest predictive accuracy)**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from itertools import combinations


# Exhaustive feature-subset search for predicting `hit_type`.
# Every non-empty subset of `candidate_features` is tried: 2**n - 1 model fits
# (1023 for the 10 features below), so adding candidates roughly doubles the
# runtime per extra feature.
df = pd.read_csv("processed_data.csv")
candidate_features = ["receive_location", "hitter_location", "hit_land_location", "pass_rating", "set_type", "end_of_rally", "num_blockers", "block_touch", "Score_TeamA", "Score_TeamB"]  # Edit this list

# Loop-invariant work, done once instead of once per subset: the categorical
# cast, the target column and the train/test split. With a fixed random_state
# the split depends only on the number of rows, so every subset is evaluated on
# exactly the same train/test rows as if it had been split on its own.
X = df[candidate_features].astype("category")
y = df["hit_type"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

results = []
# Try all combinations of 1 to all features
for r in range(1, len(candidate_features) + 1):
    for combo in combinations(candidate_features, r):
        cols = list(combo)

        # random_state pins any internal randomness of the estimator so the
        # ranking below is reproducible.
        model = HistGradientBoostingClassifier(random_state=42)
        model.fit(X_train[cols], y_train)
        y_pred = model.predict(X_test[cols])
        accuracy = accuracy_score(y_test, y_pred)
        results.append((combo, accuracy))

# Sort results by accuracy in descending order. sorted() is stable, so among
# ties the subset generated first (i.e. the smaller one) is listed first.
sorted_results = sorted(results, key=lambda x: x[1], reverse=True)

print("\nAccuracy for all feature combinations (sorted):")
for combo, acc in sorted_results:
    print(f"{combo}: {acc:.4f}")
# NOTE(review): the winner is selected on the same test set it is scored on, so
# `best_accuracy` is an optimistic estimate; a separate validation split or
# cross-validation would be needed for an unbiased number.
best_features, best_accuracy = sorted_results[0]
print("\nBest feature combination:")
print(f"{best_features} -> Accuracy: {best_accuracy:.4f}")


Accuracy for all feature combinations (sorted):
('hitter_location', 'hit_land_location', 'pass_rating', 'end_of_rally', 'num_blockers', 'block_touch'): 0.7233
('receive_location', 'hitter_location', 'hit_land_location', 'pass_rating', 'set_type', 'num_blockers', 'block_touch', 'Score_TeamB'): 0.7205
('receive_location', 'hitter_location', 'hit_land_location', 'pass_rating', 'num_blockers', 'block_touch', 'Score_TeamA', 'Score_TeamB'): 0.7178
('receive_location', 'hitter_location', 'hit_land_location', 'pass_rating', 'set_type', 'end_of_rally', 'num_blockers', 'block_touch', 'Score_TeamB'): 0.7178
('hit_land_location', 'set_type', 'block_touch'): 0.7123
('receive_location', 'hitter_location', 'hit_land_location', 'pass_rating', 'num_blockers', 'block_touch', 'Score_TeamA'): 0.7123
('receive_location', 'hitter_location', 'hit_land_location', 'end_of_rally', 'num_blockers', 'block_touch', 'Score_TeamA'): 0.7123
('hitter_location', 'hit_land_location', 'pass_rating', 'set_type', 'end_of_r

## **Column: set_loc (using 10 features with the highest predictive accuracy)**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from itertools import combinations


# Exhaustive feature-subset search for predicting `set_loc`.
# Every non-empty subset of `candidate_features` is tried: 2**n - 1 model fits
# (1023 for the 10 features below), so adding candidates roughly doubles the
# runtime per extra feature.
# NOTE(review): several candidates (e.g. `hit_type`, `hit_land_location`,
# `block_touch`) look like they describe events that happen after the set --
# confirm they are available at prediction time and not leaked information.
df = pd.read_csv("processed_data.csv")
candidate_features = ["hitter_location", "num_blockers", "block_touch", "receive_location", "hit_type", "streak_B", "hit_land_location", "pass_rating", "points_to_win_A", "points_to_win_B"]  # Edit this list

# Loop-invariant work, done once instead of once per subset: the categorical
# cast, the target column and the train/test split. With a fixed random_state
# the split depends only on the number of rows, so every subset is evaluated on
# exactly the same train/test rows as if it had been split on its own.
X = df[candidate_features].astype("category")
y = df["set_loc"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

results = []
# Try all combinations of 1 to all features
for r in range(1, len(candidate_features) + 1):
    for combo in combinations(candidate_features, r):
        cols = list(combo)

        # random_state pins any internal randomness of the estimator so the
        # ranking below is reproducible.
        model = HistGradientBoostingClassifier(random_state=42)
        model.fit(X_train[cols], y_train)
        y_pred = model.predict(X_test[cols])
        accuracy = accuracy_score(y_test, y_pred)
        results.append((combo, accuracy))

# Sort results by accuracy in descending order. sorted() is stable, so among
# ties the subset generated first (i.e. the smaller one) is listed first.
sorted_results = sorted(results, key=lambda x: x[1], reverse=True)


print("\nAccuracy for all feature combinations (sorted):")
for combo, acc in sorted_results:
    print(f"{combo}: {acc:.4f}")
# NOTE(review): the winner is selected on the same test set it is scored on, so
# `best_accuracy` is an optimistic estimate; a separate validation split or
# cross-validation would be needed for an unbiased number.
best_features, best_accuracy = sorted_results[0]
print("\nBest feature combination:")
print(f"{best_features} -> Accuracy: {best_accuracy:.4f}")


Accuracy for all feature combinations (sorted):
('hitter_location', 'num_blockers', 'block_touch', 'streak_B', 'hit_land_location', 'pass_rating'): 0.9123
('hitter_location', 'num_blockers', 'hit_type', 'streak_B', 'pass_rating'): 0.9041
('hitter_location', 'num_blockers', 'hit_land_location', 'pass_rating'): 0.9014
('hitter_location', 'num_blockers', 'block_touch', 'hit_type', 'pass_rating'): 0.9014
('hitter_location', 'num_blockers', 'block_touch', 'streak_B', 'pass_rating'): 0.9014
('hitter_location', 'num_blockers', 'block_touch', 'receive_location', 'hit_type', 'hit_land_location', 'pass_rating'): 0.9014
('hitter_location', 'pass_rating'): 0.8986
('hitter_location', 'num_blockers', 'pass_rating'): 0.8986
('hitter_location', 'num_blockers', 'block_touch', 'pass_rating'): 0.8986
('hitter_location', 'num_blockers', 'receive_location', 'pass_rating'): 0.8986
('hitter_location', 'num_blockers', 'hit_type', 'pass_rating'): 0.8986
('hitter_location', 'num_blockers', 'block_touch', 'rece