In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from joblib import dump, load  # Import for saving and loading models

# Load the cleaned dataset from the working directory.
df_new = pd.read_csv('cleaned.csv')

# Define features (X) and target (Y).
X = df_new[['Team', 'Game Number', 'Opponent Team', 'Arena Capacity']]
Y = df_new['Percentage']

# Split the RAW rows first so that the encoder below is fitted on training
# data only; fitting it on the full frame would leak information about the
# test rows into the preprocessing step.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=7
)

# One-hot encode the categorical columns; the remaining columns are passed
# through unchanged.
categorical_features = ['Team', 'Opponent Team']
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes a category that was not present in
        # the training rows as all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)

# Learn the encoding from the training rows, then apply it to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Train the DecisionTreeRegressor (seeded for reproducibility).
regressor = DecisionTreeRegressor(random_state=7)
regressor.fit(X_train, Y_train)

# Save the trained model to a file. The fitted preprocessor is saved next to
# it because the regressor expects already-encoded input: new raw rows must go
# through the same fitted encoder before `regressor.predict` can be called.
dump(regressor, 'model.joblib')
dump(preprocessor, 'preprocessor.joblib')
print("Model saved as 'model.joblib'")

# Make predictions on both partitions.
train_pred = regressor.predict(X_train)
test_pred = regressor.predict(X_test)

# Tolerance for counting a prediction as "correct".
# NOTE(review): this is an absolute difference in the units of 'Percentage';
# it corresponds to 5 percentage points only if that column is stored as a
# 0-1 fraction -- confirm against the data.
margin_of_error = 0.05

# Share of predictions whose absolute error is within the margin.
train_accuracy = (abs(train_pred - Y_train) <= margin_of_error).mean()
test_accuracy = (abs(test_pred - Y_test) <= margin_of_error).mean()

print(f"Training accuracy: {train_accuracy}")
print(f"Testing accuracy: {test_accuracy}")

Model saved as 'model.joblib'
Training accuracy: 1.0
Testing accuracy: 0.8211382113821138
