In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist

# Load the dataset
data = pd.read_csv("baseball.csv")

# Select relevant columns for analysis and create a copy
features = ["RS", "RA", "W", "OBP", "SLG", "BA"]
data_filtered = data[["Year", "Team"] + features].copy()

# Ensure the features columns are cast to float64
data_filtered[features] = data_filtered[features].astype(float)

# Normalize the data
scaler = MinMaxScaler()
data_filtered.loc[:, features] = scaler.fit_transform(data_filtered[features])

# Display the first few rows of the processed dataset
print(data_filtered.head())

   Year Team        RS        RA         W       OBP       SLG      BA
0  2012  ARI  0.496337  0.342314  0.539474  0.531250  0.615789  0.5625
1  2012  ATL  0.434066  0.202853  0.710526  0.447917  0.463158  0.4125
2  2012  BAL  0.456044  0.369255  0.697368  0.354167  0.610526  0.4125
3  2012  BOS  0.496337  0.529319  0.381579  0.395833  0.600000  0.5750
4  2012  CHC  0.274725  0.454834  0.276316  0.260417  0.405263  0.3250


In [3]:
# Define the query teams using abbreviations
query_teams = {
    "2001 SEA": data_filtered[(data_filtered["Year"] == 2001) & (data_filtered["Team"] == "SEA")],
    "2004 BOS": data_filtered[(data_filtered["Year"] == 2004) & (data_filtered["Team"] == "BOS")],
    "2011 STL": data_filtered[(data_filtered["Year"] == 2011) & (data_filtered["Team"] == "STL")]
}

# Find top 10 similar teams for each query team
results = {}
for team_name, query in query_teams.items():
    if query.empty:
        print(f"Warning: No data found for {team_name}. Skipping...")
        continue

    query_features = query[features].values
    distances = cdist(query_features, data_filtered[features].values, metric="cityblock")  # Manhattan distance
    data_filtered.loc[:, "Distance"] = distances.flatten()  # Fixed assignment

    similar_teams = (
        data_filtered.sort_values("Distance")
        .head(11)  # Including the query team itself
        .loc[data_filtered["Team"] != query["Team"].values[0]]  # Exclude the query team from results
    )
    results[team_name] = similar_teams[["Year", "Team", "Distance"]].head(10)

# Print rankings
for query_team, similar_teams in results.items():
    print(f"Top 10 similar teams for {query_team}:")
    print(similar_teams)
    print("\n")

Top 10 similar teams for 2001 SEA:
     Year Team  Distance
439  1998  NYY  0.262486
432  1998  HOU  0.481409
468  1997  NYY  0.506466
153  2007  BOS  0.559188
930  1976  CIN  0.563701
319  2002  NYY  0.614564
198  2006  NYY  0.630325
409  1999  NYY  0.630716
124  2008  CHC  0.649625
108  2009  NYY  0.649731


Top 10 similar teams for 2004 BOS:
     Year Team  Distance
385  2000  SFG  0.161228
198  2006  NYY  0.176186
108  2009  NYY  0.216745
504  1996  TEX  0.254476
485  1996  CLE  0.267277
366  2000  CHW  0.307300
409  1999  NYY  0.310881
168  2007  NYY  0.311562


Top 10 similar teams for 2011 STL:
     Year Team  Distance
651  1988  MIN  0.094317
784  1983  NYY  0.134278
76   2010  MIN  0.136500
263  2004  SDP  0.139376
87   2010  TEX  0.154040
138  2008  NYY  0.164019
767  1984  TOR  0.166694
193  2006  LAA  0.169292
66   2010  CIN  0.177178


