# Rugby Workout Recommender Prototype

Objective: Recommend a list of suitable workouts for rugby players based on their performance goal.

In [95]:
import pandas as pd
import numpy as np
import re #Regular expression matching operations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

## 1) Load CSVs

In [96]:
# Read the two input tables from CSV files located next to the notebook.
exercises = pd.read_csv(filepath_or_buffer="exercises.csv")
goals = pd.read_csv(filepath_or_buffer="goals.csv")

In [97]:
# Preview the exercises table: its shape first, then the first five rows.
print("Exercises: ", exercises.shape)
display(exercises.head(n=5))

Exercises:  (30, 9)


Unnamed: 0,exercise_id,exercise_name,type,target_muscles,primary_attributes,intensity_level,equipment_needed,category,description
0,1,Front Squat,Weight,"Legs, Core",Strength,9,Barbell,Power,Builds leg drive and core stability crucial fo...
1,2,Back Squat,Weight,"Legs, Core",Strength,9,Barbell,Power,Enhances lower-body strength and explosiveness...
2,3,Deadlift,Weight,"Legs, Back, Core",Strength,9,Barbell,Power,Develops full-body pulling power and posterior...
3,4,Sled Push,Weight,"Legs, Core, Shoulders",Explosiveness,8,Sled,Power,Simulates scrum engagement and improves drivin...
4,5,Power Clean,Weight,"Legs, Core, Shoulders",Explosiveness,8,Barbell,Power,Builds total-body explosive power for tackles ...


In [98]:
# Preview the goals table: its shape first, then the first five rows.
print("Goals", goals.shape)
display(goals.head(n=5))

Goals (12, 5)


Unnamed: 0,goal_id,goal_name,focus_muscles,target_attributes,description
0,1,Scrum Power,"Legs, Core, Shoulders","Strength, Power, Stability","Improves pushing force, body stability, and lo..."
1,2,Tackling Power,"Legs, Shoulders, Core","Strength, Explosiveness, Power","Enhances total-body power, shoulder drive, and..."
2,3,Sprint Speed,"Legs, Core","Explosiveness, Speed, Power",Boosts acceleration and sprinting ability for ...
3,4,Agility & Footwork,"Legs, Core","Agility, Coordination, Speed","Improves lateral movement, quickness, and reac..."
4,5,Endurance & Conditioning,Full Body,"Endurance, Stamina, Cardiovascular",Increases ability to sustain high performance ...


## 2) Cleaning & Normalization

In [99]:
# Text normaliser applied to the free-text columns
def normalize_text(s):
    """Lower-case a raw cell value and strip punctuation, keeping commas.

    Missing values (anything ``pd.isna`` reports as missing) become ``""``.
    Any other value is stringified and lower-cased; every character that is
    not a word character, whitespace or a comma is replaced by a space; runs
    of whitespace are then collapsed to one space and the ends are trimmed.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    # Commas are kept so list-like columns can still be split on them later.
    without_punctuation = re.sub(r"[^\w\s,]", " ", lowered)
    return re.sub(r"\s+", " ", without_punctuation).strip()

In [100]:
# Free-text columns to run through normalize_text, one list per table.
text_cols_ex = [
    "exercise_name",
    "target_muscles",
    "primary_attributes",
    "category",
    "description",
]
text_cols_goal = [
    "goal_name",
    "focus_muscles",
    "target_attributes",
    "description",
]

In [101]:
# Normalise every free-text exercise column.
# The raw values are handed straight to normalize_text, which already maps
# missing values to "" and stringifies everything else. Casting with
# .astype(str) first would turn NaN into the literal text "nan", which would
# then slip past the missing-value check and end up as a token in the data.
for c in text_cols_ex:
    exercises[c] = exercises[c].apply(normalize_text)

In [102]:
# Normalise every free-text goal column.
# The raw values are handed straight to normalize_text, which already maps
# missing values to "" and stringifies everything else. Casting with
# .astype(str) first would turn NaN into the literal text "nan", which would
# then slip past the missing-value check and end up as a token in the data.
for c in text_cols_goal:
    goals[c] = goals[c].apply(normalize_text)

## 3) Handle missing values & duplicates

In [103]:
# De-duplicate each table on its name column, keeping the first occurrence,
# and renumber the rows from zero.
exercises = exercises.drop_duplicates(subset=["exercise_name"]).reset_index(drop=True)
goals = goals.drop_duplicates(subset=["goal_name"]).reset_index(drop=True)

# Coerce intensity to numeric (unparseable entries become NaN), then replace
# the gaps with the column median truncated to an integer.
if "intensity_level" in exercises.columns:
    intensity_numeric = pd.to_numeric(exercises["intensity_level"], errors="coerce")
    exercises["intensity_level"] = intensity_numeric.fillna(int(intensity_numeric.median()))

## 4) Convert comma-separated lists into Python lists

In [104]:
def csv_to_list(s):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Falsy input (``""``, ``None``) and values ``pd.isna`` reports as missing
    yield an empty list.
    """
    if not s or pd.isna(s):
        return []
    # Trim every piece first, then discard the ones that end up empty.
    trimmed_pieces = (piece.strip() for piece in s.split(","))
    return [piece for piece in trimmed_pieces if piece]

In [105]:
# Exercises: split each comma-separated text column into a list-valued column.
for list_col, text_col in (
    ("target_muscles_list", "target_muscles"),
    ("primary_attributes_list", "primary_attributes"),
):
    exercises[list_col] = exercises[text_col].apply(csv_to_list)

In [106]:
# Goals: split each comma-separated text column into a list-valued column.
for list_col, text_col in (
    ("focus_muscles_list", "focus_muscles"),
    ("target_attributes_list", "target_attributes"),
):
    goals[list_col] = goals[text_col].apply(csv_to_list)

## 5) Multi-hot encode muscles and attributes

In [107]:
# One binarizer per label family, so muscles and attributes keep separate vocabularies.
mlb_muscles, mlb_attrs = MultiLabelBinarizer(), MultiLabelBinarizer()

In [108]:
# Pool the label lists of BOTH tables (exercises first, then goals) so the
# binarizers can be fitted on one shared vocabulary and both tables end up
# with the same encoded columns.
all_muscles = [*exercises["target_muscles_list"], *goals["focus_muscles_list"]]
all_attrs = [*exercises["primary_attributes_list"], *goals["target_attributes_list"]]

In [109]:
# Fit each binarizer on its pooled label lists (all_muscles / all_attrs are
# built from the exercises and the goals tables together).
mlb_muscles.fit(all_muscles)
mlb_attrs.fit(all_attrs)

In [110]:
# Encode the exercises' label lists as indicator columns: one "m_<label>"
# column per fitted muscle class and one "a_<label>" column per attribute class.
muscle_matrix_ex = pd.DataFrame(mlb_muscles.transform(exercises["target_muscles_list"]))
muscle_matrix_ex.columns = [f"m_{c}" for c in mlb_muscles.classes_]

attr_matrix_ex = pd.DataFrame(mlb_attrs.transform(exercises["primary_attributes_list"]))
attr_matrix_ex.columns = [f"a_{c}" for c in mlb_attrs.classes_]

In [111]:
# Attach the indicator columns to the exercises table.
# Columns with the same names left over from an earlier run of this cell are
# dropped first, so re-running the cell replaces them instead of appending
# duplicate m_*/a_* columns (which would later be picked up twice when the
# structured feature matrix is built).
exercises = pd.concat(
    [
        exercises.drop(
            columns=[*muscle_matrix_ex.columns, *attr_matrix_ex.columns],
            errors="ignore",
        ).reset_index(drop=True),
        muscle_matrix_ex,
        attr_matrix_ex,
    ],
    axis=1,
)


In [112]:
# transform goals (into same column space as the exercises, using the same fitted binarizers)
muscle_matrix_goal = pd.DataFrame(mlb_muscles.transform(goals["focus_muscles_list"]),
                                  columns=[f"m_{c}" for c in mlb_muscles.classes_])
attr_matrix_goal = pd.DataFrame(mlb_attrs.transform(goals["target_attributes_list"]),
                                columns=[f"a_{c}" for c in mlb_attrs.classes_])

# Attach the indicator columns to the goals table. Columns with the same names
# left over from an earlier run of this cell are dropped first, so re-running
# the cell replaces them instead of appending duplicate m_*/a_* columns.
goals = pd.concat(
    [
        goals.drop(
            columns=[*muscle_matrix_goal.columns, *attr_matrix_goal.columns],
            errors="ignore",
        ).reset_index(drop=True),
        muscle_matrix_goal,
        attr_matrix_goal,
    ],
    axis=1,
)

## 6) Numeric feature scaling

In [113]:
# Min-max scale intensity_level into a new intensity_norm column.
if "intensity_level" in exercises.columns:
    scaler = MinMaxScaler()
    scaler.fit(exercises[["intensity_level"]])
    exercises["intensity_norm"] = scaler.transform(exercises[["intensity_level"]])

## 7) Create a combined text feature (for TF-IDF)

In [114]:
# Build one space-separated text field per row for TF-IDF.
# Exercises: muscles + attributes + category + description.
exercises["features_text"] = exercises["target_muscles"].fillna("").str.cat(
    [
        exercises["primary_attributes"].fillna(""),
        exercises["category"].fillna(""),
        exercises["description"].fillna(""),
    ],
    sep=" ",
)

# Goals: muscles + attributes + description (the goals table has no category column in this text).
goals["features_text"] = goals["focus_muscles"].fillna("").str.cat(
    [
        goals["target_attributes"].fillna(""),
        goals["description"].fillna(""),
    ],
    sep=" ",
)

# Sanity check: show the name next to the combined text for the first rows
display(exercises[["exercise_name", "features_text"]].head())
display(goals[["goal_name", "features_text"]].head())

Unnamed: 0,exercise_name,features_text
0,front squat,"legs, core strength power builds leg drive and..."
1,back squat,"legs, core strength power enhances lower body ..."
2,deadlift,"legs, back, core strength power develops full ..."
3,sled push,"legs, core, shoulders explosiveness power simu..."
4,power clean,"legs, core, shoulders explosiveness power buil..."


Unnamed: 0,goal_name,features_text
0,scrum power,"legs, core, shoulders strength, power, stabili..."
1,tackling power,"legs, shoulders, core strength, explosiveness,..."
2,sprint speed,"legs, core explosiveness, speed, power boosts ..."
3,agility footwork,"legs, core agility, coordination, speed improv..."
4,endurance conditioning,"full body endurance, stamina, cardiovascular i..."


## 8) Vectorize text (TF-IDF)

In [115]:
# Step 8: TF-IDF over unigrams and bigrams with English stop words removed.
# The vocabulary is learned from the exercise text only; the goal text is then
# projected onto that same vocabulary.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
)

exercise_tfidf = vectorizer.fit_transform(exercises["features_text"])
goal_tfidf = vectorizer.transform(goals["features_text"])


In [116]:
import joblib

In [117]:
# Save vectorizer for reuse
# (the fitted TfidfVectorizer is written to disk so the same vocabulary can be reloaded later)
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
# You may also want to save the sparse matrices (joblib can handle them)
# Only the exercise-side TF-IDF matrix is written here; goal_tfidf is not saved by this cell.
joblib.dump(exercise_tfidf, "exercise_tfidf.joblib")

['exercise_tfidf.joblib']

In [118]:
from scipy.sparse import hstack, csr_matrix

# pick structured columns (muscle + attr columns we made)
struct_cols = [c for c in exercises.columns if c.startswith("m_") or c.startswith("a_")]


# Safety net: make sure goals exposes every structured column the exercises
# have, filling any that are absent with zeros so both matrices share a width.
for col in struct_cols:
    if col not in goals.columns:
        goals[col] = 0

# build structured numeric arrays
exercise_struct = csr_matrix(exercises[struct_cols].values)   # sparse matrix
goal_struct = csr_matrix(goals[struct_cols].values)

# combine TF-IDF with structured features horizontally.
# The result format is requested explicitly: hstack's default output format is
# not guaranteed, and the recommender slices single rows out of goal_matrix,
# which needs a row-sliceable format such as CSR.
exercise_matrix = hstack([exercise_tfidf, exercise_struct], format="csr")
goal_matrix = hstack([goal_tfidf, goal_struct], format="csr")

# save combined if desired
joblib.dump(exercise_matrix, "exercise_matrix.joblib")
joblib.dump(goal_matrix, "goal_matrix.joblib")

['goal_matrix.joblib']

In [119]:
# Step 10: simple recommender using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

def recommend(goal_name, top_n=6, use_structured=True):
    """Return the exercises most similar to a goal, best match first.

    Parameters
    ----------
    goal_name : str
        Name of the goal. It is matched against ``goals["goal_name"]`` as
        given; if that fails, it is cleaned with ``normalize_text`` (the same
        cleaning applied to the stored names) and matched again, so
        "Scrum Power" and "scrum power" both resolve.
    top_n : int, default 6
        Maximum number of exercises to return. Values of zero or below return
        an empty result; values above the number of exercises return them all.
    use_structured : bool, default True
        If True, compare on the combined TF-IDF + muscle/attribute matrices;
        otherwise on the TF-IDF matrices alone.

    Returns
    -------
    pandas.DataFrame
        The selected exercise columns plus a ``score`` column holding the
        cosine similarity, ordered from highest to lowest score, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no goal matches ``goal_name``.
    """
    known_goals = goals["goal_name"]
    matches = known_goals == goal_name
    if not matches.any():
        # Stored names were normalised, so retry with the same normalisation.
        matches = known_goals == normalize_text(goal_name)
    if not matches.any():
        raise ValueError("goal not found")
    # Row position (not index label): the feature matrices are addressed by position.
    idx = int(matches.to_numpy().argmax())

    # choose which matrices to use
    if use_structured:
        goal_rows, exercise_rows = goal_matrix, exercise_matrix
    else:
        goal_rows, exercise_rows = goal_tfidf, exercise_tfidf
    sims = cosine_similarity(goal_rows[idx:idx+1], exercise_rows)[0]

    # Sort descending, then truncate. Slicing from the front keeps top_n=0
    # empty; a "[-top_n:]" slice would return every exercise for top_n=0.
    limit = max(int(top_n), 0)
    top_idx = sims.argsort()[::-1][:limit]

    output_cols = ["exercise_id","exercise_name","type","target_muscles","primary_attributes","intensity_level","equipment_needed","category"]
    # top_idx holds row positions in the matrix, so select positionally.
    recs = exercises[output_cols].iloc[top_idx].copy()
    recs["score"] = sims[top_idx]
    return recs.reset_index(drop=True)

# example
print("SCRUM POWER:\n ",recommend("scrum power", top_n=6))


SCRUM POWER:
     exercise_id   exercise_name    type         target_muscles  \
0           28  overhead squat  Weight  legs, core, shoulders   
1            1     front squat  Weight             legs, core   
2            2      back squat  Weight             legs, core   
3           17  dumbbell lunge  Weight             legs, core   
4            4       sled push  Weight  legs, core, shoulders   
5           26     barbell row  Weight  back, shoulders, core   

  primary_attributes  intensity_level equipment_needed    category     score  
0          stability                9          Barbell       power  0.734572  
1           strength                9          Barbell       power  0.650489  
2           strength                9          Barbell       power  0.617113  
3           strength                7          Dumbell  lower body  0.605132  
4      explosiveness                8             Sled       power  0.552920  
5           strength                8          Barbell 

In [120]:
# Step 11: save processed CSVs and encoders
# The processed tables are written without the DataFrame index.
# NOTE(review): both tables also carry the list-valued *_list columns at this
# point — confirm how those should round-trip through CSV before relying on them.
exercises.to_csv("exercises_processed.csv", index=False)
goals.to_csv("goals_processed.csv", index=False)
# Persist the two fitted binarizers next to the CSVs.
joblib.dump(mlb_muscles, "mlb_muscles.joblib")
joblib.dump(mlb_attrs, "mlb_attrs.joblib")
# vectorizer and matrices already saved


['mlb_attrs.joblib']