In [17]:
import pandas as pd
import json

# Load the data
file_path = 'problems MoonBoard Masters 2017 40.json'

with open(file_path, 'r') as file:
    data = json.load(file)

# Normalize the data and convert to DataFrame
df = pd.json_normalize(data['data'])

# Extract holds and grades
df['holds'] = df['moves'].apply(lambda x: [move['description'] for move in x])
df['grade'] = df['grade']

# Display the relevant columns
df[['name', 'holds', 'grade']].head()

Unnamed: 0,name,holds,grade
0,Fat Guy In A Little Suit,"[J4, G6, F7, B8, B9, A11, D15, A16, F18]",6B+
1,White Lines,"[E6, I5, H10, K13, I16, H18]",6C
2,Allez!!!,"[F5, D9, E11, D15, H15, H18]",6B+
3,"TO JUG, OR NOT TO JUG...","[D15, D9, F12, F5, G13, H18, H10]",6A+
4,DEATH STAR,"[D18, D15, G13, H11, I4, J6, K9]",6C


In [18]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the holds
mlb = MultiLabelBinarizer()
hold_features = mlb.fit_transform(df['holds'])

# Create a DataFrame for the hold features
hold_df = pd.DataFrame(hold_features, columns=mlb.classes_)

# Combine the one-hot encoded holds with the original DataFrame
df = pd.concat([df, hold_df], axis=1)

# Drop the original holds column as it's now encoded
df = df.drop(columns=['holds'])

# Display the resulting DataFrame
df.head()

Unnamed: 0,name,grade,userGrade,setbyId,setby,method,userRating,repeats,isBenchmark,isMaster,...,K17,K18,K2,K3,K4,K5,K6,K7,K8,K9
0,Fat Guy In A Little Suit,6B+,6B+,55C65799-AB21-496C-A34E-1CC4B3210B27,Kyle Knapp,Feet follow hands,4,199,False,False,...,0,0,0,0,0,0,0,0,0,0
1,White Lines,6C,6C,55C65799-AB21-496C-A34E-1CC4B3210B27,Kyle Knapp,Feet follow hands,4,339,False,False,...,0,0,0,0,0,0,0,0,0,0
2,Allez!!!,6B+,6B+,55C65799-AB21-496C-A34E-1CC4B3210B27,Kyle Knapp,Feet follow hands,4,174,False,False,...,0,0,0,0,0,0,0,0,0,0
3,"TO JUG, OR NOT TO JUG...",6A+,6A+,55C65799-AB21-496C-A34E-1CC4B3210B27,Kyle Knapp,Feet follow hands,4,26153,True,False,...,0,0,0,0,0,0,0,0,0,0
4,DEATH STAR,6C,6C,55C65799-AB21-496C-A34E-1CC4B3210B27,Kyle Knapp,Feet follow hands,4,8274,True,False,...,0,0,0,0,0,0,0,0,0,1


In [19]:
from sklearn.preprocessing import LabelEncoder

# Encode the grades into numerical labels
le = LabelEncoder()
df['grade_encoded'] = le.fit_transform(df['grade'])

# Display the unique grade labels
list(le.classes_)

['6A+',
 '6B',
 '6B+',
 '6C',
 '6C+',
 '7A',
 '7A+',
 '7B',
 '7B+',
 '7C',
 '7C+',
 '8A',
 '8A+',
 '8B',
 '8B+']

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Extract holds and grade_encoded columns
df['holds'] = df['moves'].apply(lambda x: [move['description'] for move in x])

# Step 2: One-hot encode the holds
mlb = MultiLabelBinarizer()
hold_features = mlb.fit_transform(df['holds'])

# Step 3: Create a DataFrame from the one-hot encoded features
hold_df = pd.DataFrame(hold_features, columns=mlb.classes_)

# Step 4: Concatenate the one-hot encoded hold features to the original DataFrame
df = pd.concat([df, hold_df], axis=1)

# Step 5: Drop columns we don't need (non-numerical columns like 'moves', 'holds')
df = df.drop(columns=['holds', 'moves', 'setby', 'setbyId', 'name', 'moonBoardConfigurationId'])

# Step 6: Check the updated DataFrame structure
df.head()


Unnamed: 0,grade,userGrade,method,userRating,repeats,isBenchmark,isMaster,upgraded,downgraded,holdsets,...,K17,K18,K2,K3,K4,K5,K6,K7,K8,K9
0,6B+,6B+,Feet follow hands,4,199,False,False,False,False,"[{'description': 'Hold Set A', 'locations': No...",...,0,0,0,0,0,0,0,0,0,0
1,6C,6C,Feet follow hands,4,339,False,False,False,False,"[{'description': 'Hold Set A', 'locations': No...",...,0,0,0,0,0,0,0,0,0,0
2,6B+,6B+,Feet follow hands,4,174,False,False,False,False,"[{'description': 'Hold Set A', 'locations': No...",...,0,0,0,0,0,0,0,0,0,0
3,6A+,6A+,Feet follow hands,4,26153,True,False,False,False,"[{'description': 'Hold Set A', 'locations': No...",...,0,0,0,0,0,0,0,0,0,0
4,6C,6C,Feet follow hands,4,8274,True,False,False,False,"[{'description': 'Hold Set B', 'locations': No...",...,0,0,0,0,0,0,0,0,0,1


In [21]:
print(df.head())

  grade userGrade             method  userRating  repeats  isBenchmark  \
0   6B+       6B+  Feet follow hands           4      199        False   
1    6C        6C  Feet follow hands           4      339        False   
2   6B+       6B+  Feet follow hands           4      174        False   
3   6A+       6A+  Feet follow hands           4    26153         True   
4    6C        6C  Feet follow hands           4     8274         True   

   isMaster  upgraded  downgraded  \
0     False     False       False   
1     False     False       False   
2     False     False       False   
3     False     False       False   
4     False     False       False   

                                            holdsets  ...  K17  K18 K2 K3 K4  \
0  [{'description': 'Hold Set A', 'locations': No...  ...    0    0  0  0  0   
1  [{'description': 'Hold Set A', 'locations': No...  ...    0    0  0  0  0   
2  [{'description': 'Hold Set A', 'locations': No...  ...    0    0  0  0  0   
3  [{'descri

In [22]:
print(df.dtypes)

grade         object
userGrade     object
method        object
userRating     int64
repeats        int64
               ...  
K5             int64
K6             int64
K7             int64
K8             int64
K9             int64
Length: 415, dtype: object


In [43]:
# Step 1: Define Features (X) and Target (y)
X = df.drop(columns=['grade', 'grade_encoded','isMaster', 'upgraded','hasBetaVideo', 'dateInserted', 'downgraded', 
                     'isBenchmark','holdsetup.holdsets','userRating','repeats','apiId','holdsetup.apiId','holdsetup.description','dateUpdated','dateDeleted', 'userGrade', 'method', 'holdsets'])  # Drop non-numerical columns

y = df['grade_encoded']  # The target is the encoded grade

# Step 2: Check if X is numerical
print(X.dtypes)  # This should show only numerical (int64, float64) columns

# Step 3: Display the first few rows of X and y to verify
print("First few rows of X (features):")
print(X.head())

print("\nFirst few rows of y (target):")
print(y.head())

A1     int64
A10    int64
A11    int64
A12    int64
A13    int64
       ...  
K5     int64
K6     int64
K7     int64
K8     int64
K9     int64
Length: 396, dtype: object
First few rows of X (features):
   A1  A10  A11  A12  A13  A14  A15  A16  A17  A18  ...  K17  K18  K2  K3  K4  \
0   0    0    1    0    0    0    0    1    0    0  ...    0    0   0   0   0   
1   0    0    0    0    0    0    0    0    0    0  ...    0    0   0   0   0   
2   0    0    0    0    0    0    0    0    0    0  ...    0    0   0   0   0   
3   0    0    0    0    0    0    0    0    0    0  ...    0    0   0   0   0   
4   0    0    0    0    0    0    0    0    0    0  ...    0    0   0   0   0   

   K5  K6  K7  K8  K9  
0   0   0   0   0   0  
1   0   0   0   0   0  
2   0   0   0   0   0  
3   0   0   0   0   0  
4   0   0   0   0   1  

[5 rows x 396 columns]

First few rows of y (target):
0    2
1    3
2    2
3    0
4    3
Name: grade_encoded, dtype: int64


In [44]:
from sklearn.model_selection import train_test_split

# Step 1: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Check the shape of the split datasets
print("Training feature set shape (X_train):", X_train.shape)
print("Testing feature set shape (X_test):", X_test.shape)
print("Training target set shape (y_train):", y_train.shape)
print("Testing target set shape (y_test):", y_test.shape)


Training feature set shape (X_train): (38027, 396)
Testing feature set shape (X_test): (9507, 396)
Training target set shape (y_train): (38027,)
Testing target set shape (y_test): (9507,)


In [47]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Step 2: Train the model on the training data
rf_model.fit(X_train, y_train)

# Step 3: Check if the model has been trained by printing the model object
print("Random Forest model trained successfully!")


Random Forest model trained successfully!


In [48]:
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Step 2: Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Step 3: Display a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Accuracy: 0.35
Classification Report:
              precision    recall  f1-score   support

         6A+       0.61      0.75      0.68      2570
          6B       0.23      0.14      0.17      1111
         6B+       0.24      0.24      0.24      1403
          6C       0.19      0.11      0.14       793
         6C+       0.23      0.27      0.25      1108
          7A       0.23      0.35      0.28       976
         7A+       0.18      0.18      0.18       618
          7B       0.17      0.07      0.09       303
         7B+       0.12      0.05      0.08       294
          7C       0.22      0.09      0.12       186
         7C+       0.21      0.05      0.08        85
          8A       0.12      0.03      0.05        33
         8A+       0.00      0.00      0.00         2
          8B       0.00      0.00      0.00         2
         8B+       0.00      0.00      0.00        23

    accuracy                           0.35      9507
   macro avg       0.18      0.16      0.1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
