### Predictive Analysis

In [2]:
import warnings
# Silence every warning category for the rest of the kernel session, so any
# warning emitted by later cells is not shown.
warnings.filterwarnings('ignore') 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load the cleaned dataset.
df = pd.read_csv('merged_olympics.csv')

# Candidate predictors, grouped by what they describe.
athlete_features = ['Age', 'Height', 'Weight']
sport_features = ['Sport']
country_features = ['Team']

# Feature selection. Take an explicit copy of the selected columns: X has its
# categorical columns overwritten during preprocessing, and an independent
# frame keeps `df` untouched and avoids pandas' SettingWithCopyWarning on
# those assignments.
X = df[athlete_features + sport_features + country_features].copy()
def medal_to_binary(medal_str):
    """Map a medal label to a binary win flag.

    Returns 1 when ``medal_str`` is "Gold", "Silver" or "Bronze", and 0 for
    any other value (including missing values such as None or NaN).
    """
    return 1 if medal_str in ("Gold", "Silver", "Bronze") else 0
# Binary target: 1 when the row's Medal is Gold/Silver/Bronze, 0 otherwise.
df['Medal_Win'] = df['Medal'].map(medal_to_binary)
y = df['Medal_Win']
# Preprocessing categorical features: encode each object column as integer
# codes. Every column gets its own LabelEncoder, kept in `label_encoders`, so
# each mapping can still be inverted later; `le` ends up referring to the
# encoder of the last encoded column, as it did before.
# NOTE(review): the encoders are fit on all rows (before the split) so that a
# category appearing only in the test rows still receives a code.
label_encoders = {}
for col in X.select_dtypes(include=['object']):
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Splitting into training and testing BEFORE scaling, so that the scaler's
# mean/variance come from the training rows only. Fitting the scaler on the
# full matrix would leak test-set statistics into training. The split depends
# only on the number of rows and random_state, so the same rows are selected.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: fit on the training rows, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix on the same (train-fitted) scale, kept under its
# original name for any code that still refers to it.
X_scaled = scaler.transform(X)
# Model definition: the fixed seed keeps the search and the final model
# reproducible.
model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning with GridSearchCV: a small grid over forest size and
# tree depth, scored by 5-fold cross-validated accuracy.
# NOTE(review): accuracy can look high on an imbalanced target; consider a
# class-aware metric (e.g. scoring='f1') -- confirm against the analysis goals.
param_grid = {'n_estimators': [100, 200], 'max_depth': [5, 10]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so the best parameter combination
# has already been refit on the whole training set; best_estimator_ is ready
# to use and a second fit would only repeat that work.
model = grid_search.best_estimator_
# Score the tuned model on the held-out test rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(report)

Accuracy: 0.8619941794724452

Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     46118
           1       0.82      0.06      0.12      7829

    accuracy                           0.86     53947
   macro avg       0.84      0.53      0.52     53947
weighted avg       0.86      0.86      0.81     53947



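The accuracy score is the proportion of correct predictions the model makes on the test data. The classification report gives a more detailed breakdown of the model’s performance, with precision, recall, F1-score and support for each class (“No Medal Win” = 0 and “Medal Win” = 1). Note that the 86% accuracy is only slightly above the roughly 85.5% obtained by always predicting “No Medal Win” (46,118 of the 53,947 test rows): recall for the medal class is just 0.06, so the model identifies very few of the actual medal winners.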

This code provides a framework for building a machine learning model that uses a Random Forest Classifier to predict medal winners from Olympic data. It carries out the necessary data preparation, trains and tunes the model, and then assesses its performance.