In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the input table from disk. Judging by the column names used below it
# holds gene-level measurements, a methylation status and clinical
# covariates -- NOTE(review): confirm against the CSV schema.
data = pd.read_csv('fshd_data.csv')

# Name of the response column and of the predictor columns.
target = 'DUX4_fl_expression'
features = [
    'gene1',
    'gene2',
    'gene3',
    'methylation_status',
    'age',
    'gender',
    'initial_clinical_score',
]

# Response vector, then the design matrix. 'gender' is expanded into
# indicator columns; drop_first=True omits the first level so the remaining
# indicators are not redundant with each other.
y = data[target]
X = pd.get_dummies(data[features], columns=['gender'], drop_first=True)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build a 100-tree random forest and fit it on the training partition
# (fit() returns the estimator itself, so the call can be chained).
regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Score the held-out rows and report the mean squared error.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Pair each encoded column with the importance reported by the fitted model,
# rank from most to least important, and show at most the ten highest.
importances = regressor.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(10))
