In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
import numpy as np

In [3]:
# Load the dataset
file_path = 'new.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.iloc[:, 1:]  # Exclude the first column which is the target
y = df.iloc[:, 0]   # The first column is the target

# Split the data into training and validation sets
train_size = int(len(df) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Further split the training data for time series cross-validation
train_subset_size = int(len(X_train) * 0.5)
# Initialize the Random Forest model
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialize an empty list to store evaluation metrics for each iteration
results_rf = []

# Loop through the training data, expanding the training set and using the next 40 rows as the validation set
for i in range(train_subset_size, len(X_train) - 80, 80):
    # Define the current training and validation subsets
    X_train_current = X_train.iloc[:i]
    y_train_current = y_train.iloc[:i]
    X_val_current = X_train.iloc[i:i+80]
    y_val_current = y_train.iloc[i:i+80]
    
    # Fit the Random Forest model on the current training subset
    model_rf.fit(X_train_current, y_train_current)
    
    # Make predictions on the current validation subset
    y_pred_val_current = model_rf.predict(X_val_current)
    
    # # Adjust predictions for short selling
    # y_pred_val_current = np.where(y_pred_val_current < 0, -y_pred_val_current, y_pred_val_current)
    
    # Calcula te R2 statistic, Sharpe ratio, and accuracy on the current validation subset
    r2_current = r2_score(y_val_current, y_pred_val_current)
    print(y_val_current)
    print('------')
    print(y_pred_val_current)
    sharpe_ratio_current = np.mean(np.abs(y_pred_val_current)) / np.std(y_pred_val_current)
    accuracy_current = accuracy_score((y_val_current > 0).astype(int), (y_pred_val_current > 0).astype(int))
    mean_squared_error_current = mean_squared_error(y_val_current, y_pred_val_current)

    # Append the metrics to the results list
    results_rf.append({
        'Train Size': i,
        'R2 Statistic': r2_current,
        'Sharpe Ratio': sharpe_ratio_current,
        'Accuracy': accuracy_current,
        'Mean Squared Error': mean_squared_error_current
    })
    break

# Convert the results list to a DataFrame for easier viewing
results_rf_df = pd.DataFrame(results_rf)

# Display the results
results_rf_df



796    0.000316
797   -0.011055
798   -0.013095
799   -0.003236
800   -0.003896
         ...   
871   -0.009444
872    0.001021
873   -0.008503
874   -0.001715
875   -0.019588
Name: label, Length: 80, dtype: float64
------
[1.61183741e-03 3.06906538e-03 3.54473247e-03 9.76300204e-03
 7.50884440e-04 8.59920610e-04 9.16731650e-04 1.75408360e-04
 2.43845482e-03 5.20461090e-04 1.96837233e-03 2.79944610e-04
 3.41790763e-03 3.24606665e-03 2.41918952e-03 4.06762166e-03
 2.91230250e-04 2.19367192e-03 2.12564980e-03 1.78876609e-03
 1.28212719e-03 8.70448090e-04 8.15396130e-04 1.80087300e-04
 8.16784470e-04 2.47382794e-03 4.49745608e-03 4.67708000e-06
 3.48550410e-04 9.78841640e-04 3.63504090e-04 1.72844503e-03
 9.72929640e-04 4.72085130e-04 2.45895672e-03 7.34443520e-04
 2.44588754e-03 1.23889745e-03 9.05901270e-04 4.35818370e-04
 5.61911320e-04 1.01670811e-03 3.66464110e-04 2.49839070e-04
 3.64808940e-04 7.09644120e-04 2.51409224e-03 3.59586529e-03
 5.01848920e-04 1.30283067e-03 2.03301593e-03

Unnamed: 0,Train Size,R2 Statistic,Sharpe Ratio,Accuracy,Mean Squared Error
0,796,-0.299276,1.18713,0.425,6.4e-05
