## Package imports

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## Loading the dataset

In [None]:
# Read the two class sheets from the same workbook, one DataFrame per sheet
df_no_risk, df_risk = (
    pd.read_excel("modified_data.xlsx", sheet_name=sheet)
    for sheet in ("Datanorisk", "Datarisk")
)

## Creating sequences

In [None]:
# Preprocessing the data
def create_sequence_and_label(df, label, seq_len=3):
    """Flatten consecutive, non-overlapping groups of rows into labelled sequences.

    Rows are taken ``seq_len`` at a time, in order, and each group is
    flattened row by row into a single feature row. Only complete groups are
    kept: trailing rows that cannot fill a whole group are dropped (with a
    warning), because a short group would produce a ragged row that pandas
    pads with NaN.

    Args:
        df: DataFrame whose rows are grouped into sequences.
        label: Value written to the ``label`` column of every sequence.
        seq_len: Number of rows per sequence. Defaults to 3.

    Returns:
        DataFrame with one row per complete sequence, positional feature
        columns, and a trailing ``label`` column.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    n_complete = len(df) // seq_len
    leftover = len(df) - n_complete * seq_len
    if leftover:
        # Make the data loss visible instead of silently discarding rows
        import warnings

        warnings.warn(
            f"Dropping {leftover} trailing row(s) that do not fill a "
            f"{seq_len}-row sequence",
            stacklevel=2,
        )

    # One flattened array per complete window of seq_len rows
    sequences = [
        df.iloc[start:start + seq_len].values.flatten()
        for start in range(0, n_complete * seq_len, seq_len)
    ]
    return pd.DataFrame(sequences).assign(label=label)

# One row per 3-day sequence for each class: label 0 for the no-risk sheet,
# label 1 for the risk sheet
df1 = create_sequence_and_label(df_no_risk, label=0)
df2 = create_sequence_and_label(df_risk, label=1)

# Header of the raw sheet, reused to name the flattened features
original_cols = df_no_risk.columns.tolist()

# The first block of columns keeps the raw names, the second and third blocks
# get a "_2" / "_3" suffix, and the label column comes last
column_names = [
    *original_cols,
    *(f"{col}_{day}" for day in (2, 3) for col in original_cols),
    "label",
]
df1.columns = column_names
df2.columns = column_names

## Oversampling and creating train-test sets

In [None]:
# Combine both classes WITHOUT oversampling yet. Resampling with replacement
# before the split would put copies of the same risk sequence in both the
# training and the test set, which leaks test data into training and inflates
# the reported metrics.
df_all = pd.concat([df1, df2], ignore_index=True)

# Creating X and y variables for the model (original, un-resampled data)
X = df_all.drop('label', axis=1)
y = df_all['label']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,  # 30% for testing
    random_state=42,
    stratify=y,  # keep the class proportions identical in both partitions
)

# Balancing the training set only; the test set keeps the real distribution
train_df = X_train.assign(label=y_train)
train_no_risk = train_df[train_df['label'] == 0]
train_risk = train_df[train_df['label'] == 1]

df2_oversampling = resample(train_risk,  # Resampling label 1
                            replace=True,
                            # 70% of the previous 1000: the share of oversampled
                            # rows that used to land in the training partition
                            n_samples=700,
                            random_state=42)  # Random state for reproducibility

# Concatenate both classes and shuffle the balanced training set
df_balanced = pd.concat([train_no_risk, df2_oversampling])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Training features/target now come from the balanced training data
X_train = df_balanced.drop('label', axis=1)
y_train = df_balanced['label']

## Modeling

In [None]:
# Random Forest with 100 trees and a fixed seed for reproducibility.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the test features
y_pred = rf_model.predict(X_test)

# Overall accuracy first, then the classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Importance scores exposed by the fitted forest
importances = rf_model.feature_importances_
print("Feature Importances:", importances)