## Package imports

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import resample
import pandas as pd

## Creating dataset

In [None]:
# Read the two class-specific sheets from the same workbook:
# "Datanorisk" -> df_no_risk, "Datarisk" -> df_risk
df_no_risk, df_risk = (
    pd.read_excel("modified_data.xlsx", sheet_name=sheet)
    for sheet in ("Datanorisk", "Datarisk")
)

## Creating sequences

In [None]:
# Preprocessing the data
def create_sequence_and_label(df, label, seq_len=3):
    """Flatten consecutive, non-overlapping groups of rows into labelled samples.

    The frame is walked in steps of ``seq_len`` rows; each group of ``seq_len``
    rows is flattened row-by-row into a single feature vector, and every
    resulting vector receives the same ``label``.

    Only complete groups are kept. If ``len(df)`` is not a multiple of
    ``seq_len``, the trailing rows are dropped, because a shorter final group
    would otherwise be padded with NaN and become a corrupt sample.

    Args:
        df: Source DataFrame; rows are consumed in their existing order.
        label: Value written to the ``label`` column of every output row.
        seq_len: Number of consecutive rows per sequence (default 3).

    Returns:
        DataFrame with ``seq_len * df.shape[1]`` positional feature columns
        (0, 1, 2, ...) followed by a ``label`` column. It has
        ``len(df) // seq_len`` rows, possibly zero.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be >= 1")

    n_complete = len(df) // seq_len  # number of full windows available
    sequences = [
        df.iloc[start:start + seq_len].values.flatten()
        for start in range(0, n_complete * seq_len, seq_len)
    ]

    # Fix the width explicitly so an empty result still has the expected
    # feature columns and can be renamed downstream.
    n_features = seq_len * df.shape[1]
    return pd.DataFrame(sequences, columns=range(n_features)).assign(label=label)

# Build the flattened 3-row sequence tables: label 0 for the no-risk sheet,
# label 1 for the risk sheet
df1 = create_sequence_and_label(df_no_risk, label=0)
df2 = create_sequence_and_label(df_risk, label=1)

# Column names of the source sheet, reused for every row position in a sequence
original_cols = df_no_risk.columns.tolist()

# First row keeps the plain names; the second and third rows get a _2 / _3
# suffix; the label column comes last
column_names = [
    *original_cols,
    *(f"{col}_2" for col in original_cols),
    *(f"{col}_3" for col in original_cols),
    "label",
]

# Apply the same header to both class tables
for frame in (df1, df2):
    frame.columns = column_names

## Oversampling and creating train-test sets

In [None]:
# Combine both classes BEFORE any resampling. Oversampling with replacement
# prior to the split would place copies of the same minority sequence in both
# the training and the test set (data leakage), inflating the test metrics.
df_all = pd.concat([df1, df2], ignore_index=True)

# Creating X and y variables for the model
X = df_all.drop('label', axis=1)
y = df_all['label']

# Splitting the dataset into training and testing sets.
# Stratify so the (unbalanced) class ratio is preserved in both partitions;
# the test set contains only original, non-duplicated sequences.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2, # 20% for testing
                                                    random_state=42,
                                                    stratify=y
                                                    )

# Balancing the TRAINING set only
train_set = X_train.assign(label=y_train)
train_no_risk = train_set[train_set['label'] == 0]
train_risk = train_set[train_set['label'] == 1]

# Oversample label 1 up to the size of label 0 in the training partition.
# NOTE(review): assumes label 1 is the minority class — confirm against the data.
df2_oversampling = resample(train_risk, # Resampling label 1
                            replace=True,
                            n_samples=len(train_no_risk), # Match the majority class size
                            random_state=42)  # Random state for reproducibility

# Concatenate both training classes and shuffle
df_balanced = (
    pd.concat([train_no_risk, df2_oversampling])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Balanced training features / target used to fit the models
X_train = df_balanced.drop('label', axis=1)
y_train = df_balanced['label']

## Modeling

In [None]:
# Creating the Random Forest model with hyperparameter tuning
param_dist = {
    'n_estimators': randint(10, 1000), # Random integer in [10, 1000) trees (upper bound is exclusive)
    'max_depth': [None, 10, 20, 30, 50], # Randomly choose between None and various depths
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4], # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2', None], # Number of features to consider when looking for the best split
    'bootstrap': [True, False], # Whether bootstrap samples are used when building trees
    'class_weight': [None, 'balanced'] # Weights associated with classes in the form {class_label: weight}
}

# RandomizedSearchCV for hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)

# NOTE(review): if X_train contains oversampled (duplicated) rows, the same
# sequence can appear in several CV folds, so best_score_ is likely optimistic.
rand_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of different combinations to try
    cv=5, # 5-fold cross-validation
    verbose=1,
    n_jobs=-1,
    scoring='f1',  # Use F1 score as the scoring metric
    random_state=42  # Seed the parameter sampling so the search is reproducible
)

# Fit the model
rand_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best params:", rand_search.best_params_)
print("Best score:", rand_search.best_score_)

In [None]:
# Build the Random Forest with fixed hyperparameters and train it in one step
# (fit returns the fitted estimator itself)
rf_model = RandomForestClassifier(
    n_estimators=18,
    max_depth=30,
    max_features="log2",
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight=None,
    random_state=42,  # fixed seed for reproducibility
).fit(X_train, y_train)

# Predict the held-out test samples
y_pred = rf_model.predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

In [None]:
# Feature importance
# The fitted forest exposes its importances as `feature_importances_` (plural);
# `feature_importance_` does not exist and raises AttributeError.
importance = rf_model.feature_importances_
print("Feature Importance:", importance)

# One row per feature, most important first
df_feat = pd.DataFrame({'feature': X.columns, 'importance': importance}).sort_values('importance', ascending=False)
df_feat