## Testing with any UCDP data

(Inputs are the data file and the window size.)

In [35]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Input CSV path; it is relative, so it resolves against the current working directory.
filename = '../../data/ucdp/somalia.csv'
# Number of consecutive observations per sliding window
# (presumably passed as `window_size` further down; confirm at the call site).
WINDOW_SIZE = 6



In [1]:
def create_window_with_time(data, timestamps, window_size):
    """Build sliding-window samples together with the timestamp of each target.

    Every window of ``window_size`` consecutive values of ``data`` becomes one
    feature row; the value immediately after the window is its target, and the
    entry of ``timestamps`` at that same position is recorded with it.

    Args:
        data: Positionally indexable series (accessed through ``.iloc``) of values.
        timestamps: Positionally indexable series (accessed through ``.iloc``),
            aligned with ``data``.
        window_size: Number of consecutive values in each feature row.

    Returns:
        A tuple ``(X, y, y_timestamps)`` of NumPy arrays, each holding
        ``len(data) - window_size`` entries (none when that is not positive).
    """
    # One window per start offset that still leaves a following value to predict.
    starts = range(len(data) - window_size)

    features = [data.iloc[start:start + window_size].values for start in starts]
    targets = [data.iloc[start + window_size] for start in starts]
    target_times = [timestamps.iloc[start + window_size] for start in starts]

    return np.array(features), np.array(targets), np.array(target_times)


In [2]:

class WindowRandomForest:
    """Random-forest regressor trained on sliding windows of one CSV column.

    The CSV is expected to provide a ``date_start`` column (used for ordering
    and for timestamps) and a ``best`` column (the series being predicted).

    Typical use::

        model = WindowRandomForest(filename, WINDOW_SIZE)
        model.load_data()
        mse = model.compute_MSE()
        model.visualize_predictions()

    Attributes set by ``compute_MSE`` (``None`` until then): ``X_test``,
    ``y_test``, ``y_timestamps_test`` and ``predictions``.
    """

    def __init__(self, filename, window_size, timezone='Africa/Juba'):
        """Store the configuration; no I/O happens here.

        Args:
            filename: Path (or file-like object) handed to ``pd.read_csv``.
            window_size: Number of consecutive observations per feature row.
            timezone: Zone in which the naive ``date_start`` values are
                interpreted before conversion to UTC. The default keeps the
                previously hard-coded value.
                NOTE(review): 'Africa/Juba' is South Sudan's zone; other
                datasets probably want their own zone -- confirm.
        """
        self.window_size = window_size
        self.filename = filename
        self.timezone = timezone
        self.data = None
        self.predictions = None
        self.X_test = None
        self.y_test = None
        self.y_timestamps_test = None

    @staticmethod
    def _tick_step(n_points):
        """Return the stride between labelled x-ticks; never less than 1."""
        # n_points // 30 is 0 for fewer than 30 points and a zero slice step
        # raises ValueError, so clamp to 1.
        return max(1, n_points // 30)

    def load_data(self):
        """Read the CSV into ``self.data`` and order the rows by ``date_start``."""
        self.data = pd.read_csv(self.filename)
        self.data.sort_values(by='date_start', inplace=True)

    def compute_MSE(self):
        """Train on windowed data and return the test-set mean squared error.

        Loads the data first if ``load_data`` has not been called yet. The
        held-out inputs, targets, target timestamps and predictions are kept
        on the instance for ``visualize_predictions``.

        Returns:
            Mean squared error between held-out targets and predictions.
        """
        if self.data is None:
            self.load_data()

        # Interpret the naive dates in the configured zone, then normalise to UTC.
        times = (
            pd.to_datetime(self.data['date_start'])
            .dt.tz_localize(self.timezone)
            .dt.tz_convert('UTC')
        )

        X, y, y_timestamps = create_window_with_time(self.data['best'], times, self.window_size)

        # NOTE(review): train_test_split shuffles by default, so overlapping
        # windows end up on both sides of the split and the reported MSE is
        # likely optimistic for forecasting. Left unchanged to keep results
        # comparable; consider a chronological split.
        (X_train, self.X_test,
         y_train, self.y_test,
         _, self.y_timestamps_test) = train_test_split(
            X, y, y_timestamps, test_size=0.2, random_state=42)

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        self.predictions = model.predict(self.X_test)

        return mean_squared_error(self.y_test, self.predictions)

    def visualize_predictions(self):
        """Plot held-out targets against predictions in chronological order.

        Raises:
            RuntimeError: If ``compute_MSE`` has not been run yet.
        """
        if self.predictions is None:
            raise RuntimeError('compute_MSE() must be called before visualize_predictions()')

        # The split shuffles the samples, so restore time order for plotting.
        timestamps = np.array(self.y_timestamps_test)
        order = np.argsort(timestamps)

        y_test_sorted = self.y_test[order]
        predictions_sorted = self.predictions[order]
        labels = [stamp.strftime('%Y-%m-%d') for stamp in timestamps[order]]

        plt.figure(figsize=(10, 6))

        positions = range(len(y_test_sorted))
        plt.plot(positions, y_test_sorted, label='Actual', marker='o')
        plt.plot(positions, predictions_sorted, label='Predicted', marker='x')

        plt.title('Actual vs Predicted Values')
        plt.xlabel('Timestamp')
        plt.ylabel('Value')
        plt.legend()

        # Label only every step-th point so the dates stay readable.
        step = self._tick_step(len(positions))
        plt.xticks(positions[::step], labels[::step], rotation=45, ha="right")

        plt.tight_layout()
        plt.show()
