# Observed Markov Models for new pipeline

Translated from R as written here: https://github.com/prio-data/viewsforecasting/tree/main/Tools/new_markov

## 0. Imports

In [1]:
import pandas as pd
import importlib


import auxiliaries
importlib.reload(auxiliaries)
from auxiliaries import *

## 1. Import test dataset(s)

In [2]:
data = pd.read_parquet("data/testdataset.parquet")
data.reset_index(inplace=True)

## 2. Preprocess data

Variables:

In [3]:
# random seed
random_seed = 42

# data format parameters
target_column = "ged_sb"

# model parameters
EndOfHistory = 549  # September 2025
model_type = "rf"   # other option is "glm"
loa = "cm"          # other option is "pgm"

Filter out months later than current time

In [5]:
data = data[data["month_id"] <= EndOfHistory]

In [None]:
id_column = get_id_column(loa)

data = data.sort_values(by=["month_id", id_column]).groupby(id_column).head()

# shift target column by 1 to create "last month" feature

data["target_last_month"] = data[target_column].shift(1)

In [10]:
data.head()

Unnamed: 0,month_id,country_id,vdem_v2x_edcomp_thick,vdem_v2x_egal,vdem_v2x_execorr,vdem_v2x_frassoc_thick,vdem_v2x_gencs,vdem_v2x_gender,vdem_v2x_genpp,vdem_v2x_horacc,...,topic_ste_theta7_stock_t1_splag,topic_ste_theta8_stock_t1_splag,topic_ste_theta9_stock_t1_splag,topic_ste_theta10_stock_t1_splag,topic_ste_theta11_stock_t1_splag,topic_ste_theta12_stock_t1_splag,topic_ste_theta13_stock_t1_splag,topic_ste_theta14_stock_t1_splag,ged_sb,target_last_month
0,1,1,0.353,0.627,0.822,0.601,0.604,0.736,0.784,-0.683,...,0.123509,0.006294,0.005473,0.760456,0.011488,0.215711,0.061809,0.136928,,
1,1,2,0.288,0.632,0.253,0.535,0.598,0.603,0.512,0.26,...,0.073758,0.009374,0.009908,0.348538,0.020794,0.276371,0.042777,0.133248,,
2,1,3,0.767,0.639,0.172,0.887,0.466,0.639,0.608,1.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,1,4,0.793,0.625,0.483,0.904,0.831,0.722,0.481,1.02,...,0.223222,0.012155,0.017388,0.584386,0.057243,0.303166,0.079764,0.208623,,
4,1,5,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [12]:
# compute markov statex§
def get_markov_state(current_month, last_month):

    if current_month == 0:
        if last_month == 0:
            return "peace"
        elif last_month > 0:
            return "desc"
    elif current_month > 0:
        if last_month == 0:
            return "esc"
        elif last_month > 0:
            return "war"
    
    return "invalid"

In [17]:
# drop na values in ged_sb and target_last_month
data = data.dropna(subset=[target_column, "target_last_month"]).copy()

In [18]:
# map markov state function to data and get markov state column
data["markov_state"] = data.apply(lambda row: get_markov_state(row[target_column], row["target_last_month"]), axis=1)

In [19]:
data.head()

Unnamed: 0,month_id,country_id,vdem_v2x_edcomp_thick,vdem_v2x_egal,vdem_v2x_execorr,vdem_v2x_frassoc_thick,vdem_v2x_gencs,vdem_v2x_gender,vdem_v2x_genpp,vdem_v2x_horacc,...,topic_ste_theta8_stock_t1_splag,topic_ste_theta9_stock_t1_splag,topic_ste_theta10_stock_t1_splag,topic_ste_theta11_stock_t1_splag,topic_ste_theta12_stock_t1_splag,topic_ste_theta13_stock_t1_splag,topic_ste_theta14_stock_t1_splag,ged_sb,target_last_month,markov_state
20202,123,170,0.583,0.581,0.299,0.883,0.651,0.628,0.585,0.404,...,0.040354,0.051542,0.756323,0.246863,0.793047,0.319971,0.710248,0.0,0.0,peace
20364,124,163,0.255,0.079,0.393,0.459,0.231,0.251,0.336,0.267,...,0.060369,0.062717,0.694311,0.22973,0.468262,0.363676,0.67044,0.0,0.0,peace
20371,124,170,0.583,0.581,0.299,0.883,0.651,0.628,0.585,0.404,...,0.046185,0.064578,0.685865,0.26969,0.754156,0.340773,0.699484,0.0,0.0,peace
20532,125,163,0.255,0.079,0.393,0.459,0.231,0.251,0.336,0.267,...,0.060279,0.063939,0.674117,0.22116,0.599789,0.337942,0.605398,0.0,0.0,peace
20539,125,170,0.583,0.581,0.299,0.883,0.651,0.628,0.585,0.404,...,0.042028,0.060068,0.741979,0.259821,0.818083,0.313609,0.632737,0.0,0.0,peace


209