# Use hand rankings to train an ML model that "ranks" markets

In [None]:
# Load the three inputs for this notebook: the hand-ranked markets sheet,
# the labeled markets table, and the per-market relevance scores.
import sqlite3
from contextlib import closing

import pandas as pd

# these are hand ranked markets
df_ranked = pd.read_excel("../../Desktop/random_20_markets.xlsx", header=1)

# Share one connection for both queries and close it when done.
# closing() is required: sqlite3's own `with conn:` only scopes a transaction
# and leaves the connection open.
with closing(sqlite3.connect('../../indexcast-backend/indexcast.db')) as conn:
    # get markets that are labeled
    df = pd.read_sql("SELECT * FROM markets WHERE id IN (SELECT market_id FROM market_labels)", con=conn)

    # get the scores
    market_relevance_scores = pd.read_sql("SELECT * FROM market_relevance_scores", con=conn)



In [None]:
# Human-readable names for score types, listed in score_type_id order (ids start at 1).
score_type_names = [
    'volume_total',
    'volume_24h_y',
    'volume_144h',
    'num_traders',
    'num_comments',
    'temporal_relevance',
    'geographical_relevance',
    'index_question_relevance',
]
score_labels = dict(enumerate(score_type_names, start=1))

# Attach the label for each row's score_type_id; ids missing from score_labels become NaN.
market_relevance_scores['score_label'] = market_relevance_scores['score_type_id'].map(score_labels)


In [3]:

# Reshape the long score table to one row per market and one column per score label.
# pivot_table aggregates repeated (market_id, score_label) pairs with its default aggfunc (mean).
scores_pivot = (
    market_relevance_scores
    .pivot_table(values='score_value', index='market_id', columns='score_label')
    .reset_index()
)

# Left-join the scores onto the markets so markets without any score rows are kept
# (their score columns are NaN), then drop the now-redundant join key.
markets_with_scores = (
    df.merge(scores_pivot, how='left', left_on='id', right_on='market_id')
      .drop(columns='market_id')
)

In [None]:
# select only the relevant columns from markets_with_scores
relevance_cols = ['question', 'geographical_relevance', 'index_question_relevance', 'temporal_relevance']
score_cols = [col for col in relevance_cols if col != 'question']

# Keep a single row per question (the first one encountered) so the left merge
# below cannot fan one hand-ranked market out into several training rows.
relevance_data = markets_with_scores[relevance_cols].drop_duplicates(subset='question')

# merge with df_ranked using the "question" column.
# Score columns left behind by a previous run of this cell are dropped first;
# otherwise re-running would produce `_x` / `_y` suffixed duplicates and the
# plain column names used for training would disappear.
df_ranked = df_ranked.drop(columns=score_cols, errors='ignore').merge(
    relevance_data,
    on='question',
    how='left'
)


In [None]:
# Keep an untouched copy of the hand ranks the first time this cell runs.
# Without it, the inversion below reads its own output on a re-run and flips
# the scores back to the original ordering.
if 'hand_rank' not in df_ranked.columns:
    df_ranked['hand_rank'] = df_ranked['Rankings']

# higher score = more relevant
df_ranked['Rankings'] = df_ranked['hand_rank'].max() - df_ranked['hand_rank'] + 1


In [None]:
# Rename the sheet's 'volume_24h' column to 'volume_24h_y', the label score_labels
# uses for the 24h volume score. NOTE(review): presumably done so the two tables
# share a column name; confirm. A missing 'volume_24h' column is silently ignored.
volume_column_renames = {'volume_24h': 'volume_24h_y'}
df_ranked.rename(columns=volume_column_renames, inplace=True)

In [19]:
import joblib
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Relevance scores used as model inputs; the column order here is the order the
# fitted model expects at prediction time.
features = [
    'temporal_relevance', 'geographical_relevance', 'index_question_relevance'
]

# The relevance columns were attached to df_ranked with a left merge, so a ranked
# market whose question found no match carries NaN features. MLPRegressor rejects
# NaN input, so train only on rows with every feature and the target present.
train_rows = df_ranked.dropna(subset=features + ['Rankings'])
X_train = train_rows[features]
y_train = train_rows['Rankings']

# Standardize the features, then regress the (inverted) hand ranking with a small MLP.
# random_state is fixed so repeated runs produce the same fitted model.
model = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        hidden_layer_sizes=(64, 32),
        activation='tanh',
        alpha=0.001,
        learning_rate='adaptive',
        max_iter=10000,
        random_state=42,
    )
)

model.fit(X_train, y_train)

# dump to add to the index cast backend
joblib.dump(model, 'pre-prompting-relevance-filter.joblib')


['pre-prompting-relevance-filter.joblib']

# testing

In [18]:
# Score every labeled market with the fitted model and list them best-first.
# markets_with_scores comes from a left merge, so markets without score rows have
# NaN features; the model cannot score those, so they keep a NaN predicted_score
# (sorted to the end) instead of making predict() raise.
has_all_features = markets_with_scores[features].notna().all(axis=1)
markets_with_scores['predicted_score'] = float('nan')
if has_all_features.any():
    markets_with_scores.loc[has_all_features, 'predicted_score'] = model.predict(
        markets_with_scores.loc[has_all_features, features]
    )

markets_with_scores = markets_with_scores.sort_values(by='predicted_score', ascending=False)

# f-string formatting also copes with a missing (non-string) question value,
# which plain string concatenation would reject.
for question, predicted_score in zip(markets_with_scores['question'], markets_with_scores['predicted_score']):
    print(f"{question}\n{predicted_score}")


Will there be 1000 or more human cases of H5N1 in the US by the end of 2025?
20.111665516858938
Will 1000 herds of livestock be infected by H5N1 by the end of 2025 in the USA?
20.026651793916486
H5N1 pandemic occurs before 2026, and the case fatality rate is <5%
19.989242612170006
Will there be more than 1,000 confirmed human cases of H5N1 bird flu in the US by the end of 2025?
19.979603442358567
Will there be a "large-scale bird flu outbreak" (100+ human confirmed H5N1 cases) in the US by the end of 2025?
19.952520627190356
Will the CDC  announce that an epidemic of H5N1 "swine flu" exists anywhere in the United States  during 2025?
19.865704958799615
Will there be 10k or more human cases of H5N1 in the US by the end of 2025?
19.583294105703413
Will the CDC announce that a "bird flu" (H5N1) epidemic  exists anywhere in ther US during 2025?
19.461672107092177
Bird flu added to the WHO's list of emergencies this year?
19.42011352168088
Will there be 100 or more human cases of H5N1 in th

It looks like all relevant markets are scored 13 or higher. To give us a buffer, I propose that markets scoring greater than 12 be eligible for rule generation.

In [14]:
import numpy as np

# Score one hypothetical market. The values are given in the same order as
# `features` and wrapped in a DataFrame with those column names: the pipeline
# was fitted on a named DataFrame, so a bare ndarray would trigger sklearn's
# "X does not have valid feature names" warning and hide the column order.
market_features = pd.DataFrame(np.array([[0.3, 0.42, 0.1]]), columns=features)

# Predict the ranking
predicted_ranking = model.predict(market_features)
print(f"Predicted ranking: {predicted_ranking[0]}")

Predicted ranking: 7.401685758375199


