In [2]:
import numpy as np
import pandas as pd
import time

import matplotlib.pyplot as plt

In [3]:
import sys
sys.path.insert(0, 'ConvexHull')

In [4]:
from Optimizers.SampleWeighting import CauchySimplex, EGD, PairwiseFrankWolfe
from Optimizers.SampleWeighting.Distributions import TruncatedGaussian
from Optimizers.SampleWeighting.SampleWeighting import SampleWeighting

In [5]:
def generate_student_scores(num_students=100, num_questions=100, 
                            easy_question_proportion=0.8, smart_student_proportion=0.6):
    # Question difficulty
    q_values = np.zeros(num_questions)
    n = int(num_questions * easy_question_proportion)

    q_values[:n] = np.random.rand(n) / 4 + 0.75
    q_values[n:] = np.random.rand(num_questions - n) / 3

    # Student smartness
    p_values = np.zeros(num_students)
    n = int(num_students * smart_student_proportion)

    p_values[:n] = np.random.rand(n) / 5 + 0.6
    p_values[n:] = np.random.rand(num_students - n) / 3 + 0.5

    return np.random.binomial(1, p_values[:, None] * q_values[None, :], size=(num_students, num_questions))

In [6]:
def test_optimizer(optimizer, w, max_iter=1_000, search_args=None, max_time=None):
    if search_args is None:
        search_args = {}
    
    max_time = max_time if max_time is not None else np.inf

    i = 0 
    start = time.time()
    while i < max_iter and time.time() - start < max_time:
        w = optimizer.search(w, **search_args)
        i += 1
        
    end = time.time()

    return i, end - start, optimizer.f(w)

In [14]:
num_trials = 25

max_iter = 200
max_time = np.inf  # seconds

num_students = 100
num_questions = 100

integration_points = np.linspace(0, 1, 401)
target_distribution = TruncatedGaussian(0.7, 0.1, 0, 1)

In [15]:
cs_results = pd.DataFrame(0.0, index=range(num_trials), columns=['Iterations', 'Time (sec)', 'Distance'])
egd_results = pd.DataFrame(0.0, index=range(num_trials), columns=['Iterations', 'Time (sec)', 'Distance'])
pfw_results = pd.DataFrame(0.0, index=range(num_trials), columns=['Iterations', 'Time (sec)', 'Distance'])

for i in range(num_trials):
    np.random.seed(i)

    samples = generate_student_scores(num_students=num_students, num_questions=num_questions)
    _, n = samples.shape

    # Test Cauchy Simplex
    optimizer = CauchySimplex(samples, integration_points, target_distribution, e=0.05)
    w = np.ones(n) / n
    cs_results.iloc[i, :] = test_optimizer(optimizer, w, max_iter=max_iter, 
                                           search_args={'gamma': 1}, max_time=max_time)

    # Test EGD
    optimizer = EGD(samples, integration_points, target_distribution, e=0.05)
    w = np.ones(n) / n
    egd_results.iloc[i, :] = test_optimizer(optimizer, w, max_iter=max_iter, 
                                            search_args={'step_size': 10}, max_time=max_time)

    # Test PFW
    optimizer = PairwiseFrankWolfe(samples, integration_points, target_distribution, e=0.05)
    w = np.ones(n) / n
    pfw_results.iloc[i, :] = test_optimizer(optimizer, w, max_iter=max_iter, 
                                            search_args={}, max_time=max_time)

    cs_results.to_csv(f"cs.csv")
    egd_results.to_csv(f"egd.csv")
    pfw_results.to_csv(f"pfw.csv")

print("FINISHED")

FINISHED


In [16]:
import os

files = [x for x in os.listdir() if x[-3:] == 'csv']
dfs = []

for x in files:
    df = pd.read_csv(x, index_col=0)
    df['Optimizer'] = x.split('.')[0]
    
    dfs.append(df)
    
df = pd.concat(dfs)

In [17]:
grouped_distances = df.groupby("Optimizer")["Distance"]
# grouped_distances = df.groupby("Optimizer")["Time (sec)"]
optimizers = df['Optimizer'].unique()

distances = pd.concat([grouped_distances.get_group(x) for x in optimizers], axis=1)
distances.columns = optimizers

distances.head(10)

Unnamed: 0,pfw,cs,egd
0,0.004424,0.001794,0.260145
1,0.011773,0.010529,0.180865
2,0.052634,0.052777,0.052822
3,0.035278,0.035303,0.035337
4,0.021986,0.021948,0.022098
5,0.017548,0.017416,0.017428
6,0.058138,0.05822,0.058238
7,0.034463,0.034678,0.034529
8,0.008163,0.007002,0.188013
9,0.011971,0.009048,0.18528


In [18]:
labels = distances.columns
pd.DataFrame(labels[np.argmin(distances.values, axis=1)]).value_counts()

pfw    12
cs     11
egd     2
dtype: int64

In [20]:
labels = distances.columns
pd.DataFrame(labels[np.argmax(distances.values, axis=1)]).value_counts()

egd    20
pfw     3
cs      2
dtype: int64