In [1]:
# Fetch the ConvexHull repository into ./ConvexHull; later cells add that
# directory to sys.path and import the `Optimizers` package from it.
# NOTE(review): this clones whatever the default branch currently points at, so
# re-running later may pull different code -- consider pinning a commit or tag.
!git clone https://github.com/infamoussoap/ConvexHull.git

Cloning into 'ConvexHull'...
remote: Enumerating objects: 636, done.
remote: Counting objects: 100% (208/208), done.
remote: Compressing objects: 100% (137/137), done.
remote: Total 636 (delta 112), reused 167 (delta 71), pack-reused 428
Receiving objects: 100% (636/636), 816.82 KiB | 7.56 MiB/s, done.
Resolving deltas: 100% (335/335), done.


In [2]:
import numpy as np
import pandas as pd
import time

import matplotlib.pyplot as plt

import os

In [3]:
import sys
# Put the cloned repository directory first on the module search path so that
# the `Optimizers` imports in the next cell resolve against it. The path is
# relative, so this relies on the working directory being the one the clone
# was made in.
sys.path.insert(0, 'ConvexHull')

In [4]:
from Optimizers.SampleWeighting import CauchySimplex, EGD, PairwiseFrankWolfe
from Optimizers.SampleWeighting.Distributions import TruncatedGaussian
from Optimizers.SampleWeighting.SampleWeighting import SampleWeighting

In [5]:
def theoretical_student_scores_std(p_values):
    """Return the theoretical standard deviation of the student scores.

    A student's score is the fraction of questions answered correctly, where
    the answer of student ``i`` to question ``j`` is an independent
    Bernoulli(``p_values[i, j]``) draw and the student is picked uniformly
    from the rows of ``p_values``.

    Parameters
    ----------
    p_values : array_like, shape (num_students, num_questions)
        Success probabilities, one row per student and one column per question.

    Returns
    -------
    numpy.float64
        Standard deviation of the score.

    Raises
    ------
    ValueError
        If ``p_values`` is not two-dimensional (the shape cannot be unpacked).

    Notes
    -----
    The variance is evaluated with the law of total variance,

        Var = sum(p * (1 - p)) / (S * Q**2) + Var_i(mean_j p[i, j]),

    which is algebraically identical to ``E[score**2] - E[score]**2``. It is a
    sum of non-negative terms, so rounding cannot drive it below zero and turn
    the square root into NaN. It also needs only row means, not the full
    Q x Q product ``p.T @ p``.
    """
    p_values = np.asarray(p_values, dtype=float)
    num_students, num_questions = p_values.shape

    # Average Bernoulli noise of a single student's score.
    within_student = np.sum(p_values * (1 - p_values)) / (num_students * num_questions ** 2)

    # Spread of the expected scores across students (population variance).
    between_students = np.var(np.mean(p_values, axis=1))

    return np.sqrt(within_student + between_students)


def theoretical_student_scores_mean(p_values):
    """Return the theoretical mean score: the average of every entry of ``p_values``."""
    expected_score = np.mean(p_values)
    return expected_score

In [6]:
def generate_student_scores(num_students=100, num_questions=100,
                            easy_question_proportion=0.8, smart_student_proportion=0.6,
                            *, easy_question_value=7 / 8, hard_question_value=1 / 5,
                            smart_student_value=4 / 5, other_student_value=1 / 2):
    """Build the matrix of per-student, per-question success probabilities.

    Questions are split into an "easy" leading group and a "hard" trailing
    group, students into a "smart" leading group and the rest. Entry
    ``[i, j]`` of the result is the product of student ``i``'s value and
    question ``j``'s value.

    Parameters
    ----------
    num_students, num_questions : int
        Number of rows / columns of the returned matrix.
    easy_question_proportion : float in [0, 1]
        Fraction of questions (truncated to a whole count) that are easy.
    smart_student_proportion : float in [0, 1]
        Fraction of students (truncated to a whole count) that are smart.
    easy_question_value, hard_question_value : float, keyword-only
        Factor contributed by an easy / hard question.
    smart_student_value, other_student_value : float, keyword-only
        Factor contributed by a smart / other student.

    Returns
    -------
    numpy.ndarray, shape (num_students, num_questions)
        Outer product of the student values and the question values.

    Raises
    ------
    ValueError
        If either proportion lies outside ``[0, 1]``. A negative proportion
        would otherwise make the slice count from the end of the array and
        silently produce the wrong split.
    """
    for name, proportion in (("easy_question_proportion", easy_question_proportion),
                             ("smart_student_proportion", smart_student_proportion)):
        if not 0 <= proportion <= 1:
            raise ValueError(f"{name} must be between 0 and 1, got {proportion!r}")

    # Question difficulty: the first `num_easy` questions are easy, the rest hard.
    num_easy = int(num_questions * easy_question_proportion)
    q_values = np.full(num_questions, hard_question_value, dtype=float)
    q_values[:num_easy] = easy_question_value

    # Student smartness: the first `num_smart` students are smart.
    num_smart = int(num_students * smart_student_proportion)
    s_values = np.full(num_students, other_student_value, dtype=float)
    s_values[:num_smart] = smart_student_value

    # Broadcast to a (num_students, num_questions) table of products.
    p_values = s_values[:, None] * q_values[None, :]

    return p_values

In [7]:
def test_optimizer(optimizer, w, max_iter=1_000, search_args=None, max_time=None):
    if search_args is None:
        search_args = {}
    
    max_time = max_time if max_time is not None else np.inf

    i = 0 
    start = time.time()
    while i < max_iter and time.time() - start < max_time:
        w = optimizer.search(w, **search_args)
        i += 1
        
    end = time.time()

    weighted_mean = np.mean(optimizer.data @ w)
    weighted_std = np.std(optimizer.data @ w)

    return i, end - start, optimizer.f(w), weighted_mean, weighted_std

In [8]:
# Experiment configuration: number of repeated trials and the per-trial budget
# handed to every optimizer.
num_trials, max_iter = 25, 150
max_time = np.inf  # seconds

# Size of the synthetic exam and the matching success-probability table.
num_students, num_questions = 200, 75
p_values = generate_student_scores(num_students=num_students, num_questions=num_questions)

In [9]:
# Closed-form mean and standard deviation of the scores implied by `p_values`.
std = theoretical_student_scores_std(p_values)
mean = theoretical_student_scores_mean(p_values)

print(f"Theoretical Mean = {mean:.4f}, Theoretical Std = {std:.4f}")

Theoretical Mean = 0.5032, Theoretical Std = 0.1206


In [10]:
# Target passed to every optimizer below.
# NOTE(review): the arguments look like (mean, std, lower, upper) -- confirm
# against the TruncatedGaussian signature.
target_distribution = TruncatedGaussian(0.5, 0.1, 0, 1)

# 401 evenly spaced grid points on [0, 1], endpoints included.
integration_points = np.linspace(0, 1, num=401)

In [11]:
# One row per trial; the columns follow the tuple returned by `test_optimizer`.
result_columns = ['Iterations', 'Time (sec)', 'Distance', 'Weighted Mean', 'Weighted Std']

# Short name (also the CSV stem) -> (optimizer class, kwargs forwarded to `search`).
# Insertion order is the order in which the optimizers run within each trial.
benchmarks = {
    'cs': (CauchySimplex, {'gamma': 1}),
    'egd': (EGD, {'step_size': 10}),
    'pfw': (PairwiseFrankWolfe, {}),
}

results_by_optimizer = {
    name: pd.DataFrame(0.0, index=range(num_trials), columns=result_columns)
    for name in benchmarks
}

# Keep the individual names available for any cell that refers to them.
cs_results = results_by_optimizer['cs']
egd_results = results_by_optimizer['egd']
pfw_results = results_by_optimizer['pfw']

for i in range(num_trials):
    # Seed per trial so every optimizer sees the same sample matrix and the
    # run is repeatable.
    np.random.seed(i)

    samples = np.random.binomial(1, p_values)
    _, n = samples.shape

    for name, (optimizer_cls, search_args) in benchmarks.items():
        optimizer = optimizer_cls(samples, integration_points, target_distribution, e=0.05)
        w = np.ones(n) / n  # start every optimizer from uniform weights
        results_by_optimizer[name].iloc[i, :] = test_optimizer(
            optimizer, w, max_iter=max_iter, search_args=search_args, max_time=max_time)

    # Checkpoint after every trial so partial results survive an interruption.
    for name, frame in results_by_optimizer.items():
        frame.to_csv(f"{name}.csv")

print("FINISHED")

FINISHED


In [12]:
# Load only the result files written by the benchmark loop. Matching every
# name that ends in "csv" would also pull unrelated CSVs from the working
# directory into the comparison.
result_files = {'cs.csv', 'egd.csv', 'pfw.csv'}

# os.listdir returns names in arbitrary order; that order determines the
# column order of the comparison tables built from `df`.
files = [x for x in os.listdir() if x in result_files]
dfs = []

for path in files:
    frame = pd.read_csv(path, index_col=0)
    # Label each row with the optimizer's short name (the file stem).
    frame['Optimizer'] = os.path.splitext(path)[0]

    dfs.append(frame)

df = pd.concat(dfs)

In [13]:
optimizers = df['Optimizer'].unique()
grouped_distances = df.groupby("Optimizer")["Distance"]

# Side-by-side table: one column of final distances per optimizer, with rows
# aligned on the trial index shared by the per-optimizer results.
distance_columns = [grouped_distances.get_group(name) for name in optimizers]
distances = pd.concat(distance_columns, axis=1)
distances.columns = optimizers

distances.head()

Unnamed: 0,egd,pfw,cs
0,0.032426,0.032114,0.032432
1,0.010535,0.010101,0.010349
2,0.016252,0.015848,0.016186
3,0.025726,0.025309,0.025684
4,0.020486,0.020213,0.020561


In [14]:
# Number of trials in which each optimizer reached the smallest distance.
labels = distances.columns
smallest_distance_col = np.argmin(distances.values, axis=1)
pd.DataFrame(labels[smallest_distance_col]).value_counts()

pfw    25
dtype: int64

In [15]:
# Number of trials in which each optimizer ended with the largest distance.
labels = distances.columns
largest_distance_col = np.argmax(distances.values, axis=1)
pd.DataFrame(labels[largest_distance_col]).value_counts()

cs     13
egd    12
dtype: int64

In [16]:
optimizers = df['Optimizer'].unique()
grouped_times = df.groupby("Optimizer")["Time (sec)"]

# Side-by-side table: one column of run times (seconds) per optimizer, with
# rows aligned on the trial index shared by the per-optimizer results.
time_columns = [grouped_times.get_group(name) for name in optimizers]
times = pd.concat(time_columns, axis=1)
times.columns = optimizers

times.head()

Unnamed: 0,egd,pfw,cs
0,10.984295,6.017592,8.138298
1,5.079744,4.455963,5.946815
2,4.932542,5.501061,5.743837
3,6.186781,4.510577,4.61806
4,6.032439,4.503717,4.629465


In [17]:
# Number of trials in which each optimizer was the fastest.
labels = times.columns
fastest_col = np.argmin(times.values, axis=1)
pd.DataFrame(labels[fastest_col]).value_counts()

pfw    15
egd     7
cs      3
dtype: int64

In [18]:
# Number of trials in which each optimizer was the slowest.
labels = times.columns
slowest_col = np.argmax(times.values, axis=1)
pd.DataFrame(labels[slowest_col]).value_counts()

egd    13
cs      7
pfw     5
dtype: int64