In [None]:
# Install the pyro-ppl package so that the `import pyro` in a later cell succeeds.
!pip install pyro-ppl

#Simulated child

In [None]:
# Mount Google Drive under ./drive; the data paths configured in a later cell
#   are all rooted at './drive/My Drive/...'.
from google.colab import drive
drive.mount('./drive') #This will request authentication to be able to load data.

In [None]:
#Seed imports (important that this is run only once per runtime)
import pyro
import torch
import numpy as np

#Seed the NumPy and Torch global random number generators with the same fixed value.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

#11/21: http://docs.pyro.ai/en/stable/distributions.html?highlight=uniform#uniform
import torch.distributions
from torch.distributions.uniform import Uniform 
from torch.distributions.log_normal import LogNormal
from torch.distributions.normal import Normal

#NOTE(review): the star import below presumably rebinds Uniform / LogNormal / Normal
#   to the Pyro classes, shadowing the torch imports above -- confirm which is intended.
import pyro.distributions
from pyro.distributions import *

In [None]:
import pandas as pd
import os
from os.path import join
import sys

import matplotlib.pyplot as plt

In [None]:
#Root folder of the project data on the mounted drive.
DATA_FOLDER = './drive/My Drive/urop/wong_urop/model'
#This will have to be changed depending on where your data is stored.
#Path to the main words CSV.
DATA_PATH = join(DATA_FOLDER, 'popular_words_shift.csv')

#Folder from which the classes below load 'pg_idx.npy' and 'counts.npy'.
COUNTS_FOLDER = join(DATA_FOLDER, 'counts')

#11/8: managing the imports
#https://stackoverflow.com/questions/4383571/importing-files-from-different-folder
#Put COUNTS_FOLDER on the module search path before importing word2counts
#   (presumably word2counts.py lives in that folder -- verify).
import sys
sys.path.insert(1, COUNTS_FOLDER)
import word2counts

#Old structure (Torch/class): Data and simulated child

In [None]:
class PGData():
  """
  Stores and asks for data from the PG pairs.

  Attributes:
    pg_idx, the array loaded from 'pg_idx.npy'; it fixes the order of the pg pairs.
    freqs, a float Tensor with one total count per pg pair, parallel to pg_idx.
  """
  def __init__(self, data_path, data_folder):
    """
    Assumes that the data-related calculations stored
       as calculated by the word2counts.py file.

    data_path (str) should point to the main words CSV
      (currently unused; only the files in data_folder are read).
    data_folder (str) should point to the "counts" folder.
    """
    #Indexing for pg is determined by this file.
    self.pg_idx = np.load(join(data_folder, 'pg_idx.npy'))

    #TODO: This is currently just calculating the sum
    #     over previously calculated word -> pg pair counts.
    #Will change to direct calculation of pg pairs later.
    #The indexing here is (by words (parallel to main CSV), by pg pair).
    word_pg_counts = np.load(join(data_folder, 'counts.npy'))
    #Summing over the word axis leaves one total count per pg pair.
    self.freqs = torch.from_numpy(np.sum(word_pg_counts, axis = 0)).float()

  def __len__(self):
    """
    Gives the number of pg pairs.
    """
    return self.pg_idx.shape[0]

  def __getitem__(self, idx):
    """
    Gives the pg str and count.
    """
    return (self.pg_idx[idx], self.freqs[idx])

  def sample_and_extract(self, num_samples):
    """
    Samples num_samples (int) pg pairs according the pg probabilities
      and returns the vector of counts.

    Sampling is done WITH replacement: the same pg pair can be drawn many
      times, which is what makes the result a vector of counts rather than
      a 0/1 indicator. (Without replacement torch.multinomial also raises
      as soon as num_samples exceeds the number of non-zero frequencies.)

    Returns: an integer Tensor of length len(self) that sums to num_samples.
    """
    #torch.multinomial rejects a request for zero samples,
    #   so answer that case directly with an all-zero count vector.
    if num_samples == 0:
      return torch.zeros(len(self), dtype = torch.long)

    this_words_idx = torch.multinomial(self.freqs, num_samples, replacement = True)
    #minlength keeps the result parallel to pg_idx even if the last pairs were never drawn.
    this_words_counts = torch.bincount(this_words_idx, minlength = len(self))

    return this_words_counts

In [None]:
class Child():
  """
  A simulated child learner.
    Ported from, and modelled on, Nikasha's WebPPL program.
  Attributes:
    (1) lr, the child's learning rate (alpha), drawn once at construction
    (2) fictional_exposure_distr, the distribution over how many
          pg pairs the child meets in one exposure
    (3) data, the data source handed to the constructor
  """

  def __init__(self, data):
    """
    Inputs: data, a PGData.
    """
    self.data = data
    #The learning rate is a single draw from Uniform(0, 1).
    self.lr = Uniform(0, 1).sample()
    #TODO: Ask which mean, stdev
    mean = 50
    #Counts drawn from this distribution are laid out in the order of pg_idx.npy
    self.fictional_exposure_distr = LogNormal(np.log(mean), 0.1)

  def learn_pg(self):
    """
    Runs a single exposure episode over the pg pairs.
    Returns a Tensor with the effective exposure for every pg pair.
    """
    #How many pg pairs are encountered in this episode (rounded to an int).
    n_encountered = round(self.fictional_exposure_distr.sample().item())
    #Spread those encounters over the pg pairs, then add the exposure noise.
    return self.give_effective_exposure(self.data.sample_and_extract(n_encountered))

  def give_effective_exposure(self, fictional_exposures):
    """
    Turns fictional_exposures (a Tensor of per-pg-pair counts)
      into a Tensor of noisy effective exposures, one per pg pair.
    """
    #Every pg pair gets the same noise scale of 0.1.
    noise_scale = torch.ones(len(self.data)) * 0.1
    return torch.normal(fictional_exposures.float(), noise_scale)

#Main

In [None]:
#Load the pg pair index and counts once, from the paths configured above.
MAIN_DATA = PGData(DATA_PATH, COUNTS_FOLDER)

#New structure: Pyro-based

In [None]:
def renormalized_probs(raw_freqs):
  """
  Returns the sequentially re-normalized probabilities,
    from left to right.
  Accepts:
    raw_freqs, a 1-D array-like (Numpy array, list or CPU Torch Tensor)
      of non-negative raw probabilities or counts.
  Returns:
    new_freqs, a float64 Torch Tensor
      where the probability at index i
        is the renormalized probability for raw_freqs[i:],
        i.e. raw_freqs[i] / sum(raw_freqs[i:]).
      Where the remaining tail sums to zero (e.g. trailing zeros)
        the result is 0 rather than 0/0 = NaN.
  """

  raw = np.asarray(raw_freqs, dtype = np.float64)

  #tail_sums[i] == sum(raw[i:]), computed in one pass as a cumulative sum from the right.
  tail_sums = np.cumsum(raw[::-1])[::-1]

  #Entries whose tail sum is zero keep the 0 they were initialised with.
  new_freqs = np.zeros(raw.shape)
  np.divide(raw, tail_sums, out = new_freqs, where = tail_sums != 0)

  return torch.from_numpy(new_freqs)
  
#For the PGSampler, I built around and rearranged Ivan's code.

class PGSampler():
  """
  Pyro model of one child's exposure to pg pairs.

  Attributes:
    pg_idx, the array loaded from 'pg_idx.npy'; it fixes the order of the pg pairs.
    raw_freqs, a float Tensor with one total count per pg pair.
    renorm_freqs, a Tensor where entry i is raw_freqs[i] / sum(raw_freqs[i:]).
    num_exposed_distr, the distribution of the LOG of the total exposure count.
    NUM_PG_PAIRS, the number of pg pairs.
  """

  def __init__(self, data_path = None, data_folder = None):
    """
    Assumes that the data-related calculations stored
       as calculated by the word2counts.py file.

    data_path (str) should point to the main words CSV
      (currently unused; accepted so the signature parallels PGData).
    data_folder (str) should point to the "counts" folder.
      Defaults to the notebook-level COUNTS_FOLDER.
    """
    if data_folder is None:
      data_folder = COUNTS_FOLDER

    #Indexing for pg is determined by this file.
    self.pg_idx = np.load(join(data_folder, 'pg_idx.npy'))

    #TODO: This is currently just calculating the sum
    #     over previously calculated word -> pg pair counts.
    #Will change to direct calculation of pg pairs later.
    #The indexing here is (by words (parallel to main CSV), by pg pair).
    word_pg_counts = np.load(join(data_folder, 'counts.npy'))
    raw_counts = np.sum(word_pg_counts, axis = 0)

    self.raw_freqs = torch.from_numpy(raw_counts).float()
    #renormalized_probs is documented to take a Numpy array, so pass the Numpy counts.
    #A trailing run of zero counts renormalizes to 0/0; treat that as probability 0.
    self.renorm_freqs = torch.nan_to_num(renormalized_probs(raw_counts), nan = 0.0)

    #Note that this distribution lives in log space: the model exponentiates its samples.
    self.num_exposed_distr = pyro.distributions.Normal(np.log(500), 0.1)
    self.NUM_PG_PAIRS = self.pg_idx.shape[0]

  def allocate_fictional_counts(self, n_total, observations):
    """
    Allocates pg pair fictional exposure in sample storage and gives array.
    Accepts:
      n_total, a number or 0-d Tensor, the total exposure to split over the pg pairs.
        It is not modified.
      observations, the observations to condition on.
        (Input ignored for now.)
    Returns: 
      all_n_pg_exposed, a Numpy array with the counts per pg pair.
        Indices parallel to pg_idx.

      Also stores the following samples:
          {this_pg}_n_fictional_exposed = the exposures to the str {this_pg}, a pg pair.
    """
    #Binomial cascade. TODO: Ask if you had right concept.
    n_total = torch.as_tensor(n_total, dtype = torch.float)
    #Remaining exposure to allocate. Always rebound, never updated in place,
    #   so the caller's n_total is left untouched.
    n = n_total

    all_n_pg_exposed = np.zeros(self.NUM_PG_PAIRS)

    #Allocate counts to pg pairs. The samples are continuous, so the remainder
    #   never hits exactly zero early; walk every pg pair exactly once instead.
    for which_pg_idx in range(self.NUM_PG_PAIRS):
      p_pg = self.renorm_freqs[which_pg_idx]
      this_pg = self.pg_idx[which_pg_idx]

      expected = n * p_pg
      variance = n * p_pg * (1 - p_pg)

      if variance > 0:
        # this is a model that replaces the binomial with its continuous approximation (from Central Limit Theorem)
        n_pg_exposed_distr = pyro.distributions.Normal(loc = expected, scale = torch.sqrt(variance))
      else:
        #Degenerate step (p is 0 or 1, or nothing positive is left to allocate):
        #   a Normal with scale 0 is invalid, so use a point mass at the mean.
        n_pg_exposed_distr = pyro.distributions.Delta(expected)

      n_pg_exposed = pyro.sample(f"{this_pg}_n_fictional_exposed", n_pg_exposed_distr)

      #Updates
      all_n_pg_exposed[which_pg_idx] = float(n_pg_exposed)
      n = n - n_pg_exposed #Subtract the counts allocated to this pg pair.

    #Floating point allocation: compare with a tolerance, not exact equality.
    assert np.isclose(np.sum(all_n_pg_exposed), float(n_total), rtol = 1e-4, atol = 1e-3),\
     "Did not allocate pg counts correctly -- sum does not match total number of counts."

    return all_n_pg_exposed

  def calc_effective_counts(self, all_fictional_exposures):
    """
    Calculates and stores effective exposures.
    Inputs:
      all_fictional_exposures, a Numpy array (a Tensor is accepted too).
    Returns:
      n_pg_exposed, a Torch Tensor, the effective counts.
      Also stores that sample under effective_exposures.
    """

    #The distribution needs Tensor parameters, so convert the Numpy input first.
    fictional = torch.as_tensor(all_fictional_exposures, dtype = torch.float)

    #TODO: Ask about the stdev here.
    #TODO: Ask about if this is correct usage of expectation sum.
    stdev = 0.1 * torch.ones(fictional.shape)
    n_pg_exposed_distr = pyro.distributions.Normal(loc = fictional,\
                                                   scale = stdev)
    
    n_pg_exposed = pyro.sample("effective_exposures", n_pg_exposed_distr)
    return n_pg_exposed

  def approximate_model(self, observations):
    """
    Samples an total exposure number and exposure numbers for each PG pair.
    Accepts:
      observations, the fictional exposure observations to condition on.
        (Input ignored for now.)
    Returns: 
      all_effective_exposures, a Torch Tensor with the effective counts per pg pair.
        Indices parallel to pg_idx.

      Also stores the following samples:
          (1) log_n_pg_exposed = the log of the total number of words to which child exposed
          (2) {this_pg}_n_fictional_exposed = the exposures to the str {this_pg}, a pg pair.
          (3) effective_exposures = the effective exposures over all pg pairs.
    """

    log_n = pyro.sample("log_n_pg_exposed", self.num_exposed_distr)
    n_total = torch.exp(log_n)

    all_fictional_exposures = self.allocate_fictional_counts(n_total, observations)
    all_effective_exposures = self.calc_effective_counts(all_fictional_exposures)

    return all_effective_exposures

#Verifications

In [None]:
def check_renormalized_prob():
  """
  Checks renormalized_probs against a small hand-computed example.
  Raises an AssertionError if the result is not (numerically) the expected one.
  """
  test_freqs = np.array([1, 0, 0, 7, 3])
  #Entry i is test_freqs[i] / sum(test_freqs[i:]).
  expected_renorm = np.array([1/11, 0, 0, 0.7, 1])

  actual_renorm = renormalized_probs(test_freqs)

  #Compare with a floating point tolerance so that rounding differences
  #   cannot cause spurious test failures.
  assert np.allclose(expected_renorm, actual_renorm.numpy()),\
   'Rennormalization expected behavior failed.' 

def run_tests():
  """
  Calls every registered check in order.
  A failing check raises and stops the run.
  """
  #Register new checks by adding them to this tuple.
  for check in (check_renormalized_prob,):
    check()

#Run the verifications whenever this cell is executed.
run_tests()