In [None]:
!pip install pyro-ppl

# Simulated child

In [None]:
#Mount Google Drive at ./drive; the data paths configured later in this
#  notebook are rooted under './drive/My Drive/...'.
from google.colab import drive
drive.mount('./drive') #This will request authentication to be able to load data.

In [None]:
#Seed imports (important that this is run only once per runtime)
import pyro
import torch
import numpy as np

#Single seed shared by NumPy's global RNG and torch's default generator.
#Re-running this cell resets both generators, so later sampling cells
#  would repeat the same draws; run it once per runtime.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

#11/21: http://docs.pyro.ai/en/stable/distributions.html?highlight=uniform#uniform
import torch.distributions
from torch.distributions.uniform import Uniform 
from torch.distributions.log_normal import LogNormal
from torch.distributions.categorical import Categorical

In [None]:
import pandas as pd
import os
from os.path import join
import sys

In [None]:
#Root folder (inside the mounted Drive) that holds the model's data.
DATA_FOLDER = './drive/My Drive/urop/wong_urop/model'
#This will have to be changed depending on where your data is stored.
#Main words CSV (read by WordData).
DATA_PATH = join(DATA_FOLDER, 'popular_words_shift.csv')

#Folder with the precomputed 'pg_idx.npy' / 'counts.npy' files;
#  it is also put on sys.path below so word2counts can be imported from it.
COUNTS_FOLDER = join(DATA_FOLDER, 'counts')

#11/8: managing the imports
#https://stackoverflow.com/questions/4383571/importing-files-from-different-folder
#The path insert must come before the import of word2counts.
#NOTE(review): re-running this cell inserts COUNTS_FOLDER into sys.path again.
import sys
sys.path.insert(1, COUNTS_FOLDER)
import word2counts

# Data and simulated child

In [None]:
class WordData():
  """
  Stores and asks for data from the main word CSV and the precomputed counts.

  Attributes:
    pg_idx (np.ndarray): loaded from 'pg_idx.npy'; defines the indexing
      (and, via its first dimension, the number) of pg pairs.
    counts (np.ndarray): loaded from 'counts.npy'; indexed
      (by word, parallel to the main CSV; by pg pair).
    word_csv (pd.DataFrame): the main words CSV.
    word_freq_distr (Categorical): distribution over CSV rows built from
      the 'Frequency' column.
  """
  def __init__(self, data_path, data_folder):
    """
    Assumes that the data-related calculations are stored
       as calculated by the word2counts.py file.

    data_path (str) should point to the main words CSV
    data_folder (str) should point to the "counts" folder.
    """
    #Indexing for pg is determined by this array.
    self.pg_idx = np.load(join(data_folder, 'pg_idx.npy'))
    #The indexing here is (by words (parallel to main CSV), by pg pair).
    self.counts = np.load(join(data_folder, 'counts.npy'))

    self.word_csv = pd.read_csv(data_path)
    word_freqs = self.word_csv['Frequency'].to_numpy()

    #Categorical normalizes probs to sum to 1, so the frequencies
    #  do not need to be normalized beforehand.
    self.word_freq_distr = Categorical(probs = torch.from_numpy(word_freqs))

  def get_pg_len(self):
    """
    Returns the number of pg pairs (first dimension of pg_idx).
    """
    return self.pg_idx.shape[0]

  def get_words_len(self):
    """
    Returns the number of words (rows of the main CSV).
    """
    return self.word_csv.shape[0]

  def __getitem__(self, idx):
    """
    Gives CSV information, an entry/entries in the DataFrame,
      selected by row position.

    idx may be:
      an integer (Python or NumPy) -> the row as a Series
      a list/tuple/array of integers -> the rows as a DataFrame
    Raises TypeError for any other index type.
    """
    if isinstance(idx, (int, np.integer)):
      return self.word_csv.iloc[int(idx)]
    if isinstance(idx, (list, tuple, np.ndarray)):
      #Must go through .iloc: plain DataFrame[list] would look the
      #  integers up as column labels rather than row positions.
      return self.word_csv.iloc[list(idx)]
    raise TypeError(
        'WordData indices must be an integer or a list of integers, not '
        + type(idx).__name__)

  def get_counts(self, idx):
    """
    Gives count information, a Numpy array.
    """
    return self.counts[idx]

  def sample_and_extract(self, num_samples):
    """
    Samples num_samples (int) words according the word probabilities
      and returns two things:
        (1) this_words_idx, array, the words (in CSV indexing) selected to be learned
        (2) the sum of pg counts, array, from the words,
              to be used in updating child's exposure.
    Words are drawn independently, so the same word may appear
      more than once and is then counted once per draw.
    """

    #11/22: https://pytorch.org/docs/stable/tensors.html
    this_words_idx = self.word_freq_distr.sample(sample_shape = torch.Size([num_samples])).numpy()
    #One row of pg counts per sampled word.
    pg_update_counts = np.take(self.counts, this_words_idx, axis = 0)

    #Collapse over the sampled words, leaving one total per pg pair.
    return this_words_idx, np.sum(pg_update_counts, axis = 0)

In [None]:
class Child():
  """
  Represents a child.
    Translates and is based on Nikasha's WebPPL program.
  Attributes:
    (1) lr, the learning rate of a child (alpha), drawn from Uniform(0, 1).
    (2) exposures, integer Tensor, the count of fictional exposures
          to each pg pair (indexed in the order of pg_idx.npy).
    (3) data, the WordData object with word and count information.
    (4) fictional_exposure_distr, LogNormal from which the number of
          words seen per learning step is drawn.
  """

  def __init__(self, data):
    """
    Inputs: data, a WordData object with word and count information.
    """
    self.data = data
    self.lr = Uniform(0, 1).sample()
    #Indexing is in order of that of pg_idx.npy
    self.exposures = torch.zeros(self.data.get_pg_len(), dtype = int)
    #TODO: Ask which mean, stdev
    #LogNormal's arguments are the loc/scale of the underlying normal,
    #  so the median number of words per step is (number of words)//2.
    log_mean = np.log(self.data.get_words_len()//2)
    self.fictional_exposure_distr = LogNormal(log_mean, 0.1)

  def learn_words(self):
    """
    Simulates one instance of exposure to words.
    Returns a Tensor of effective exposures over all pg pairs.
    Mutates exposures (accumulates the pg counts of the sampled words).
    """
    #Sample and "learn" new words

    this_fictional_exposure = round(self.fictional_exposure_distr.sample().item())
    #The sampled word indices themselves are not needed here, only the pg counts.
    _, pg_update_counts = self.data.sample_and_extract(this_fictional_exposure)

    self.update_fictional_exposure(pg_update_counts)

    #Now, return the actual non-deterministic exposure
    return self.give_effective_exposure()

  def update_fictional_exposure(self, pg_update_counts):
    """
    Increases fictional exposure for all pg pairs encountered.
    Inputs: pg_update_counts, the second result of sample_and_extract
              (a Numpy array; a Tensor is accepted as well).
    Returns: None, but mutates exposures
    """
    #sample_and_extract returns a Numpy array. Convert it explicitly so
    #  the in-place add stays a Tensor operation; "Tensor += ndarray" can
    #  otherwise fail or silently rebind exposures to a Numpy array.
    self.exposures += torch.as_tensor(pg_update_counts, dtype = self.exposures.dtype)

  def give_effective_exposure(self):
    """
    Returns Tensor, the effective exposure for each pg pair,
      which is the sum of (fictional exposure count) independent draws
        from LogNormal(loc = 1, scale = 1).
    Note loc/scale are the mean/stdev of the underlying normal,
      not of the log-normal samples themselves.
    A pg pair with zero fictional exposures gets effective exposure 0.
    """

    #Ask if can represent distribution with non-one mean/stdev.
    #TODO: vectorize more?

    this_exposures = torch.zeros((self.data.get_pg_len()))

    for idx, exposure in enumerate(self.exposures):
      exposure = exposure.item()
      #One random variable per fictional exposure of this pg pair.
      mean = torch.ones((exposure,))
      stdev = torch.ones(mean.shape) #TODO: ask for which stdev
      this_distr = LogNormal(mean, stdev)
      this_raw_exposures = this_distr.sample()

      #Sum across the exposure random variables (empty sum is 0).
      this_exposures[idx] = this_raw_exposures.sum()

    return this_exposures

# Main

In [None]:
#Load the word CSV and the precomputed counts, then build one simulated child on top of them.
MAIN_DATA = WordData(data_path = DATA_PATH, data_folder = COUNTS_FOLDER)
test_child = Child(data = MAIN_DATA)