# Negative Sampling try-outs

## Data Generation

In [None]:
import sys
import csv
import numpy as np
import random
from src import util, logit
from networkx import nx

# Simulate 1000 sequential edge-formation choices on a random graph and log,
# for every step, the complete choice set (one CSV row per alternative).
G = nx.erdos_renyi_graph(5000, 0.0025, seed=None, directed=False)
t = 0
path = "%s/choices/test_sampling.csv" % util.data_path
# `with` closes the file even if a step fails; newline='' is what the csv
# module asks for so that it controls line endings itself.
with open(path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['t', 'i', 'j', 'deg', 'fof', 'u', 'y'])

    while t < 1000:
        # sample node i who is going to choose
        i = util.random_sample(G.nodes())
        # G does not change while i's alternatives are scored, so the neighbor
        # set and the 2-hop ego network are computed once per step
        neighbors_i = set(G.neighbors(i))
        within_two_hops = set(nx.ego_graph(G, i, 2).nodes())
        # per-alternative utilities; deliberately NOT named `util`, which would
        # rebind the imported `util` module and break `util.log_smooth` below
        # and `util.random_sample` on the next step
        utilities = {}
        degs = {}
        fofs = {}
        # for each alternative, compute features and raw utility
        for k in G.nodes():
            # skip i itself and already-neighbors
            if k == i or k in neighbors_i:
                continue
            # degree of alternative
            degs[k] = G.degree(k)
            # whether k is within two hops of i (friend of a friend)
            fofs[k] = 1 if k in within_two_hops else 0
            # u_ik = 0.5 * log(deg_k) + 2 * fof_ik
            utilities[k] = (0.5 * np.log(degs[k] + util.log_smooth)) + (2 * fofs[k])
        # sample actual choice with probability proportional to exp(u)
        us = np.exp(list(utilities.values()))
        ps = us / us.sum()
        j = np.random.choice(list(utilities.keys()), size=1, p=ps)[0]
        # write complete choice sets
        for k in degs.keys():
            writer.writerow([t, i, k, degs[k], fofs[k], utilities[k], 1 if k == j else 0])
        # actually add the edge
        G.add_edge(i, j)
        t += 1

## Model Fitting

In [1]:
import sys
sys.path.append('../')
from src import util, logit
import numpy as np
import pandas as pd
import csv

In [19]:
# Read the first simulated dataset. The step column `t` is renamed to
# `choice_id` (the column later cells group by) and index labels become strings.
path = "%s/choices/test_sampling_v2_id0.csv" % util.data_path
D = pd.read_csv(path)
D = D.rename(index=str, columns={"t": "choice_id"})
D.head()

Unnamed: 0,choice_id,i,j,deg,fof,u,y
0,0,3458,0,8,0,1.039721,0
1,0,3458,1,13,0,1.282475,0
2,0,3458,2,15,0,1.354025,0
3,0,3458,3,14,0,1.319529,0
4,0,3458,4,9,0,1.098612,0


In [20]:
# Fit on the complete choice sets, using the stored utility `u` as the only feature.
m_all1 = logit.FeatureModel(
    'id',
    D=D,
    vvv=1,  # presumably a verbosity level - TODO confirm in src.logit
    features=['u'],
)
m_all1.fit()

[id] parameters after fitting: [1.02230731]


In [25]:
# Smoothed log-degree feature, built the same way as in the data generation.
D['log_deg'] = np.log(D['deg'] + util.log_smooth)
# Fit on the complete choice sets with the two raw features instead of `u`.
m_all2 = logit.FeatureModel('id', D=D, vvv=1, features=['log_deg','fof'])
m_all2.fit()
# Show the standard errors of the model fitted just above. This used to read
# `m_all.se`, but no `m_all` is defined anywhere in the notebook, so it raised
# a NameError on a fresh kernel.
m_all2.se

[id] parameters after fitting: [0.47525839 1.84947975]


array([0.12688713, 0.09041842])

Now, let's do some negative sampling.

In [13]:
# Number of non-chosen alternatives kept per choice set. The sample below used
# a literal 5 while `s` was set to 10 and never read; `s` now drives the draw,
# with the sample size unchanged.
s = 5
# Uniformly sample `s` negatives (y == 0) within every choice set ...
sampled_negatives = D.query('y == 0') \
                     .groupby('choice_id') \
                     .apply(lambda x: x.sample(n=s))
# ... and add back every chosen row (y == 1). pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
Ds = pd.concat([sampled_negatives, D.query('y == 1')])
print(Ds.shape)
m = logit.FeatureModel('id', D=Ds, vvv=1, features=['u'])
m.fit()

(3600, 10)
[id] parameters after fitting: [0.99513229]


In [18]:
# Save the down-sampled choice sets, restricted to these columns, without the index.
out_cols = ['choice_id','i','j','deg','fof','u','y','score']
Ds[out_cols].to_csv("%s/choices/test_sampling_down.csv" % util.data_path, index=False)

For $s \in [5, 10, 20, 50, 100, 500]$, draw 100 uniform samples from each of the 10 datasets, and fit both the utility model and the parameter model on every sample.

In [None]:
# Collect one row per fitted pair of models: [i, j, s, theta_0, theta_1, theta_2],
# where i is the dataset id, j the replicate and s the number of sampled
# negatives ('all' for the fit on the complete choice sets).
res = []
for i in range(10):
    print(i)
    path = "%s/choices/test_sampling_v2_id%d.csv" % (util.data_path, i)
    D = pd.read_csv(path).rename(index=str, columns={"t": "choice_id"})
    D['log_deg'] = np.log(D['deg'] + util.log_smooth)
    # fit "all" models; the .fit() calls were missing, so `.u` was read from
    # models that had never been fitted
    m1 = logit.FeatureModel('id', D=D, vvv=1, features=['u'])
    m1.fit()
    m2 = logit.FeatureModel('id', D=D, vvv=1, features=['log_deg','fof'])
    m2.fit()
    res.append([i, 0, 'all', m1.u[0], m2.u[0], m2.u[1]])
    # the split into chosen / non-chosen rows does not depend on j or s
    negatives = D.query('y == 0')
    positives = D.query('y == 1')
    # do sample 100 times
    for j in range(100):
        for s in [5, 10, 20, 50, 100, 500]:
            # s uniformly sampled negatives per choice set, plus all chosen rows
            # (pd.concat replaces DataFrame.append, removed in pandas 2.0)
            sampled = negatives.groupby('choice_id') \
                               .apply(lambda x: x.sample(n=s))
            Ds = pd.concat([sampled, positives])
            m1 = logit.FeatureModel('id', D=Ds, vvv=0, features=['u'])
            m1.fit()
            m2 = logit.FeatureModel('id', D=Ds, vvv=0, features=['log_deg','fof'])
            m2.fit()
            res.append([i, j, s, m1.u[0], m2.u[0], m2.u[1]])

In [10]:
# Dump the sampling results to CSV.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable project directory.
fn = '/Users/janovergoor/projects/choosing_to_grow/choose2grow/reports/sampling_data_py_v2.csv'
with open(fn, 'w', newline='') as f:
    writer = csv.writer(f)
    # every row of `res` holds six values (dataset i, replicate j, sample size s,
    # three estimates); the header previously lacked 'j', which shifted the names
    writer.writerow(['i', 'j', 's', 'theta_0', 'theta_1', 'theta_2'])
    writer.writerows(res)

# Stratified sampling and adjustment

In [7]:
def _fit_both(data, **model_kwargs):
    """Fit the utility-only model and the (log_deg, fof) model on `data`.

    Extra keyword arguments (e.g. ``sw='w'``) are passed through unchanged to
    ``logit.FeatureModel``. Returns the two fitted models as ``(m1, m2)``.
    """
    utility_model = logit.FeatureModel('id', D=data, vvv=1, features=['u'], **model_kwargs)
    utility_model.fit()
    feature_model = logit.FeatureModel('id', D=data, vvv=1, features=['log_deg','fof'], **model_kwargs)
    feature_model.fit()
    return utility_model, feature_model


def _result_row(i, label, m1, m2):
    """Build one output row: dataset id, label, then estimate/SE pairs."""
    return [i, label, m1.u[0], m1.se[0], m2.u[0], m2.se[0], m2.u[1], m2.se[1]]


def _stratified_sample(frame, s):
    """Draw `s` negatives with replacement per choice set from each `fof`
    stratum (fof == 0 first, then fof == 1) and add every chosen row.

    Uses pd.concat because DataFrame.append was removed in pandas 2.0.
    """
    negatives = frame.query('y == 0')
    return pd.concat([
        negatives.query('fof == 0').groupby('choice_id').apply(lambda x: x.sample(n=s, replace=True)),
        negatives.query('fof == 1').groupby('choice_id').apply(lambda x: x.sample(n=s, replace=True)),
        frame.query('y == 1'),
    ])


res = []
i = 9
s = 5
path = "../negative_sampling/test_sampling_v3_id%d.csv" % (i)
D = pd.read_csv(path).rename(index=str, columns={"t": "choice_id"})
D['log_deg'] = np.log(D['deg'] + util.log_smooth)

# fit "all" models
print("Fitting all")
m1, m2 = _fit_both(D)
res.append(_result_row(i, 'all', m1, m2))

# fit uniform sampling models; 2*s negatives per choice set, the same number
# the stratified scheme draws in total across its two strata
print("Fitting uniform")
Ds = pd.concat([
    D.query('y == 0').groupby('choice_id').apply(lambda x: x.sample(n=s*2)),
    D.query('y == 1'),
])
m1, m2 = _fit_both(Ds)
res.append(_result_row(i, 'unif', m1, m2))

# fit stratified sampling models
print("Fitting stratified")
Ds = _stratified_sample(D, s)
m1, m2 = _fit_both(Ds)
res.append(_result_row(i, 'strat', m1, m2))

# fit stratified sampling models w adjustment
print("Fitting stratified+adj")
# n = number of rows in the same (choice_id, fof) stratum of the full data
D2 = D.assign( n = lambda x: x.groupby(['choice_id','fof'])['j'].transform('count')) # add relevant counts
Ds = _stratified_sample(D2, s).sort_values(by=['choice_id','y'])
# chosen rows get weight 1, sampled negatives get s / n
Ds['w'] = np.where(Ds['y']==1, 1.0, float(s) / Ds['n'])

m1, m2 = _fit_both(Ds, sw='w')
res.append(_result_row(i, 'strat_adj', m1, m2))

# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable project directory.
fn = '/Users/janovergoor/projects/choosing_to_grow/choose2grow/reports/sampling_data_strat2.csv'
with open(fn, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['i', 's', 'mu_0', 'se_0', 'mu_1', 'se_1', 'mu_2', 'se_2'])
    writer.writerows(res)


Fitting all
[id] parameters after fitting: [0.99958872]
[id] parameters after fitting: [0.47484475 5.00419637]
Fitting uniform
[id] parameters after fitting: [0.9875032]
[id] parameters after fitting: [0.57237396 4.90050077]
Fitting stratified
[id] parameters after fitting: [-0.02560416]
[id] parameters after fitting: [ 0.45674471 -0.30227014]
Fitting stratified+adj
[id] parameters after fitting: [0.99457014]
[id] parameters after fitting: [0.58535024 4.94890576]


In [2]:
i = 9
s = 5
path = "../negative_sampling/test_sampling_v3_id%d.csv" % (i)
D = pd.read_csv(path).rename(index=str, columns={"t": "choice_id"})

# compute count by group: n = number of rows in the same (choice_id, fof)
# stratum of the full data
D2 = D.assign( n = lambda x: x.groupby(['choice_id','fof'])['j'].transform('count'))
# s negatives drawn with replacement per choice set from each fof stratum, plus
# every chosen row (pd.concat replaces DataFrame.append, removed in pandas 2.0)
negatives = D2.query('y == 0')
Ds = pd.concat([
    negatives.query('fof == 0').groupby('choice_id').apply(lambda x: x.sample(n=s, replace=True)),
    negatives.query('fof == 1').groupby('choice_id').apply(lambda x: x.sample(n=s, replace=True)),
    D2.query('y == 1'),
]).sort_values(by=['choice_id','y'])
# chosen rows get weight 1, sampled negatives get s / n
Ds['w'] = np.where(Ds['y']==1, 1.0, float(s) / Ds['n'])
Ds.head(n=11)

Unnamed: 0,choice_id,i,j,deg,fof,u,y,n,w
"(0, 2123)",0,2594,2123,7,0,0.9729551,0,4992,0.001002
"(0, 4255)",0,2594,4256,2,0,0.3465736,0,4992,0.001002
"(0, 1957)",0,2594,1957,4,0,0.6931472,0,4992,0.001002
"(0, 1740)",0,2594,1740,4,0,0.6931472,0,4992,0.001002
"(0, 3547)",0,2594,3548,1,0,5e-09,0,4992,0.001002
"(0, 4197)",0,2594,4198,1,1,5.0,0,6,0.833333
"(0, 3236)",0,2594,3237,4,1,5.693147,0,6,0.833333
"(0, 294)",0,2594,294,7,1,5.972955,0,6,0.833333
"(0, 294)",0,2594,294,7,1,5.972955,0,6,0.833333
"(0, 3188)",0,2594,3189,6,1,5.89588,0,6,0.833333


In [6]:
# Fit the utility-only model twice on the stratified sample: first without,
# then with the weight column `w`. `m1` ends up bound to the weighted fit.
for extra_kwargs in ({}, {'sw': 'w'}):
    m1 = logit.FeatureModel('id', D=Ds, vvv=1, features=['u'], **extra_kwargs)
    m1.fit()

[id] parameters after fitting: [-0.02767681]
[id] parameters after fitting: [0.99454735]


In [5]:
# Re-import `logit` so that edits to its source take effect without restarting
# the kernel. `imp` is deprecated and removed in Python 3.12; importlib.reload
# is its documented replacement.
from importlib import reload
logit = reload(logit)