# Negative Sampling try-outs

## Data Generation

In [None]:
import sys
import csv
import numpy as np
import random
from src import util, logit
from networkx import nx

# Simulate 1000 sequential edge-formation choices on a random graph and log,
# for every choice, the complete choice set (one CSV row per eligible
# alternative) together with its features, utility and the chosen flag.
G = nx.erdos_renyi_graph(5000, 0.0025, seed=None, directed=False)
path = "%s/choices/test_sampling.csv" % util.data_path

# The with-block closes the file even if the simulation raises; newline=''
# is what the csv module requires so it can control line endings itself.
with open(path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['t', 'i', 'j', 'deg', 'fof', 'u', 'y'])

    for t in range(1000):
        # sample node i who is going to choose
        i = util.random_sample(G.nodes())
        # Both sets depend only on i, so build them once per choice instead
        # of once per candidate alternative.
        neighbors = set(G.neighbors(i))
        within_two_hops = set(nx.ego_graph(G, i, 2).nodes())
        # Per-alternative containers. The utilities dict must NOT be called
        # `util`: that would rebind the imported `util` module and break
        # `util.log_smooth` / `util.random_sample`.
        utilities = {}
        degs = {}
        fofs = {}
        # for each alternative, compute features and raw utility
        for k in G.nodes():
            # skip i itself and nodes that are already neighbors
            if k == i or k in neighbors:
                continue
            # degree of alternative
            degs[k] = G.degree(k)
            # whether k lies within two hops of i (non-neighbors only reach
            # this point, so a hit means they share at least one neighbor)
            fofs[k] = 1 if k in within_two_hops else 0
            # u_ik = 0.5 * log(deg_k) + 2 * fof_ik
            utilities[k] = (0.5 * np.log(degs[k] + util.log_smooth)) + (2 * fofs[k])
        # sample actual choice with probability proportional to exp(u_ik)
        us = np.exp(list(utilities.values()))
        ps = us / us.sum()
        j = np.random.choice(list(utilities.keys()), size=1, p=ps)[0]
        # write complete choice sets
        for k in degs:
            writer.writerow([t, i, k, degs[k], fofs[k], utilities[k], 1 if k == j else 0])
        # actually add the edge
        G.add_edge(i, j)

## Model Fitting

In [30]:
import sys
sys.path.append('../')
from src import util, logit
import numpy as np
import pandas as pd
import csv

In [2]:
# Read the simulated choice sets back in. The columns 't' and 'u' are renamed
# to 'choice_id' and 'X' before the frame is handed to the models below;
# index=str turns the row labels into strings.
path = "%s/choices/test_sampling.csv" % util.data_path
D = pd.read_csv(path)
D = D.rename(index=str, columns={"t": "choice_id", 'u': 'X'})
D.head()

Unnamed: 0,choice_id,i,j,deg,fof,X,y
0,1000.0,776,0,10,0,1.151293,0
1,1000.0,776,1,8,0,1.039721,0
2,1000.0,776,2,14,0,1.319529,0
3,1000.0,776,3,6,0,0.89588,0
4,1000.0,776,4,11,0,1.198948,0


In [31]:
# Baseline: fit the 'id' feature model on the complete choice sets in D
# (every non-chosen alternative kept). Its first parameter, m_all.u[0], is
# later exported as the 'all' reference row.
# NOTE(review): vvv looks like a verbosity switch (the vvv=1 fits print their
# parameters) -- confirm against logit.FeatureModel.
m_all = logit.FeatureModel('id', D=D, vvv=1)
m_all.fit()

[id] parameters after fitting: [1.00616609]


Now, let's do some negative sampling.

In [22]:
# Negative sampling, single run: for each choice keep the chosen alternative
# (y == 1) plus `s` randomly drawn non-chosen alternatives (y == 0), then
# refit the same model on the reduced data.
#
# `s` was previously assigned 10 but never used (the sample size was
# hard-coded as 5). It is now the single source of truth; the value 5 keeps
# the sample size that was actually being drawn.
s = 5
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Ds = pd.concat([
    D.query('y == 0')
     .groupby('choice_id')
     .apply(lambda x: x.sample(n=s)),
    D.query('y == 1'),
])
print(Ds.shape)
m = logit.FeatureModel('id', D=Ds, vvv=1)
m.fit()

(3600, 10)


In [29]:
# Repeated negative sampling: for 30 replications and several sample sizes s,
# draw s non-chosen alternatives per choice, add the chosen ones back, refit
# the model and record [replication, s, first fitted parameter] in `res`.
res = []
# Loop-invariant pieces, computed once rather than in each of the 180 fits.
non_chosen_by_choice = D.query('y == 0').groupby('choice_id')
chosen = D.query('y == 1')
for i in range(30):
    print(i)
    for s in [5, 10, 20, 50, 100, 500]:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        Ds = pd.concat([
            non_chosen_by_choice.apply(lambda x: x.sample(n=s)),
            chosen,
        ])
        m1 = logit.FeatureModel('id', D=Ds, vvv=0)
        m1.fit()
        res.append([i, s, m1.u[0]])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [35]:
# Export the estimates: one reference row for the full-data fit (s = 'all')
# followed by one row per (replication, sample size) from `res`.
# NOTE(review): this is an absolute, machine-specific path; it should be
# derived from a configurable project/report directory -- confirm the
# intended location before changing it.
fn = '/Users/janovergoor/projects/choosing_to_grow/choose2grow/reports/sampling_data_py.csv'
# newline='' lets the csv module control line endings itself (otherwise
# extra blank lines can appear on platforms that translate '\n').
with open(fn, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['i', 's', 'theta_0'])
    writer.writerow(['0', 'all', m_all.u[0]])
    writer.writerows(res)