Create and save distributions for 500 odor entities

In [2]:
import pickle as pkl
import numpy as np

In [3]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load 'binary_opens.pkl' when it comes from a trusted source.
with open('binary_opens.pkl','rb') as readfile:
  # ** x is a dictionary, with idx: [chems]
  x = pkl.load(readfile)

# Fixed seed so that re-running this cell regenerates the same distributions.
SEED = 0


def _make_odor_distribution(odor, rng):
  """Build a random Gaussian distribution for one binary odor vector.

  Parameters
  ----------
  odor : array-like
    Presence flags, one per chemical. The input is flattened and cast to
    int; every non-zero entry counts as "present".
  rng : np.random.RandomState
    Source of every random draw made here.

  Returns
  -------
  dict
    "mean": list of length n holding a value drawn uniformly from [30, 40)
      for each present chemical and 0.0 for each absent one.
    "covariances": (n, n) ndarray equal to A . A^T, where A is a symmetric
      random matrix that is zero outside the rows/columns of the present
      chemicals. The product form makes the result symmetric positive
      semi-definite.
  """
  present = np.asarray(odor).reshape(-1).astype(int) != 0
  n = present.size
  active = np.flatnonzero(present)
  k = active.size

  mean = np.where(present, rng.uniform(30.0, 40.0, size=n), 0.0).tolist()

  # Each off-diagonal pair is drawn exactly once from [-0.75, 0.75) and then
  # mirrored, instead of being drawn twice with the second draw overwriting
  # the first. The diagonal is drawn from [0, 1).
  upper = np.triu(rng.uniform(-0.75, 0.75, size=(k, k)), 1)
  block = upper + upper.T
  block[np.diag_indices(k)] = rng.uniform(0.0, 1.0, size=k)

  # Embed the k x k block at the positions of the present chemicals.
  factor = np.zeros((n, n))
  factor[np.ix_(active, active)] = block

  return {
    "mean": mean,
    "covariances": np.dot(factor, factor.T),  # for psd covariance matrix
  }


rng = np.random.RandomState(SEED)

# ** distributions is a dict with id: { mean: [...], covariances: matrix }
distributions = {
  odor_id: _make_odor_distribution(odor, rng) for odor_id, odor in x.items()
}

In [4]:
# Sanity check: draw a single sample from the distribution stored under key 2.
entry = distributions[2]
draw = np.random.multivariate_normal(entry["mean"], entry["covariances"], size=1)

# Replace components whose magnitude is below 0.01 with an exact 0.
odor1 = []
for value in draw[0]:
  odor1.append(value if abs(value) >= 0.01 else 0)
print(odor1)

# List any components of the sample that came out negative.
negatives = [value for value in odor1 if value < 0]
for value in negatives:
  print(value)

[29.23448591172562, 33.82283519395624, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33.35131915112555, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34.66039135022656, 0, 0, 0, 0, 0, 0, 32.50270068215765, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37.23123676074444, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26.688670931951485, 0, 0, 0, 0, 36.379410515224926, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32.266731445022764, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31.625021637150716, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24.37147918842051, 0, 0, 0, 0, 37.845164562337644, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39.431757231944026, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33.688749029089976, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31.02996122488951, 0, 0, 47.75349028448042, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38.037940613069686, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Save distributions

In [5]:
# Persist the distributions dict to disk; 'wb' overwrites any existing file.
with open('distributions.pkl', 'wb') as out_file:
    pkl.dump(distributions, out_file)

In [3]:
# Reload the saved distributions: a dict of id -> {"mean": ..., "covariances": ...}.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('distributions.pkl','rb') as in_file:
    distributions = pkl.load(in_file)

Generate training and validation data

In [47]:
import csv
import numpy as np
import concurrent.futures
from scipy.stats import matrix_normal

# Maps the class index written to the data files back to the original key in
# `distributions` (labels are positions in the key list, not the keys).
index_to_id = {}


def make_train_data(n, dists=None, path='training.pkl', samples_per_mixture=10):
    """Sample training data for the first `n` odors and pickle it to `path`.

    For every selected odor, `samples_per_mixture` samples are drawn from its
    multivariate normal distribution. Each sample is written as its own
    pickle record `[odor, i]`, where `odor` is a (1, d) array and `i` is the
    odor's position in the key list (the class label). Reading the file back
    therefore requires calling `pkl.load` repeatedly until EOFError.

    Side effects: prints each odor id, records `index_to_id[i] = id`, and
    overwrites `path`.

    Parameters
    ----------
    n : int
        Number of odors (classes) to use, taken from the start of the keys.
    dists : dict, optional
        Mapping id -> {"mean": ..., "covariances": ...}. Defaults to the
        notebook-level `distributions`.
    path : str, optional
        Output file; defaults to 'training.pkl'.
    samples_per_mixture : int, optional
        Samples drawn per odor; defaults to 10.
    """
    source = distributions if dists is None else dists
    keys = list(source.keys())[:n]

    # 'wb' truncates the file: every call rewrites it from scratch.
    with open(path, 'wb') as writefile:
        for i, odor_id in enumerate(keys):
            print(odor_id)
            index_to_id[i] = odor_id
            entry = source[odor_id]
            # One batched draw per odor, so the covariance matrix is
            # decomposed once rather than once per sample.
            batch = np.random.multivariate_normal(
                entry["mean"], entry["covariances"], size=samples_per_mixture
            )
            for row in batch:
                # Keep the (1, d) shape each record had with size=1 draws.
                pkl.dump([row[np.newaxis, :], i], writefile)


# makes testing data
def make_test_data(n, dists=None, path='testing.pkl', samples_per_mixture=5):
    """Sample test data for the first `n` odors and pickle it to `path`.

    For every selected odor, `samples_per_mixture` samples are drawn from its
    multivariate normal distribution. Each sample is written as its own
    pickle record `[odor, i]`, where `odor` is a (1, d) array and `i` is the
    odor's position in the key list (the class label). Reading the file back
    therefore requires calling `pkl.load` repeatedly until EOFError.

    Side effects: prints each odor id and overwrites `path`.

    Parameters
    ----------
    n : int
        Number of odors (classes) to use, taken from the start of the keys.
    dists : dict, optional
        Mapping id -> {"mean": ..., "covariances": ...}. Defaults to the
        notebook-level `distributions`.
    path : str, optional
        Output file; defaults to 'testing.pkl'.
    samples_per_mixture : int, optional
        Samples drawn per odor; defaults to 5.
    """
    source = distributions if dists is None else dists
    keys = list(source.keys())[:n]

    # 'wb' truncates the file: every call rewrites it from scratch.
    with open(path, 'wb') as writefile:
        for i, odor_id in enumerate(keys):
            print(odor_id)
            entry = source[odor_id]
            # One batched draw per odor, so the covariance matrix is
            # decomposed once rather than once per sample.
            batch = np.random.multivariate_normal(
                entry["mean"], entry["covariances"], size=samples_per_mixture
            )
            for row in batch:
                # Keep the (1, d) shape each record had with size=1 draws.
                pkl.dump([row[np.newaxis, :], i], writefile)


In [49]:
# Number of odor classes to generate data for (at most 500).
N = 100

# Write the training file first, then the testing file, for the same N odors.
for build_dataset in (make_train_data, make_test_data):
    build_dataset(N)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
0
1
2
3
4
5
6
7
8
9
10
11
12
13
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
