In [1]:
# ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5
# ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3R.h5


In [1]:
import pickle
import gzip
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import preprocessing 

%matplotlib inline

In [2]:
# Load the three input files. Context managers guarantee that every file
# handle is closed as soon as its content has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on
# files that come from a trusted source.
with gzip.open('mendelian_errors.pickle.gz', 'rb') as errors_file:
    mendelian_errors = pickle.load(errors_file)
with gzip.open('feature_fit.npy.gz', 'rb') as fit_file:
    feature_fit = np.load(fit_file)
with open('ordered_features', 'rb') as features_file:
    ordered_features = np.load(features_file)
# Number of feature entries stored in 'ordered_features'.
num_features = len(ordered_features)

In [3]:
# Summarise the class balance of the observations.
total_observations = len(mendelian_errors)
# An entry counts as an error when its first element is positive
# (presumably the Mendelian error count -- confirm against the producer).
error_observations = sum(1 for entry in mendelian_errors.values() if entry[0] > 0)
ok_observations = total_observations - error_observations
fraction_errors = error_observations / total_observations
# Prints: total, error-free, with errors, percentage with errors.
print(total_observations, ok_observations, error_observations, 100 * fraction_errors)

10905732 10364044 541688 4.9670026734564905


In [4]:
# Standardise every column except the last one, which is left unscaled.
scaler = preprocessing.StandardScaler(copy=False)
scaler.fit(feature_fit[:, :-1])
# copy=False only *tries* to scale in place; scikit-learn may still return a
# copy. Writing the result back explicitly makes sure the scaling is never
# silently discarded.
feature_fit[:, :-1] = scaler.transform(feature_fit[:, :-1])
# feature_fit now holds the scaled values; the unscaled data has to be
# reloaded from disk if it is needed again.
with gzip.open('scaled_fit.npy.gz', 'wb') as scaled_file:
    np.save(scaled_file, feature_fit, allow_pickle=False, fix_imports=False)

In [5]:
# Reload the unscaled features: the in-memory array was overwritten with
# scaled values by the scaling step. The context manager closes the handle.
with gzip.open('feature_fit.npy.gz', 'rb') as fit_file:
    feature_fit = np.load(fit_file)

In [6]:
# Probability of keeping a row without errors: the ratio of error rows to
# error-free rows, so the kept error-free rows roughly match the error rows.
prob_ok_choice = error_observations / ok_observations

def accept_entry(row):
    """Decide whether a row is kept in the balanced sample.

    A row whose last element equals 1 is always kept. Any other row is kept
    at random with probability ``prob_ok_choice`` (a module-level value).

    Parameters
    ----------
    row : sequence
        Indexable row; only its last element is inspected.

    Returns
    -------
    bool
        True when the row should be kept.
    """
    label = row[-1]
    if label != 1:
        # Only rows without the "1" label consume a random draw.
        return random.random() <= prob_ok_choice
    return True

# Row-wise wrapper kept so existing references to it continue to work.
accept_entry_v = np.vectorize(accept_entry, signature='(i)->()')

# np.vectorize is a convenience wrapper around a Python-level loop, which is
# very slow for millions of rows. Build the same keep-mask with array
# operations instead: rows whose last column is 1 are always kept, all other
# rows are kept with probability prob_ok_choice.
# NOTE: the random draws now come from np.random instead of the `random`
# module; the sampling rule itself is unchanged.
is_error_row = feature_fit[:, -1] == 1
keep_ok_draw = np.random.random(feature_fit.shape[0]) <= prob_ok_choice
accepted_entries = is_error_row | keep_ok_draw

In [7]:
# Keep only the accepted rows, then release the full array to free memory.
balanced_fit = feature_fit[accepted_entries]
del feature_fit
# Count the rows per value of the last column (1 and 0) with array
# comparisons instead of iterating over every row in Python.
last_column = balanced_fit[:, -1]
int(np.count_nonzero(last_column == 1)), int(np.count_nonzero(last_column == 0))

(541688, 542763)

In [8]:
# Persist the balanced (still unscaled) sample; the context manager makes
# sure the gzip stream is flushed and closed.
with gzip.open('balanced_fit.npy.gz', 'wb') as balanced_file:
    np.save(balanced_file, balanced_fit, allow_pickle=False, fix_imports=False)

In [9]:
# Mean of each of the first num_features + 1 columns of the balanced sample.
[np.mean(balanced_fit[:, idx]) for idx in range(num_features + 1)]

[23296.14492494359,
 354.4712531963178,
 2202.834823334572,
 26.394092978247084,
 0.7403727784842284,
 6.4208739266266965,
 48.25709298630713,
 27.505034344566976,
 6.127750555447494,
 6888.283798799059,
 -0.21749126743075584,
 0.49950435750439626]

In [10]:
# Reuse the scaler that was fitted on the full feature set. Do NOT create a
# new StandardScaler here and, above all, do NOT refit it on the balanced
# subset: the subset must be scaled with the same parameters as the full data.
# copy=False only *tries* to scale in place, so the result is written back
# explicitly to make sure the scaling is never silently discarded.
balanced_fit[:, :-1] = scaler.transform(balanced_fit[:, :-1])
# balanced_fit now holds scaled values (the unscaled values are gone);
# expose it under a clearer name. This is the same array, not a copy.
scaled_balanced_fit = balanced_fit

In [11]:
# Mean of each of the first num_features + 1 columns after scaling.
[np.mean(scaled_balanced_fit[:, idx]) for idx in range(num_features + 1)]

[-0.11944459975653862,
 0.055755755941762714,
 -0.12669210827118677,
 0.49732652814859485,
 0.0707018260772035,
 0.20941068744381267,
 -0.19261029852554243,
 0.054477874701126534,
 0.9021480612348493,
 0.5987402725495994,
 -0.10209079927433425,
 0.49950435750439626]

In [12]:
# Persist the scaled, balanced sample; the context manager makes sure the
# gzip stream is flushed and closed.
with gzip.open('scaled_balanced_fit.npy.gz', 'wb') as scaled_balanced_file:
    np.save(scaled_balanced_file, scaled_balanced_fit, allow_pickle=False, fix_imports=False)

In [13]:
# Display the shape of the scaled, balanced array.
scaled_balanced_fit.shape

(1084451, 12)

In [14]:
# Boolean mask with one uniform draw per row; a row is marked True when its
# draw exceeds 0.8, i.e. roughly 20% of the rows.
cv_indexes = np.random.random(len(scaled_balanced_fit)) > 0.8

In [15]:
# Split the scaled data with the mask: masked rows form the CV set, the
# remaining rows form the "100%" test set. Index scaled_balanced_fit (the
# array the mask was sized from) rather than its alias balanced_fit, so the
# cell does not depend on the two names pointing at the same array.
scaled_balanced_fit_cv = scaled_balanced_fit[cv_indexes]
scaled_balanced_fit_100_test = scaled_balanced_fit[~cv_indexes]

In [16]:
# Persist both parts of the split; the context managers make sure each gzip
# stream is flushed and closed.
with gzip.open('scaled_balanced_fit_cv.npy.gz', 'wb') as cv_file:
    np.save(cv_file, scaled_balanced_fit_cv, allow_pickle=False, fix_imports=False)
with gzip.open('scaled_balanced_fit_100_test.npy.gz', 'wb') as test_file:
    np.save(test_file, scaled_balanced_fit_100_test, allow_pickle=False, fix_imports=False)

In [17]:
# Save random subsamples (about 10% and about 1%) of the "100%" test set.
# Each subsample uses its own independent random draw, so the 1% sample is
# not necessarily a subset of the 10% sample.
for file_name, keep_fraction in (('scaled_balanced_fit_10_test.npy.gz', 0.1),
                                 ('scaled_balanced_fit_1_test.npy.gz', 0.01)):
    keep_mask = np.random.random(scaled_balanced_fit_100_test.shape[0]) <= keep_fraction
    # The context manager makes sure the gzip stream is flushed and closed.
    with gzip.open(file_name, 'wb') as out_file:
        np.save(out_file, scaled_balanced_fit_100_test[keep_mask],
                allow_pickle=False, fix_imports=False)