In [1]:
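# Upstream Ag1000G phase 1 (AR3) crosses data, HDF5 files for 3L and 3R
# (presumably the origin of the inputs loaded below -- confirm):
# ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5
# ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3R.h5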


In [2]:
import pickle
import gzip
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import preprocessing 

%matplotlib inline

In [3]:
# Load the inputs. Each file is opened in a `with` block so the handle is
# closed deterministically instead of being left to the garbage collector.

# SECURITY NOTE: pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
with gzip.open('mendelian_errors.pickle.gz', 'rb') as errors_file:
    mendelian_errors = pickle.load(errors_file)

# np.load reads a .npy array completely, so the handle can be closed right
# after the call (assumes both files are plain .npy arrays -- TODO confirm
# for 'ordered_features', which has no extension).
with gzip.open('feature_fit.npy.gz', 'rb') as fit_file:
    feature_fit = np.load(fit_file)
with open('ordered_features', 'rb') as features_file:
    ordered_features = np.load(features_file)

num_features = len(ordered_features)

In [4]:
# Tally the observations: an entry counts as an error when the first element
# of its value is greater than zero.
total_observations = len(mendelian_errors)
error_observations = sum(1 for entry in mendelian_errors.values() if entry[0] > 0)
ok_observations = total_observations - error_observations
fraction_errors = error_observations / total_observations
# Printed as: total, error-free, errors, error percentage.
print(total_observations, ok_observations, error_observations, 100 * fraction_errors)

10905732 10364044 541688 4.9670026734564905


In [5]:
# Probability with which accept_entry keeps a row whose label is not 1.
# Using the ratio of error to error-free counts means the expected number of
# kept error-free rows is ok_observations * prob_ok_choice == error_observations
# (which equals total_observations * fraction_errors), i.e. the two classes end
# up roughly the same size.
prob_ok_choice = error_observations / ok_observations

def accept_entry(row):
    """Decide whether a row is kept in the balanced sample.

    A row whose last element equals 1 is always kept. Any other row is kept
    only when a fresh ``random.random()`` draw is at most ``prob_ok_choice``.
    """
    is_flagged = row[-1] == 1
    return True if is_flagged else random.random() <= prob_ok_choice

# Row-wise wrapper around accept_entry: the '(i)->()' signature hands each 1-D
# row of a 2-D array to the function and collects one boolean per row.
# Kept so the name stays available, but np.vectorize is a Python-level loop and
# is far too slow to be the main path for millions of rows.
accept_entry_v = np.vectorize(accept_entry, signature='(i)->()')

# Same acceptance rule as accept_entry, evaluated with whole-array operations:
# keep every row whose last column equals 1, and keep each remaining row when a
# uniform [0, 1) draw is at most prob_ok_choice.
# Note: the draws come from NumPy's global generator rather than the `random`
# module; the sampling distribution is unchanged.
accepted_entries = (feature_fit[:, -1] == 1) | (
    np.random.random(feature_fit.shape[0]) <= prob_ok_choice)

In [6]:
# Keep only the accepted rows, then release the full array to free memory.
balanced_fit = feature_fit[accepted_entries]
del feature_fit
# Class balance check: (rows with label 1, rows with label 0). Counted with a
# vectorised comparison on the last column instead of building Python lists.
(int(np.count_nonzero(balanced_fit[:, -1] == 1)),
 int(np.count_nonzero(balanced_fit[:, -1] == 0)))

(541688, 540490)

In [7]:
# Persist the balanced (still unscaled) data. The `with` block closes the gzip
# stream explicitly so the compressed file is fully flushed and finalised.
with gzip.open('balanced_fit.npy.gz', 'wb') as balanced_file:
    np.save(balanced_file, balanced_fit, allow_pickle=False, fix_imports=False)

In [8]:
# Mean of every column of the balanced data: the num_features feature columns
# followed by the trailing label column.
[balanced_fit[:, col_idx].mean() for col_idx in range(num_features + 1)]

[2202.1343180142267,
 26.486503596858295,
 0.7413410732799964,
 27.331494449157162,
 6.1457497359660325,
 6920.343751865484,
 -0.21837156654619705,
 0.5005535133776514]

In [9]:
# Standardise the feature columns (everything except the trailing label
# column) with scikit-learn's StandardScaler.
scaler = preprocessing.StandardScaler(copy=False)
# copy=False only *requests* in-place scaling; scikit-learn may still work on
# and return a copy. Writing the result back into the slice guarantees that
# balanced_fit holds the scaled features in either case, instead of relying on
# a side effect of a call whose return value is discarded.
balanced_fit[:, :-1] = scaler.fit_transform(balanced_fit[:, :-1])
# balanced_fit itself now contains the scaled values (the unscaled array is no
# longer available in memory); scaled_balanced_fit is a clearer name for the
# same object.
scaled_balanced_fit = balanced_fit

In [11]:
# Mean of every column after scaling: the num_features feature columns followed
# by the trailing label column, which the scaler was not applied to.
[scaled_balanced_fit[:, col_idx].mean() for col_idx in range(num_features + 1)]

[2.906836767848834e-16,
 5.230362688083622e-17,
 4.384680144491894e-17,
 2.2796659870733538e-17,
 1.0184959329343855e-16,
 -1.1779296789538328e-13,
 -1.480601037111294e-17,
 0.5005535133776514]

In [12]:
# Persist the scaled data. The `with` block closes the gzip stream explicitly
# so the compressed file is fully flushed and finalised.
with gzip.open('scaled_balanced_fit.npy.gz', 'wb') as scaled_file:
    np.save(scaled_file, scaled_balanced_fit, allow_pickle=False, fix_imports=False)

In [13]:
# Display the dimensions of the scaled array as (rows, columns).
scaled_balanced_fit.shape

(1082178, 8)

In [28]:
# Boolean row mask for the split: one uniform [0, 1) draw per row, True where
# the draw exceeds 0.8 (about 20% of the rows in expectation).
# NOTE(review): no random seed is set, so the split differs between runs.
cv_indexes = np.random.random(len(scaled_balanced_fit)) > 0.8

In [32]:
# Split the scaled rows with the boolean mask: rows where cv_indexes is True go
# to the cv set, all remaining rows form the full ("100") test set.
# Index scaled_balanced_fit explicitly rather than balanced_fit, so the split
# does not silently depend on both names referring to the same scaled array.
scaled_balanced_fit_cv = scaled_balanced_fit[cv_indexes]
scaled_balanced_fit_100_test = scaled_balanced_fit[~cv_indexes]

In [33]:
# Persist both parts of the split. Each gzip stream is closed by its `with`
# block so the compressed files are fully flushed and finalised.
with gzip.open('scaled_balanced_fit_cv.npy.gz', 'wb') as cv_file:
    np.save(cv_file, scaled_balanced_fit_cv, allow_pickle=False, fix_imports=False)
with gzip.open('scaled_balanced_fit_100_test.npy.gz', 'wb') as test_file:
    np.save(test_file, scaled_balanced_fit_100_test, allow_pickle=False, fix_imports=False)

In [39]:
# Save two smaller random subsamples of the full test set. Every row is kept
# independently when a uniform [0, 1) draw is at most the given fraction, and
# the two subsamples are drawn independently of each other (the 1% file is not
# a subset of the 10% file).
for subsample_path, keep_fraction in (
        ('scaled_balanced_fit_10_test.npy.gz', 0.1),
        ('scaled_balanced_fit_1_test.npy.gz', 0.01)):
    # `with` closes the gzip stream so the file is fully flushed and finalised.
    with gzip.open(subsample_path, 'wb') as subsample_file:
        np.save(
            subsample_file,
            scaled_balanced_fit_100_test[
                np.random.random(scaled_balanced_fit_100_test.shape[0]) <= keep_fraction],
            allow_pickle=False, fix_imports=False)