In [15]:
import numpy as np
from google.colab import drive
import csv
import sklearn.model_selection
# Print arrays with three decimals and without scientific notation so output is easier to read
np.set_printoptions(suppress=True, precision=3)

#Function to load and preprocess the dataset. Inputs: the dataset path relative to the Google Drive "MyDrive" folder,
#and the fractions of the data to use for the train, test, and validation splits.
#Function outputs the standardized train, val, and test features with their labels, as well as the mean and std
#(computed on the training split) that were used to standardize.
#Output is: X_train, y_train, X_val, y_val, X_test, y_test, mean, std
def process_dataset(path='SolPred/curated-solubility-dataset.csv', train_percentage=0.7, test_percentage=0.15, val_percentage=0.15):
    """Load a CSV dataset from Google Drive, split it, and standardize the features.

    Args:
        path: Location of the CSV file relative to ``/content/drive/MyDrive/``.
        train_percentage: Fraction of all samples placed in the training split
            (forwarded to ``train_test_split`` as ``train_size``).
        test_percentage: Weight of the test split. The samples not used for
            training are divided between validation and test in the ratio
            ``val_percentage : test_percentage``.
        val_percentage: Weight of the validation split (see ``test_percentage``).

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, X_mean, X_std)``.
        The feature arrays are standardized with the per-feature mean and std
        of the training split. ``X_std`` is the divisor that was actually
        applied: features with zero training std are reported as 1.0 so that
        re-applying ``(x - X_mean) / X_std`` to new data never divides by zero.
    """

    def loaddataset(path):
        """Mount Google Drive, read the CSV, and return (features, labels) as float arrays.

        The column indices are specific to the current dataset; if the dataset
        changes, update them here.
        """
        drive.mount('/content/drive/')
        # newline='' is what the csv module recommends so that line endings
        # inside quoted fields are handled by the reader itself.
        with open('/content/drive/MyDrive/' + path, 'r', newline='') as f:
            # Empty cells are dropped from every row before stacking.
            # NOTE(review): this shifts later columns left in any row that has
            # an empty cell -- presumably only trailing cells are empty; confirm.
            data = np.array([list(filter(None, row)) for row in csv.reader(f)])

        # Row 0 is skipped (presumably the header line -- confirm against the file).
        X = data[1:, 9:-1]  # feature columns: index 9 up to, but excluding, the last
        y = data[1:, 5]     # label column
        return X.astype(np.double), y.astype(np.double)

    def traintestsplit_and_normalize(train_p, test_p, val_p, features, labels):
        """Split into train/val/test and standardize with training statistics.

        Returns X_train, y_train, X_val, y_val, X_test, y_test, train_mean, train_std.
        """
        # First carve off the training portion, then divide the remainder
        # between validation and test according to their relative weights.
        X_train, X_rest, y_train, y_rest = sklearn.model_selection.train_test_split(
            features, labels, train_size=train_p, shuffle=True)
        X_val, X_test, y_val, y_test = sklearn.model_selection.train_test_split(
            X_rest, y_rest, train_size=val_p / (val_p + test_p))

        # np.asfarray was removed in NumPy 2.0; this is the documented replacement.
        X_train = np.asarray(X_train, dtype=np.double)
        X_val = np.asarray(X_val, dtype=np.double)
        X_test = np.asarray(X_test, dtype=np.double)

        # Statistics come from the training split only, so nothing about the
        # validation/test samples leaks into the normalization.
        X_mean = np.mean(X_train, axis=0)
        X_std = np.std(X_train, axis=0)
        # A feature that is constant over the training split has std == 0;
        # dividing by it would fill that column with NaN/inf. Use 1.0 instead,
        # which leaves such a feature centred at 0.
        X_std = np.where(X_std == 0, 1.0, X_std)

        X_train = (X_train - X_mean) / X_std
        X_val = (X_val - X_mean) / X_std
        X_test = (X_test - X_mean) / X_std
        return X_train, y_train, X_val, y_val, X_test, y_test, X_mean, X_std

    print('-----------------------------------------------------')
    print('loading dataset...')
    print('-----------------------------------------------------')

    X, y = loaddataset(path)

    print('num_samples, num_features', X.shape)
    print('labels', y.shape)

    print('-----------------------------------------------------')
    print('Splitting dataset into train/test/val and normalizing')
    print('-----------------------------------------------------')

    # Keyword arguments on purpose: the helper's order is (train, test, val),
    # and passing the fractions positionally in a different order silently
    # swaps the validation and test sizes.
    X_train, y_train, X_val, y_val, X_test, y_test, X_mean, X_std = traintestsplit_and_normalize(
        train_p=train_percentage,
        test_p=test_percentage,
        val_p=val_percentage,
        features=X,
        labels=y,
    )

    # Sanity check: training features should come out at mean ~0 and std ~1.
    for split_name, split in (("Training", X_train), ("Validation", X_val), ("Testing", X_test)):
        print(f"{split_name} features mean: {np.mean(split)!s}, std: {np.std(split)!s}, shape: {split.shape!s}")

    return X_train, y_train, X_val, y_val, X_test, y_test, X_mean, X_std

#Run the pipeline with its default path and split fractions, keeping every split at module level
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    mean, std,
) = process_dataset()

-----------------------------------------------------
loading dataset...
-----------------------------------------------------
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
num_samples, num_features (9982, 19)
labels (9982,)
-----------------------------------------------------
Splitting dataset into train/test/val and normalizing
-----------------------------------------------------
Training features mean: -5.310618234022367e-16, std: 0.9999999999999959, shape: (6987, 19)
Validation features mean: -0.0025389798364047877, std: 0.9022766022749542, shape: (1497, 19)
Testing features mean: -0.012837901941694582, std: 1.055637237674046, shape: (1498, 19)
