# Pre-Processing

Importing Libraries

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

# Helper Functions to keep notebook clean
import functions as func

Define file and folder paths

In [2]:
# Current working directory of the kernel, with a trailing "/" appended
PATH = f"{os.getcwd()}/"

In [3]:

# Build the feature folder path with os.path.join so the separator matches the
# host OS; the previous hard-coded "DATA\\feat\\" only resolved on Windows.
# The trailing "" keeps the trailing separator the original string carried.
folder_path = os.path.join("DATA", "feat", "")

# load_feat is a project helper from functions.py; it returns the loaded data
# and the descriptor names (presumably one name per loaded dataset).
imported_data, descriptors = func.load_feat(folder_path)

In [5]:
# Show the descriptor names returned by func.load_feat
print(descriptors)

['Composition_based_feature_vectors', 'Ewald_Site_Energy', 'Smooth_Overlap_of_Atomic_Positions', 'Structural_Heterogenity']


## Normalizing data & Splitting Data

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

In [30]:

def split_normalize(data, y_ind, test_size=0.33, random_state=42, stratify=False):
    """Split each feature table into train/test sets and scale the features.

    For every DataFrame in ``data`` the first column is used as the row index,
    column ``y_ind`` is taken as the target, and the first four columns are
    excluded from the feature matrix. The features are standardized with
    statistics fitted on the training part only, then passed through
    ``sklearn.preprocessing.normalize`` (unit L2 norm per sample by default).

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Feature tables; the inputs are not modified.
    y_ind : int
        Positional index of the target column (1, 2 or 3 in this notebook).
    test_size : float, default 0.33
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.
    stratify : bool, default False
        If True, preserve the class proportions of the target in both parts.
        Off by default to keep the original split unchanged.

    Returns
    -------
    list of list
        One ``[y_train, y_test, X_train, X_test]`` entry per input table.
    """
    split_list = []
    for df in data:
        # Use the first *column* as the row index; set_index returns a new
        # frame and, given a Series, leaves the column positions unchanged.
        df = df.set_index(df.iloc[:, 0])

        # Target is selected by position; features are everything after the
        # first four columns (positional slice instead of dropping by label).
        y = df.iloc[:, y_ind]
        X = df.iloc[:, 4:]

        X_tr_un, X_te_un, y_tr, y_te = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=y if stratify else None,
        )

        # Fresh scaler per table, fitted on the training part only so no
        # information from the test part leaks into the scaling.
        scaler = StandardScaler()
        X_tr = normalize(scaler.fit_transform(X_tr_un))
        X_te = normalize(scaler.transform(X_te_un))

        split_list.append([y_tr, y_te, X_tr, X_te])
    return split_list

The `y_ind` parameter selects which of the three NTE classification targets is used (1: directional NTE, 2: large directional NTE, 3: volumetric NTE).

In [31]:
# Index of the target column handed to split_normalize
y_ind = 3

# One [y_train, y_test, X_train, X_test] entry per imported feature table
list_split = split_normalize(data=imported_data, y_ind=y_ind)

## Over-/Undersampling

In [34]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [41]:

def sample(data, desc, over_ratio=0.5, under_ratio=0.8, random_state=42):
    """Rebalance the training part of every split by over- then undersampling.

    The training data is first randomly oversampled and then randomly
    undersampled, using ``over_ratio`` and ``under_ratio`` as the
    ``sampling_strategy`` of the respective resampler. The class counts are
    printed before and after each step. The test part is passed through
    untouched.

    Parameters
    ----------
    data : sequence
        Entries of the form ``[y_train, y_test, X_train, X_test]``.
    desc : sequence
        Descriptor names, indexed in parallel with ``data`` (used for logging).
    over_ratio : float, default 0.5
        ``sampling_strategy`` for ``RandomOverSampler``.
    under_ratio : float, default 0.8
        ``sampling_strategy`` for ``RandomUnderSampler``.
    random_state : int or None, default 42
        Seed for both resamplers so repeated runs give the same result;
        pass None for unseeded sampling.

    Returns
    -------
    list of list
        ``[y_train_resampled, y_test, X_train_resampled, X_test]`` per entry.
    """
    list_sampled = []
    # enumerate replaces the manual counter; the loop variable no longer
    # shadows the builtin `list`.
    for i, split in enumerate(data):
        print("--- Sampling for: " + str(desc[i]) + " ---")
        y = split[0]
        X = split[2]

        # Class distribution before resampling
        print(Counter(y))

        over = RandomOverSampler(sampling_strategy=over_ratio, random_state=random_state)
        X, y = over.fit_resample(X, y)
        # Class distribution after oversampling
        print(Counter(y))

        under = RandomUnderSampler(sampling_strategy=under_ratio, random_state=random_state)
        X, y = under.fit_resample(X, y)
        # Class distribution after undersampling
        print(Counter(y))

        # Only the training part is resampled; test targets/features are kept
        list_sampled.append([y, split[1], X, split[3]])
    return list_sampled

In [42]:
# The output of split_normalize is stored in `list_split`; the name
# `split_list` only exists inside that function, so using it here raised a
# NameError on a fresh kernel.
list_sampled = sample(list_split, descriptors)

--- Sampling for: Composition_based_feature_vectors ---
Counter({0.0: 1812, 1.0: 129})
Counter({0.0: 1812, 1.0: 906})
Counter({0.0: 1132, 1.0: 906})
--- Sampling for: Ewald_Site_Energy ---
Counter({0.0: 955, 1.0: 97})
Counter({0.0: 955, 1.0: 477})
Counter({0.0: 596, 1.0: 477})
--- Sampling for: Smooth_Overlap_of_Atomic_Positions ---
Counter({0.0: 1112, 1.0: 110})
Counter({0.0: 1112, 1.0: 556})
Counter({0.0: 695, 1.0: 556})
--- Sampling for: Structural_Heterogenity ---
Counter({0.0: 1091, 1.0: 103})
Counter({0.0: 1091, 1.0: 545})
Counter({0.0: 681, 1.0: 545})


In [44]:
# Report (number of descriptor sets, entries per set) from the list lengths.
# np.shape() would first coerce the nested list into an array; its entries
# have different lengths, which NumPy >= 1.24 rejects with a ValueError.
display((len(list_sampled), len(list_sampled[0]) if list_sampled else 0))

(4, 4)

In [45]:
data_final = 'DATA/pickle/data_final.pkl'
data_descriptors = 'DATA/pickle/descriptors.pkl'

# open(..., 'wb') does not create missing folders, so make sure the output
# directory exists before writing.
for out_path in (data_final, data_descriptors):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Persist the resampled splits
with open(data_final, 'wb') as f:
    pickle.dump(list_sampled, f)

# Persist the descriptor names alongside them
with open(data_descriptors, 'wb') as g:
    pickle.dump(descriptors, g)