In [1]:
import numpy as np
import os
import pandas as pd
np.random.seed(42)

### 1. Undersampling QMD data

In [2]:
# function for loading data
def load_input_data(path):
    """Read a header-less CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        File contents; because ``header=None`` is passed, the first line is
        treated as data and the columns are labelled 0..n-1.
    """
    return pd.read_csv(os.path.join(path), header=None)

# Read the two yearly QMD acquisitions (2019 first, then 2020)
QMD_2019, QMD_2020 = (
    load_input_data("data/QMD/QMD_2019.csv"),
    load_input_data("data/QMD/QMD_2020.csv"),
)

In [3]:
# Undersample each year's QMD data separately with a fixed seed.
# NOTE(review): the two fractions differ slightly (0.75 vs 0.7505) -- presumably
# tuned to reach a specific total row count; confirm and document the target.
QMD_2019_samples = QMD_2019.sample(frac=0.75, random_state=42)
QMD_2020_samples = QMD_2020.sample(frac=0.7505, random_state=42)

# Stack both samples and shuffle the rows. DataFrame.append was removed in
# pandas 2.0; pd.concat with ignore_index=True builds the same frame.
QMD_train = (
    pd.concat([QMD_2019_samples, QMD_2020_samples], ignore_index=True)
    .sample(frac=1, random_state=42)
)
# Assign target vector
QMD_train['Target'] = 'QMD'

### 2. Oversampling GA, HO, LO data using SMOTE

In [4]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [5]:
# Load GA, HO, and LO data
GA_train = load_input_data("data/GA/GA_train.csv")
HO_train = load_input_data("data/HO/HO_train.csv")
LO_train = load_input_data("data/LO/LO_train.csv")

# Assign target vector (type of the lithology)
GA_train['Target']='GA'
HO_train['Target']='HO'
LO_train['Target']='LO'

In [6]:
# Combine all data
# DataFrame.append was removed in pandas 2.0, so the four class frames are
# stacked with a single pd.concat. The frame order (QMD, GA, LO, HO) matches
# the original append chain, so the seeded shuffle yields the same row order.
train_data = (
    pd.concat([QMD_train, GA_train, LO_train, HO_train], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)  # discard the shuffled labels; renumber rows 0..n-1
)
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Target
0,0,22,31,33,29,30,9,10,2,1,...,0,0,0,0,0,0,0,0,0,LO
1,0,11,22,19,26,12,9,6,1,0,...,0,0,0,0,0,0,0,0,0,HO
2,0,15,28,25,34,16,10,5,3,0,...,0,0,0,0,0,0,0,0,0,GA
3,0,15,26,26,31,22,19,8,7,0,...,0,0,0,0,0,0,0,0,0,QMD
4,0,18,28,24,25,28,18,13,2,0,...,0,0,0,0,0,0,0,0,0,QMD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6149,0,23,26,31,33,15,21,9,5,1,...,0,0,0,0,0,0,0,0,0,LO
6150,0,12,21,20,19,21,7,4,2,1,...,0,0,0,0,0,0,0,0,0,HO
6151,0,9,26,16,13,10,17,6,0,0,...,0,0,0,0,0,0,0,0,0,HO
6152,0,18,27,22,30,22,8,4,4,1,...,0,0,0,0,0,0,0,0,0,HO


In [7]:
# Positional split of the combined frame: the first 2048 columns are the
# features, and the column at position 2048 ('Target') holds the class label.
X_train, y_train = train_data.iloc[:, :2048], train_data.iloc[:, 2048]

In [8]:
# Create the SMOTE oversampler; random_state is fixed at 42, the same seed
# used for the sampling/shuffling steps earlier in this notebook.
sm = SMOTE(random_state=42)

In [9]:
# Resample the training split with SMOTE, then report how many rows each
# class has afterwards.
X_res, y_res = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % (Counter(y_res),))

Resampled dataset shape Counter({'LO': 1800, 'HO': 1800, 'GA': 1800, 'QMD': 1800})


In [10]:
# Re-attach the resampled labels as a 'Target' column so features and labels
# travel together. This mutates X_res in place: from this cell onward X_res
# holds the label column as well as the features.
X_res["Target"] = y_res
X_res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Target
0,0,22,31,33,29,30,9,10,2,1,...,0,0,0,0,0,0,0,0,0,LO
1,0,11,22,19,26,12,9,6,1,0,...,0,0,0,0,0,0,0,0,0,HO
2,0,15,28,25,34,16,10,5,3,0,...,0,0,0,0,0,0,0,0,0,GA
3,0,15,26,26,31,22,19,8,7,0,...,0,0,0,0,0,0,0,0,0,QMD
4,0,18,28,24,25,28,18,13,2,0,...,0,0,0,0,0,0,0,0,0,QMD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,0,19,17,24,27,17,16,12,3,1,...,0,0,0,0,0,0,0,0,0,LO
7196,0,16,24,28,33,21,14,10,2,0,...,0,0,0,0,0,0,0,0,0,LO
7197,0,17,23,24,19,22,14,3,2,0,...,0,0,0,0,0,0,0,0,0,LO
7198,0,16,23,27,29,19,9,7,3,0,...,0,0,0,0,0,0,0,0,0,LO


In [11]:
# Write the resampled frame (X_res, including its 'Target' column) to CSV.
# The header row is written and any existing file is overwritten (pandas
# defaults); the row index is left out.
output_path = 'data/train_data.csv'
X_res.to_csv(output_path, index=False)