### **FUNCTION SPLIT - DATA PREPROCESSING**

In [None]:
### Import libraries
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import pandas as pd
import copy

### Split the data into train / validation / test sets (60% / 20% / 20%)
# Shuffle every row once, then slice by position. Positional slicing yields the
# same three frames as np.split(df.sample(frac=1), [...]) but does not route a
# DataFrame through NumPy, which emits a FutureWarning on recent pandas
# (np.split relies on the deprecated DataFrame.swapaxes).
_shuffled_df = df.sample(frac=1)
_train_end = int(0.6 * len(df))  # first 60% of the shuffled rows -> train
_valid_end = int(0.8 * len(df))  # next 20% -> valid, remainder -> test
train = _shuffled_df.iloc[:_train_end]
valid = _shuffled_df.iloc[_train_end:_valid_end]
test = _shuffled_df.iloc[_valid_end:]

### Standardize the features and split a dataframe into (data, X, y)
### dataframe is one of the splits (train, test, valid)
def getX_y_scaler(dataframe, y_label, x_labels=None, oversample=False):
  """Build standardized features and the target array from one dataframe.

  Args:
    dataframe: Table holding the feature columns and the target column.
    y_label: Name of the target column.
    x_labels: Feature column names; when None, every column except
      ``y_label`` is used.
    oversample: When True, (X, y) is resampled with RandomOverSampler after
      the features have been scaled.

  Returns:
    A tuple ``(data, X, y)`` where ``data`` is X with y appended as the
    last column.
  """
  frame = copy.deepcopy(dataframe)  # work on a private copy of the input

  if x_labels is None:
    feature_cols = [name for name in frame.columns if name != y_label]
    features = frame[feature_cols].values
  elif len(x_labels) == 1:
    # A single selected column is reshaped into an (n, 1) column vector.
    features = frame[x_labels[0]].values.reshape(-1, 1)
  else:
    features = frame[x_labels].values

  target = frame[y_label].values

  # The scaler is fitted on this dataframe only.
  features = StandardScaler().fit_transform(features)

  if oversample:
    features, target = RandomOverSampler().fit_resample(features, target)

  stacked = np.hstack((features, np.reshape(target, (-1, 1))))
  return stacked, features, target

### **SOLVE THE MISSING VALUES WITH MEAN**

In [None]:
### Import libraries
from sklearn.impute import SimpleImputer

### Define the function
def solve_missing_values(dataframe):
  """Fill missing values in numeric columns with the column mean.

  Every numeric column that contains at least one missing value has its gaps
  replaced by the mean of its observed values; the whole column is then
  rounded to 3 decimals. Non-numeric columns, and columns without a single
  observed value, are left untouched because no mean can be computed for
  them.

  Args:
    dataframe: pandas DataFrame to clean. It is modified in place.

  Returns:
    The same DataFrame object that was passed in.
  """
  # Local import so this cell does not depend on another cell's imports.
  from pandas.api.types import is_numeric_dtype

  # Snapshot the affected columns first; the loop below reassigns columns.
  cols_with_gaps = [
    col for col in dataframe.columns if dataframe[col].isnull().any()
  ]
  for col in cols_with_gaps:
    column = dataframe[col]
    # count() is the number of non-null entries; zero means nothing to average.
    if not is_numeric_dtype(column) or column.count() == 0:
      continue
    dataframe[col] = column.fillna(column.mean()).round(3)
  return dataframe

### **SCORE OF MODEL**

In [None]:
### Import scoring metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, f1_score

### Define the function
def score_model(model, X_score, y_score):
  """Print the model, its own score, and a set of regression/classification metrics.

  Args:
    model: Fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_score: Features to evaluate on.
    y_score: True targets for ``X_score``.

  Returns:
    None. All results are printed.

  Both regression metrics (MSE, MAE, R2) and classification metrics
  (accuracy, F1) are attempted. A metric that rejects the given
  targets/predictions is reported as not applicable, together with the
  reason, so the remaining metrics are still printed.
  """
  print(model)
  print('Score: ', model.score(X_score, y_score))

  # Predict once and reuse the result for every metric.
  predictions = model.predict(X_score)

  metrics = (
    ("MSE: ", mean_squared_error),
    ("MAE: ", mean_absolute_error),
    ("R2: ", r2_score),
    ("Accuracy: ", accuracy_score),
    ("F1: ", f1_score),
  )
  for label, metric in metrics:
    try:
      value = metric(y_score, predictions)
    except (TypeError, ValueError) as err:
      # e.g. a classification metric applied to continuous predictions.
      print(label, f"not applicable ({err})")
    else:
      print(label, value)

### **PLOT PREDICTIONS VALUES, USE WHEN ALL ARE SAME SHAPE**

In [None]:
### Import libraries
import matplotlib.pyplot as plt

### Define the function
def plot_predictions(train_data, train_labels, test_data, test_labels, predictions):
    """Scatter-plot training data, testing data and predictions on one figure.

    Args:
        train_data: x-values of the training points (drawn in blue).
        train_labels: y-values of the training points.
        test_data: x-values of the testing points (drawn in green).
        test_labels: y-values of the testing points.
        predictions: Predicted y-values, drawn in red at the ``test_data``
            x-positions.

    Returns:
        None. A new 10x7 matplotlib figure with a legend is created.
    """
    plt.figure(figsize=(10, 7))
    plt.scatter(train_data, train_labels, c="b", label="Training data")
    plt.scatter(test_data, test_labels, c="g", label="Testing data")
    # Predictions share the test x-values so they line up with the green points.
    plt.scatter(test_data, predictions, c="r", label="Predictions")
    plt.legend()

### **TRAIN - TEST - VAL**
**MANUAL SPLIT**

In [None]:
### Import libraries
import tensorflow as tf

### Convert the dataframe to a float32 tensor and shuffle its rows
tf_data = tf.constant(df)
tf_data = tf.cast(tf_data, tf.float32)
tf_data = tf.random.shuffle(tf_data)
X = tf_data[:, 3:-1]  # columns from index 3 up to, but excluding, the last one
y = tf_data[:, -1]    # last column
y = tf.expand_dims(y, axis=-1)  # shape (n,) -> (n, 1)

### Definitions used to split the data
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1  # the test split takes whatever remains after train + val
DATASET_SIZE = len(X)

train_end = int(DATASET_SIZE * TRAIN_RATIO)
val_end = int(DATASET_SIZE * (TRAIN_RATIO + VAL_RATIO))


def _to_batched_dataset(features, labels):
  """Wrap (features, labels) in a shuffled, batched, prefetched tf.data.Dataset."""
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.shuffle(buffer_size=8, reshuffle_each_iteration=True)
  return dataset.batch(32).prefetch(tf.data.AUTOTUNE)


X_train = X[:train_end]
y_train = y[:train_end]
train_dataset = _to_batched_dataset(X_train, y_train)

# Each split builds its pipeline from its OWN slices. Previously the
# validation and test pipelines were derived from train_dataset, so they
# contained (re-batched) training data instead of the held-out rows.
X_val = X[train_end:val_end]
y_val = y[train_end:val_end]
val_dataset = _to_batched_dataset(X_val, y_val)

X_test = X[val_end:]
y_test = y[val_end:]
test_dataset = _to_batched_dataset(X_test, y_test)