# Import packages

In [40]:
import pickle  # Local Python (3.8) is fine with this. If you're using Google
# Colab, which uses Python 3.6, you need to do `import pickle5 as pickle`
# instead
import cloudpickle as cp
from urllib.request import urlopen

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from collinearity import SelectNonCollinear

from sklearn.decomposition import PCA
from sklearn.svm import SVR, SVC

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, \
    StratifiedKFold, RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, mean_absolute_error, \
    r2_score, make_scorer, recall_score, accuracy_score, f1_score, \
    precision_score, balanced_accuracy_score, roc_curve, auc

# Custom functions

In [2]:
from dataset_expansion import dataset_feature_expansion, merge_multiple_dfs
from dataset_cleanup import filter_low_variance
from dataset_plot import simple_pie_plot
from pickle_managment import save_pickle, load_pickle



Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


# Convert and clean .tsv files to .csv files

## Regression

In [None]:
# Raw string keeps the Windows-style backslashes literal instead of relying on
# invalid escape sequences (same resulting path, no escape warnings).
regression_tsv = r'datasets\original_datasets\B3DB_regression.tsv'
# `read_tsv_to_df` is never imported or defined in this notebook, so the
# tab-separated file is read directly with pandas.
regression_df = pd.read_csv(regression_tsv, sep='\t')
regression_df

In [None]:
# Keep only the SMILES and logBB columns of the regression dataset.
regression_df_cleaned = regression_df.loc[:, ['SMILES', 'logBB']]

regression_df_cleaned

In [None]:
# Write the cleaned regression dataset to CSV without the index column.
# Raw string: the backslashes are literal path separators, not escapes.
regression_df_cleaned.to_csv(
    r'datasets\cleaned_datasets\BBB_regression.csv',
    index=False
)

print('Done!')

## Classification

In [None]:
# Raw string keeps the Windows-style backslashes literal instead of relying on
# invalid escape sequences (same resulting path, no escape warnings).
classification_tsv = r'datasets\original_datasets\B3DB_classification.tsv'
# `read_tsv_to_df` is never imported or defined in this notebook, so the
# tab-separated file is read directly with pandas.
classification_df = pd.read_csv(classification_tsv, sep='\t')
classification_df

In [None]:
# Keep only the SMILES and BBB+/BBB- label columns of the classification
# dataset.
classification_df_cleaned = classification_df.loc[:, ['SMILES', 'BBB+/BBB-']]

classification_df_cleaned

In [None]:
# Write the cleaned classification dataset to CSV without the index column.
# Raw string: the backslashes are literal path separators, not escapes.
classification_df_cleaned.to_csv(
    r'datasets\cleaned_datasets\BBB_classification.csv',
    index=False
)

print('Done!')

# Read in data

## Regression

In [None]:
# Load the cleaned regression dataset (SMILES + logBB).
# Raw string: the backslashes are literal path separators, not escapes.
regression_df = pd.read_csv(r'datasets\cleaned_datasets\BBB_regression.csv')
regression_df

In [None]:
# BBB- is the complement of the BBB+ rule (logBB >= -1), so use a strict
# `< -1`; a `<= -1.01` cut-off would skip values between -1.01 and -1.
(regression_df['logBB'] < -1).sum()  #These are BBB-

In [None]:
# Number of rows whose logBB is at or above the -1 cut-off.
regression_df['logBB'].ge(-1).sum()  #These are BBB+

## Classification

In [None]:
# Load the cleaned classification dataset (SMILES + BBB+/BBB- label).
# Raw string: the backslashes are literal path separators, not escapes.
classification_df = pd.read_csv(
    r'datasets\cleaned_datasets\BBB_classification.csv'
)
classification_df

In [None]:
# Number of rows per BBB+/BBB- label.
classification_df.loc[:, 'BBB+/BBB-'].value_counts()

# Dataset expansion & cleaning
Major expansion steps:
1. Add in RDKit descriptors
2. Add in Morgan fingerprints
3. Add in MACCS keys

Major cleaning steps:
1. Remove columns whose variance is 0, i.e., columns where all values are the same
    * Done by a function so that the variance threshold used for filtering
    can be adjusted later

## Regression

In [None]:
# Expand the regression dataset with the additional features; the helper also
# returns the expansion errors.
(
    regression_df_expanded,
    regression_expansion_errors,
) = dataset_feature_expansion(regression_df)
# Missing SMILES are the chemicals that have errors when going through the
# calculations
regression_df_expanded

In [None]:
# Variance filter at threshold 0; the SMILES and logBB columns are passed as
# columns to exclude.
regression_df_expanded_cleaned = filter_low_variance(
    regression_df_expanded,
    threshold_level=0,
    exclude_col_list=['SMILES', 'logBB'],
)
regression_df_expanded_cleaned

In [None]:
# Save the expanded regression dataset as a zipped CSV without the index.
# Raw string: the backslashes are literal path separators, not escapes.
regression_df_expanded_cleaned.to_csv(
    r'datasets\expanded_datasets\BBB_regression_expanded.csv.zip',
    index=False,
    compression='zip'  # Have to use zip here since the classification
    # dataset will become very large. Zipped .csv files can be directly read
    # by pd.read_csv()
)
print('Done!')

## Classification

In [None]:
# Expand the classification dataset with the additional features; the helper
# also returns the expansion errors.
(
    classification_df_expanded,
    classification_expansion_errors,
) = dataset_feature_expansion(classification_df)
classification_df_expanded

In [None]:
# Variance filter at threshold 0; the SMILES and BBB+/BBB- columns are passed
# as columns to exclude.
classification_df_expanded_cleaned = filter_low_variance(
    classification_df_expanded,
    threshold_level=0,
    exclude_col_list=['SMILES', 'BBB+/BBB-'],
)
classification_df_expanded_cleaned

In [None]:
# Save the expanded classification dataset as a zipped CSV without the index.
# Raw string: the backslashes are literal path separators, not escapes.
classification_df_expanded_cleaned.to_csv(
    r'datasets\expanded_datasets\BBB_classification_expanded.csv.zip',
    index=False,
    compression='zip'
)
print('Done!')

# Dataset rebalance
Performed before centering and standardization

## Regression
Regression dataset doesn't need rebalancing

## Classification

In [None]:
# Reload the expanded classification dataset from its zipped CSV.
# Raw string: the backslashes are literal path separators, not escapes.
classification_df_expanded_cleaned = pd.read_csv(
    r'datasets\expanded_datasets\BBB_classification_expanded.csv.zip')
classification_df_expanded_cleaned

In [None]:
# Count the rows of each class in the expanded classification dataset.
classification_BBB_N = int(
    (classification_df_expanded_cleaned['BBB+/BBB-'] == 'BBB-').sum()
)  # These are BBB-
classification_BBB_Y = int(
    (classification_df_expanded_cleaned['BBB+/BBB-'] == 'BBB+').sum()
)  # These are BBB+

# The plotted counts come from the classification dataset, so the title names
# it (it previously said "regression").
simple_pie_plot(
    label_list=['BBB-', 'BBB+'],
    num_list=[classification_BBB_N, classification_BBB_Y],
    title_str='Composition of 2 categories in classification dataset before balancing'
)
plt.show()

In [None]:
# Feature matrix: every column except the SMILES and BBB+/BBB- columns.
X = classification_df_expanded_cleaned.drop(
    columns=['SMILES', 'BBB+/BBB-'],
    errors='ignore',
)
# Label vector: the BBB+/BBB- column.
y = classification_df_expanded_cleaned.loc[:, 'BBB+/BBB-']

### Under-sampling by ClusterCentroids

In [None]:
# ClusterCentroids under-sampler with a seeded MiniBatchKMeans as its
# clustering estimator.
cluster_centroids = ClusterCentroids(
    random_state=1,
    estimator=MiniBatchKMeans(random_state=1, n_init=1),
)

X_resample, y_resample = cluster_centroids.fit_resample(X, y)

# NOTE(review): the SMILES column comes from the un-resampled frame, so it may
# not line up row-for-row with the resampled X/y -- confirm how
# merge_multiple_dfs aligns its inputs.
classification_df_after_centroid_balancing = merge_multiple_dfs(
    df_list=[
        classification_df_expanded_cleaned['SMILES'],
        y_resample,
        X_resample,
    ]
)
classification_df_after_centroid_balancing

In [None]:
# Count the rows of each class after ClusterCentroids balancing.
classification_BBB_N = int(
    (classification_df_after_centroid_balancing['BBB+/BBB-'] == 'BBB-').sum()
)  # These are BBB-
classification_BBB_Y = int(
    (classification_df_after_centroid_balancing['BBB+/BBB-'] == 'BBB+').sum()
)  # These are BBB+

# The plotted counts come from the classification dataset, so the title names
# it (it previously said "regression").
simple_pie_plot(
    label_list=['BBB-', 'BBB+'],
    num_list=[classification_BBB_N, classification_BBB_Y],
    title_str='Composition of 2 categories in classification dataset after '
              'balancing by centroids method'
)
plt.show()

In [None]:
# Save the centroid-balanced classification dataset as a zipped CSV without
# the index column.
classification_df_after_centroid_balancing.to_csv(
    r'datasets\balanced_datasets\BBB_classification_balanced_centroid.csv.zip',
    compression='zip',
    index=False,
)
print('Done!')

### Over-sample by SMOTE then cleaning using ENN
SMOTE alone is not used, because it would create many hypothetical chemicals
that might not exist; ENN is applied afterwards to clean the over-sampled data

In [None]:
# Seeded SMOTE-ENN resampler applied to the feature matrix and labels.
smoteenn = SMOTEENN(random_state=1)

X_resample, y_resample = smoteenn.fit_resample(X, y)

# NOTE(review): the SMILES column comes from the un-resampled frame, so it may
# not line up row-for-row with the resampled X/y -- confirm how
# merge_multiple_dfs aligns its inputs.
classification_df_after_smoteenn_balancing = merge_multiple_dfs(
    df_list=[
        classification_df_expanded_cleaned['SMILES'],
        y_resample,
        X_resample,
    ]
)
classification_df_after_smoteenn_balancing

In [None]:
# Count the rows of each class after SMOTE-ENN balancing.
classification_BBB_N = int(
    (classification_df_after_smoteenn_balancing['BBB+/BBB-'] == 'BBB-').sum()
)  # These are BBB-
classification_BBB_Y = int(
    (classification_df_after_smoteenn_balancing['BBB+/BBB-'] == 'BBB+').sum()
)  # These are BBB+

# The plotted counts come from the classification dataset, so the title names
# it (it previously said "regression").
simple_pie_plot(
    label_list=['BBB-', 'BBB+'],
    num_list=[classification_BBB_N, classification_BBB_Y],
    title_str='Composition of 2 categories in classification dataset after '
              'balancing by SMOTE-ENN method'
)
plt.show()

In [None]:
# Save the SMOTE-ENN-balanced classification dataset as a zipped CSV without
# the index column.
classification_df_after_smoteenn_balancing.to_csv(
    r'datasets\balanced_datasets\BBB_classification_balanced_smoteenn.csv.zip',
    compression='zip',
    index=False,
)
print('Done!')