#### Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

#### Path

In [None]:
# Location of the input CSV, relative to the working directory.
data_path = "tested_molecules.csv"

#### Load data

In [None]:
# Read the table from the CSV referenced by `data_path` (set in the "Path" cell).
df = pd.read_csv(data_path)

# Columns that are pulled out separately below; every other column is
# treated as a feature.
non_feature_columns = ['SMILES', 'PKM2_inhibition', 'ERK2_inhibition']

# Check if there is any missing data (the null mask is computed once and reused)
null_mask = df.isnull()
missing_count = null_mask.sum()
print("Number of missing values in each column:\n", missing_count)
any_missing_values = null_mask.values.any()
print("Are there any missing values in the DataFrame?", any_missing_values)

# Check for infinite values
inf_values = df.isin([np.inf, -np.inf]).sum()
print("Number of infinite values in each column:")
print(inf_values)

# Count the number of duplicated rows
duplicate_count = df.duplicated().sum()
print("Number of duplicated rows:", duplicate_count)

# Check which feature columns only contain one value. The check is limited to
# feature columns so that the SMILES/label columns can never be dropped here,
# which would make the lookups below fail with a KeyError.
feature_columns = [col for col in df.columns if col not in non_feature_columns]
unique_counts = df[feature_columns].nunique()
single_value_columns = unique_counts.index[unique_counts == 1]
print("Column containing only one value:", single_value_columns)
# Remove the columns with only one unique value from the DataFrame
df = df.drop(columns=single_value_columns)

# Create dataframes for SMILES, PKM2 and ERK2
smiles = df['SMILES']
PKM2_inhibition = df['PKM2_inhibition']
ERK2_inhibition = df['ERK2_inhibition']
# Select features by name rather than by position, so the result does not
# depend on the three non-feature columns being the first three in the file.
features = df.drop(columns=non_feature_columns)

### Distribution of data

#### Correlation data

In [None]:
# Compute the correlation matrix
correlation_matrix = features.corr()

# Threshold correlation
correlation_value = 0.95
# Create a mask for correlations greater than the threhold
highly_correlated_mask = np.abs(correlation_matrix) > correlation_value
# Set the diagonal values to FALSE
np.fill_diagonal(highly_correlated_mask.values, False)
# Extract the upper triangle of the correlation matrix
upper_triangle_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
# Apply the upper triangle mask to the highly correlated mask
highly_correlated_upper = highly_correlated_mask & upper_triangle_mask
# Count the number of highly correlated pairs
highly_correlated_count = highly_correlated_upper.sum()

print(f"Total correlated variable pairs (correlation > {correlation_value}):", highly_correlated_count.sum())

# Generate a heatmap
plt.figure(figsize=(40,40))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

#### Visualization of feature distributions

In [None]:
# Reshape the feature table to long format: one row per (Feature, Value)
# pair, so that each feature can be drawn in its own facet.
df_melted = pd.melt(features, var_name='Feature', value_name='Value')

# Alternative view (disabled): boxplot of all features
# plt.figure(figsize=(10, 6))
# sns.boxplot(x='Feature', y='Value', data=df_melted)
# plt.title('Boxplot of Features')
# plt.xlabel('Feature')
# plt.ylabel('Value')
# plt.show()

# Alternative view (disabled): one histogram per feature
# g = sns.FacetGrid(df_melted, col='Feature', col_wrap=5, sharex=False, sharey=False)
# g.map(plt.hist, 'Value', bins=20)
# g.fig.subplots_adjust(top=0.9)
# g.fig.suptitle('Histograms of Features')
# plt.show()

# Active view: one density plot per feature, five panels per row, with
# independent axes for every panel.
g = sns.FacetGrid(
    data=df_melted,
    col='Feature',
    col_wrap=5,
    sharex=False,
    sharey=False,
)
g.map(sns.kdeplot, 'Value')
# Leave room at the top of the figure for the overall title
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Density Plots of Features')
plt.show()

### Principal Component Analysis