In [1]:
## comment

# explore the data from OCP

In [2]:
## import

import pandas as pd
import numpy as np
import random
import ray
from itertools import combinations
import torch.nn as nn
import torch
import math
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import sys
import os
import logging
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
import gc
from pprint import pprint

2023-10-03 20:48:07,874	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
## variables 

# Prefix prepended to every relative path used in this notebook (log file,
# datasets, output figures). Empty string means "current working directory".
curdir = ''

# Seed every random number generator in use so a fresh run is repeatable.
# `trial_no` offsets the base seed so repeated trials can differ on purpose.
trial_no = 0
seed = 42 + trial_no
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Route INFO-and-above log records to a file next to the notebook.
logging.basicConfig(
    filename=f'{curdir}logger.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [4]:
####################################
## ocp
####################################

In [5]:
# Load the OCP reaction table from parquet; the path is relative to `curdir`.
df_ocp1 = pd.read_parquet(curdir + 'datasets/ocp_reactions_info_df.parquet')

In [6]:
# Preview the first two rows to check which columns were loaded.
df_ocp1.head(2)

Unnamed: 0,bulk_id,ads_id,bulk_mpid,bulk_symbols,ads_symbols,miller_index,shift,top,adsorption_site,class,...,p758,p759,p760,p761,p762,p763,p764,p765,p766,p767
0,2085,29,mp-976273,Hf6Ge4,*COCH2O,"[2, 1, 0]",0.022,True,"[[9.3, 4.58, 26.89]]",1,...,-0.698056,0.273536,0.742442,0.61816,0.486411,-0.93435,0.204278,0.679431,0.341144,-0.852883
1,10724,20,mp-1247259,Ca6Rh2N6,*CHCO,"[1, 0, 0]",0.312,True,"[[10.54, 1.25, 22.82]]",2,...,-0.625508,-0.456795,0.304594,1.134157,-0.394405,-1.171003,-0.648138,-0.02155,-1.443121,0.004959


In [7]:
def convert_to_int(x):
    """Concatenate the elements of ``x`` into one string key.

    Despite the name, the result is a ``str``, not an ``int``: each element is
    converted with ``str`` and the pieces are joined with no separator, so
    ``[2, 1, 0]`` becomes ``'210'``.

    Parameters
    ----------
    x : iterable
        Sequence of values to join (e.g. a Miller index).

    Returns
    -------
    str
        The joined string, or ``x`` unchanged when it cannot be iterated or
        joined (the failure is printed and logged, not raised).

    Notes
    -----
    Because no separator is used, different inputs can map to the same key
    (``[1, -1, 0]`` and ``[1, '-', 10]`` both give ``'1-10'``).
    """
    try:
        return ''.join(map(str, x))
    except (TypeError, ValueError):
        # A non-iterable value (None, NaN, a bare number) raises TypeError,
        # not ValueError; catch both so the fallback below actually runs.
        print(f"Error with value: {x}")
        logging.info(f"Error with value: {x}")
        return x

# The target column is called 'nre' throughout the rest of the notebook.
df_ocp1 = df_ocp1.rename(columns={'energy': 'nre'})

# Collapse each Miller index into a single string so it can serve as a group key.
df_ocp1['miller_index'] = df_ocp1['miller_index'].apply(convert_to_int)

# Feature column names: s0..s8 and p0..p767.
list_scols = [f's{i}' for i in range(9)]
list_pcols = [f'p{i}' for i in range(768)]

# Column order matters downstream: three leading id columns, then the
# features, then the target in the last position.
list_cols = ['anomaly', 'bulk_mpid', 'miller_index', *list_scols, *list_pcols, 'nre']

In [8]:
# Sorted distinct 'pkey' values. This must run before the frame is restricted
# to `list_cols`, which does not include 'pkey'.
list_pkey = sorted(df_ocp1['pkey'].unique())

In [9]:
# list_pkey

In [10]:
# Keep only the id columns, the s*/p* features and the 'nre' target, in the
# order defined by `list_cols`. Every other column (including 'pkey') is
# dropped from `df_ocp1` from this point on.
df_ocp1 = df_ocp1[list_cols]

In [11]:
# Row and column count of the current `df_ocp1`.
print(df_ocp1.shape)

(437781, 781)


In [12]:
# Column labels of the current `df_ocp1`, to check their order.
print(df_ocp1.columns)

Index(['anomaly', 'bulk_mpid', 'miller_index', 's0', 's1', 's2', 's3', 's4',
       's5', 's6',
       ...
       'p759', 'p760', 'p761', 'p762', 'p763', 'p764', 'p765', 'p766', 'p767',
       'nre'],
      dtype='object', length=781)


In [13]:
# 1. Data Preparation:

In [14]:
# Each case admits a growing set of 'anomaly' codes: case_k keeps codes
# 0..k-1, and case_5 keeps every code present in the data.
cases = {f'case_{k}': list(range(k)) for k in range(1, 5)}
cases['case_5'] = df_ocp1['anomaly'].unique()

# Subset per case, plus its row count.
sample_counts = {}
filtered_dfs = {}
for case, values in cases.items():
    anomaly_mask = df_ocp1['anomaly'].isin(values)
    filtered_df = df_ocp1[anomaly_mask]
    filtered_dfs[case] = filtered_df
    sample_counts[case] = filtered_df.shape[0]
print(sample_counts)

{'case_1': 325743, 'case_2': 402184, 'case_3': 418753, 'case_4': 433989, 'case_5': 437781}


In [15]:
# Each case admits a growing set of 'anomaly' codes; case_5 keeps all of them.
cases = {
    'case_1': [0],
    'case_2': [0,1],
    'case_3': [0,1,2],
    'case_4': [0,1,2,3],
    'case_5': df_ocp1['anomaly'].unique()
}

# Half-width of the accepted 'nre' window, in standard deviations.
# NOTE(review): a 1-sigma window discards a large share of rows (in the
# recorded output case_1 goes from 325743 to 226270 rows) -- confirm that
# this is the intended cut.
n_std = 1

# Overwrites any earlier `sample_counts` / `filtered_dfs` with the
# outlier-filtered versions.
sample_counts = {}
filtered_dfs = {}
for case, values in cases.items():
    filtered_df = df_ocp1[df_ocp1['anomaly'].isin(values)]

    # Outliers are judged on the target column 'nre' (not on 'anomaly'),
    # using the mean and standard deviation of this case's subset.
    mean_val = filtered_df['nre'].mean()
    std_val = filtered_df['nre'].std()
    print(mean_val, std_val)

    # Keep rows whose 'nre' lies within mean +/- n_std * std, bounds included.
    lower_bound = mean_val - n_std * std_val
    upper_bound = mean_val + n_std * std_val
    no_outliers_df = filtered_df[filtered_df['nre'].between(lower_bound, upper_bound)]

    filtered_dfs[case] = no_outliers_df
    sample_counts[case] = len(no_outliers_df)

print(sample_counts)

-1.4096926886057743 2.043792159537874
-1.600708582462779 2.270462035025037
-1.587948442131603 2.2734108175746686
-1.5810491737774808 2.2811852803293022
-1.5607516413861389 2.2868426774779977
{'case_1': 226270, 'case_2': 287317, 'case_3': 301206, 'case_4': 311997, 'case_5': 313744}


In [16]:
# 2. Preprocess Data:

In [17]:
# Function to preprocess data
def preprocess_data(train, test, use_pca=False, scale=False):
    """Split train/test frames into feature matrices and 'nre' targets.

    Features are selected by position: every column of ``train`` except the
    first three and the last one. With the column order built earlier in this
    notebook that is s0..p767, leaving out the id columns and the target.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames sharing the same columns, including an 'nre' target column.
    use_pca : bool, default False
        If True, fit ``PCA(0.9)`` on the training features and apply the same
        projection to the test features.
    scale : bool, default False
        If True, standardise the features with a ``StandardScaler`` fitted on
        the training set only, before any PCA. The default keeps the raw
        features, which is what this function did before the option existed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature matrices are
        DataFrames when neither option is enabled and NumPy arrays otherwise.

    Notes
    -----
    NOTE(review): the recorded run shows ``PCA(0.9)`` keeping only 2 of the
    777 features on unscaled input; consider calling with ``scale=True``.
    """
    features = list(train.columns[3:-1])  # s0 to p767
    X_train = train[features]
    X_test = test[features]
    y_train = train['nre']
    y_test = test['nre']

    X_train_scaled = X_train
    X_test_scaled = X_test
    if scale:
        # Fit on train only so no test-set statistics leak into the model.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test_scaled)

    if use_pca:
        # A float n_components keeps enough components to explain that
        # fraction of the training variance.
        pca = PCA(0.9)
        X_train_scaled = pca.fit_transform(X_train_scaled)
        X_test_scaled = pca.transform(X_test_scaled)
    return X_train_scaled, X_test_scaled, y_train, y_test

In [18]:
# 3. Data Splitting based on groups:

In [19]:
# Define a function to split based on groups
def split_data(df, group_columns):
    """Split ``df`` into train/test so that no group appears in both.

    A group is one distinct combination of the values in ``group_columns``.
    A single ``GroupShuffleSplit`` is drawn with 75% of the groups assigned
    to the training side, seeded by the notebook-level ``seed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    group_columns : list of str
        Columns whose combined values identify a group.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` row subsets of ``df``.
    """
    # Build one "<a>_<b>_..." key per row with vectorised string
    # concatenation; the keys match a row-wise '_'.join but avoid a Python
    # call per row.
    group_keys = df[group_columns[0]].astype(str)
    for column in group_columns[1:]:
        group_keys = group_keys + '_' + df[column].astype(str)

    gss = GroupShuffleSplit(n_splits=1, train_size=0.75, random_state=seed)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_idx, test_idx = next(gss.split(df, groups=group_keys))
    return df.iloc[train_idx], df.iloc[test_idx]

# Columns that together identify one group.
group_cols = ['bulk_mpid', 'miller_index']

# Group-aware train/test split for every case, with a short report per case.
train_test_splits = {}
for case, filtered_df in filtered_dfs.items():
    train, test = split_data(filtered_df, group_cols)
    train_test_splits[case] = (train, test)

    # Number of distinct groups in the full case and on each side of the split.
    total_groups, train_groups, test_groups = (
        len(frame[group_cols].drop_duplicates())
        for frame in (filtered_df, train, test)
    )

    print(f"Case: {case}")
    print(f"Shape: {train.shape}, {test.shape}")
    print(f"Total groups: {total_groups}")
    print(f"Training groups: {train_groups}")
    print(f"Testing groups: {test_groups}\n")

Case: case_1
Shape: (169769, 781), (56501, 781)
Total groups: 72315
Training groups: 54236
Testing groups: 18079

Case: case_2
Shape: (215958, 781), (71359, 781)
Total groups: 78336
Training groups: 58752
Testing groups: 19584

Case: case_3
Shape: (227129, 781), (74077, 781)
Total groups: 79694
Training groups: 59770
Testing groups: 19924

Case: case_4
Shape: (234094, 781), (77903, 781)
Total groups: 82254
Training groups: 61690
Testing groups: 20564

Case: case_5
Shape: (236002, 781), (77742, 781)
Total groups: 82383
Training groups: 61787
Testing groups: 20596



In [20]:
# 4. Model Training:

In [21]:
# Accumulates one entry per (case, model name, feature variant); filled by the
# training loop. Re-running this cell clears previously stored results.
results = {}

In [22]:
# Models & hyperparameters
# Estimators to evaluate. Each entry pairs an unfitted estimator with the
# parameter grid handed to GridSearchCV (single-point grids for now).
models = {
    'Ridge Regression': dict(
        model=Ridge(),
        params={'alpha': [1.0]},
    ),
    'Elastic Regression': dict(
        model=ElasticNet(),
        params={'alpha': [1.0], 'l1_ratio': [0.5]},
    ),
    # Currently disabled:
    # 'Kernel Ridge Regression': dict(
    #     model=KernelRidge(),
    #     params={'alpha': [1], 'kernel': ['linear'], 'degree': [3]},
    # ),
    # 'Support Vector Regression': dict(
    #     model=SVR(),
    #     params={'C': [1.0], 'kernel': ['rbf'], 'gamma': ['scale']},
    # ),
}

In [23]:
# Create a function to plot and save the figures
def plot_and_save(predicted, true, case, model_name, transformation, folder='outputs'):
    """Save a predicted-vs-true scatter plot as a PNG file.

    The figure is written to
    ``{curdir}{folder}/{case}D_{model_name}_{transformation}.png`` and is
    closed afterwards, so nothing is displayed inline.

    Parameters
    ----------
    predicted : array-like
        Model predictions (plotted on the y axis).
    true : array-like
        Reference values (plotted on the x axis).
    case, model_name, transformation : str
        Labels used in the plot title and in the output file name.
    folder : str, default 'outputs'
        Directory, relative to ``curdir``, that receives the image.
    """
    # exist_ok avoids the check-then-create race and is a no-op on later calls.
    os.makedirs(curdir + folder, exist_ok=True)

    fig, ax = plt.subplots(figsize=(5, 3))
    try:
        ax.scatter(true, predicted, alpha=0.5)
        ax.set_xlabel('True')
        ax.set_ylabel('Predicted')
        ax.set_title(f"{case}: {model_name} ({transformation})")
        ax.grid(True)

        # NOTE(review): the literal "D" after the case label is kept as-is;
        # confirm it is intentional and not a leftover.
        fig.savefig(f"{curdir}{folder}/{case}D_{model_name}_{transformation}.png")
    finally:
        # Always release the figure, even if plotting or saving fails;
        # this function is called once per case/model/variant.
        plt.close(fig)

In [24]:
# Train and evaluate models
selected_cases = ['case_1', 'case_2', 'case_3', 'case_4', 'case_5']

# Feature variants evaluated for every case, in this order:
# (label stored in `results` and in figure names, `use_pca` flag).
feature_variants = [('non_pca', False), ('pca', True)]

for case, (train, test) in train_test_splits.items():
    if case not in selected_cases:  # Skip the cases not in selected_cases
        continue
    print(f"case: {case}")

    for transformation, use_pca in feature_variants:
        X_train, X_test, y_train, y_test = preprocess_data(train, test, use_pca=use_pca)
        print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        # Mean absolute target on the test set, stored next to the MAE for
        # scale; it does not depend on the model, so compute it once.
        mean_target = y_test.abs().mean()

        for model_name, model_details in models.items():
            print(model_name)
            clf = GridSearchCV(
                model_details['model'],
                model_details['params'],
                cv=5,
                scoring='neg_mean_absolute_error',
                n_jobs=1,
            )
            clf.fit(X_train, y_train)
            predicted = clf.predict(X_test)
            # Same value as -clf.score(X_test, y_test) under this scorer,
            # without predicting the test set a second time.
            mae = mean_absolute_error(y_test, predicted)
            params = clf.best_params_
            # Store the results
            results[(case, model_name, transformation)] = (mae, mean_target, params)

            # Plot
            plot_and_save(predicted, y_test, case, model_name, transformation)
            del predicted

        # Release the feature matrices before building the next variant.
        del X_train, X_test, y_train, y_test
        gc.collect()

case: case_1
(169769, 777) (56501, 777) (169769,) (56501,)
Ridge Regression
Elastic Regression
(169769, 2) (56501, 2) (169769,) (56501,)
Ridge Regression
Elastic Regression
case: case_2
(215958, 777) (71359, 777) (215958,) (71359,)
Ridge Regression
Elastic Regression
(215958, 2) (71359, 2) (215958,) (71359,)
Ridge Regression
Elastic Regression
case: case_3
(227129, 777) (74077, 777) (227129,) (74077,)
Ridge Regression
Elastic Regression
(227129, 2) (74077, 2) (227129,) (74077,)
Ridge Regression
Elastic Regression
case: case_4
(234094, 777) (77903, 777) (234094,) (77903,)
Ridge Regression
Elastic Regression
(234094, 2) (77903, 2) (234094,) (77903,)
Ridge Regression
Elastic Regression
case: case_5
(236002, 777) (77742, 777) (236002,) (77742,)
Ridge Regression
Elastic Regression
(236002, 2) (77742, 2) (236002,) (77742,)
Ridge Regression
Elastic Regression


In [25]:
# Each value is (test MAE, mean absolute test target, best parameters), keyed
# by (case, model name, feature variant).
pprint(results)

{('case_1', 'Elastic Regression', 'non_pca'): (0.9072895029024735,
                                               1.6065797755235314,
                                               {'alpha': 1.0, 'l1_ratio': 0.5}),
 ('case_1', 'Elastic Regression', 'pca'): (0.9072186601769682,
                                           1.6065797755235314,
                                           {'alpha': 1.0, 'l1_ratio': 0.5}),
 ('case_1', 'Ridge Regression', 'non_pca'): (2.5736615343264986,
                                             1.6065797755235314,
                                             {'alpha': 1.0}),
 ('case_1', 'Ridge Regression', 'pca'): (0.9072186601769682,
                                         1.6065797755235314,
                                         {'alpha': 1.0}),
 ('case_2', 'Elastic Regression', 'non_pca'): (1.0023311206008942,
                                               1.781291733359914,
                                               {'alpha': 1.0, 'l1_ratio': 0.5