In [15]:
%matplotlib inline

import math
import matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.tri as tri
import matplotlib.ticker as ticker

import copy
import datetime
import glob
import GPyOpt
import GPy
import numpy as np
import os
import pandas as pd
import pickle
import pyDOE
import re
import seaborn as sns
import random
import ternary
import time

from collections import Counter
from scipy.stats import norm
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

In [16]:
# Pull both display helpers from the public `IPython.display` module instead of
# mixing it with the internal `IPython.core.display` path.
from IPython.display import HTML, clear_output

# Inject CSS that widens elements with the `.container` class to 98% of the page.
# The HTML object must stay the last expression of the cell so that it is rendered.
HTML("<style>.container { width:98% !important; }</style>")

## Load Dataset

In [17]:
# Location of the raw CSV files and the name under which the merged set is saved.
data_dir = "../datasets/files"
dataset_name = 'Polymer_v4' # ['Crossed barrel', 'Perovskite', 'AgNP', 'P3HT', 'AutoAM', 'Polymer_v4']

# File-name components: one CSV is expected per (material, print_type) pair.
version = "v2,27"
materials = ["LiTFSI", "NaTFSI"]
print_types = ["PCB", "coin"]

# Collects one DataFrame per successfully read file.
df_list = []

# Walk the combinations in the same order as a nested material -> print_type loop.
for material, print_type in [(m, p) for m in materials for p in print_types]:
    filename_pattern = f"PE_MLtraining_{version}_{material}_{print_type}.csv"
    candidate_paths = glob.glob(os.path.join(data_dir, filename_pattern))

    # Nothing on disk for this combination: report it and move on.
    if len(candidate_paths) == 0:
        print(f"No files found for pattern: {filename_pattern}")
        continue

    for csv_path in candidate_paths:
        try:
            frame = pd.read_csv(csv_path)

            # Tag every row with the combination the file belongs to.
            frame['material'] = material
            frame['print_type'] = print_type

            df_list.append(frame)
            print(f"Successfully added data from: {csv_path}")
        except Exception as e:
            # Best-effort loading: a file that cannot be read is reported and skipped.
            print(f"Error reading {csv_path}: {e}")

Successfully added data from: ../datasets/files/PE_MLtraining_v2,27_LiTFSI_PCB.csv
Successfully added data from: ../datasets/files/PE_MLtraining_v2,27_LiTFSI_coin.csv
Successfully added data from: ../datasets/files/PE_MLtraining_v2,27_NaTFSI_PCB.csv
No files found for pattern: PE_MLtraining_v2,27_NaTFSI_coin.csv


In [18]:
# Name of the response column; reused below so the literal is defined only once.
target = "ionic_conductivity_final"

# Fail fast when the loader produced nothing. Falling through would reach the
# column selection with `df` either undefined (fresh kernel) or left over from
# an earlier run, producing a confusing NameError or silently stale data.
if not df_list:
    raise ValueError("No data was collected. Please check the file patterns and directory path.")

# Stack the per-file frames into one table with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

print("All datasets have been successfully integrated into a single DataFrame.")
print(f"Combined DataFrame shape: {df.shape}")

# Define the columns to keep
columns_to_keep = [
    "material",
    "print_type",
    "fcomp_additive_wt_pct",
    "fcomp_salt_wt_pct",
    target,
]

# Report expected columns that are absent instead of dropping them silently.
missing_columns = [col for col in columns_to_keep if col not in df.columns]
if missing_columns:
    print(f"Warning: expected columns not found and skipped: {missing_columns}")

# Keep only the requested columns that are actually present, in the requested order.
existing_columns_to_keep = [col for col in columns_to_keep if col in df.columns]

# `.copy()` detaches the selection from the concatenated frame.
df = df[existing_columns_to_keep].copy()

print("Dummy columns have been removed.")
print(f"Cleaned DataFrame shape: {df.shape}")

# Last expression of the cell: rendered as the cell output.
df

All datasets have been successfully integrated into a single DataFrame.
Combined DataFrame shape: (1659, 7)
Dummy columns have been removed.
Cleaned DataFrame shape: (1659, 5)


Unnamed: 0,material,print_type,fcomp_additive_wt_pct,fcomp_salt_wt_pct,ionic_conductivity_final
0,LiTFSI,PCB,5.0,0.0,2.000000e-07
1,LiTFSI,PCB,5.0,0.0,2.735043e-07
2,LiTFSI,PCB,5.0,0.0,1.099656e-07
3,LiTFSI,PCB,5.0,0.0,5.693950e-08
4,LiTFSI,PCB,5.0,0.0,7.424594e-08
...,...,...,...,...,...
1654,NaTFSI,PCB,8.6,20.0,2.962963e-06
1655,NaTFSI,PCB,8.6,20.0,1.711230e-06
1656,NaTFSI,PCB,8.6,20.0,2.424242e-06
1657,NaTFSI,PCB,8.6,20.0,3.382664e-06


In [40]:
# Write the cleaned frame to "<dataset_name>_dataset.csv" inside data_dir,
# leaving the row index out of the file.
output_filename = "{}_dataset.csv".format(dataset_name)
output_csv_path = os.path.join(data_dir, output_filename)
df.to_csv(output_csv_path, index=False)