# Imports

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib.colors as mcolors
from collections import OrderedDict
import tqdm
from tqdm import tqdm
from time import time
import time
import random
from pandas.plotting import table
from itertools import combinations
from itertools import compress
from itertools import chain
from itertools import product

from matplotlib.patches import Polygon

In [3]:
# Input dataset define pathway variables used later

# Define current directory as a variable
BASE_PATH = os.getcwd()

# Define a pathway to the "Data" folder
BASE_PATH2 = os.path.join(BASE_PATH, "Data")

# Define target variable names, units, and graph names
# property, property title, label, cmap color, unit
target_variable = [np.array(['property', 'Property', 'Property (unit)', 'viridis', 'unit']),
                   np.array(['costs', 'Cost', 'Cost per kg (USD)', 'viridis_r', 'USD']),
                   np.array(['efficiency', 'Cost Efficiency', 'Cost Efficiency (unit/USD)', 'viridis', 'unit/USD']),
                  np.array(['cost performance', 'Cost Performance', 'Cost Performance (Score)', 'viridis', 'Score'])]

# Define paths to the initial dataset, elemental data, and more
PATH2 = os.path.join(BASE_PATH2, "compositions.xlsx")
PATH4 = os.path.join(BASE_PATH, "element_data.xlsx")

In [4]:
# Define composition generation function

# Defines seed for reproducibility
np.random.seed(42)

# Defines function to generate compositions
def comp_gen(num, resolution, elements, min_comp, max_comp):
    # Starts a timer to track computation time
    tic = time.time()
    
    # Sort the input elements for consistent order
    elements = sorted(elements)

    # Total resolution
    n_comps = resolution
    
    # Number of elements in each composition system
    sys_d = num

    # Generate all combinations of the given elements taken 'num' at a time
    sys_list = []
    for sys in combinations(elements, sys_d):
        sys_list.append(list(sys))

    # Generate all compositions that sum to the total resolution (n_comps)
    comps = []
    indices = np.ndindex(*[n_comps for _ in range(sys_d)])
    j = 0
    for index in indices:
        j += 1
        comp = list(index)
        if sum(comp) == n_comps:
            comps.append(comp)
            toc = time.time()
            #print(f"{round(j/(n_comps**sys_d)*100,2)}% Done: Total Compositions = {len(comps)} in {round(toc-tic,3)} secs")

    # Normalize each composition to make it a fraction (sum to 1)
    comps = np.array(comps) / n_comps

    # Create a DataFrame for each element system with generated compositions
    results_df_list = []
    for si in range(len(sys_list)):
        s_els = sys_list[si]
        new_df = pd.DataFrame(comps, columns=s_els)
        results_df_list.append(new_df)

    # Combines all system DataFrames into one, filling in missing values with 0
    results_df = pd.concat(results_df_list, axis=0, ignore_index=True)
    results_df = results_df.fillna(0)
    results_df = results_df.drop_duplicates()
    results_df = results_df.reset_index(drop=True)

    # Identifies unique active element sets in each composition
    Els = []
    for row in range(results_df.shape[0]):
        comp = results_df.iloc[row][elements]
        active_el = list(compress(elements, list(comp > 0)))
        if active_el not in Els:
            Els.append(active_el)

    # Filters compositions to only include those where active elements sum to ~1
    results_df2_list = []
    for El_i in Els:
        cond = (np.all(results_df[El_i] > 0, axis=1)) & (np.sum(results_df[El_i], 1) > 1 - 1e-9) & (np.sum(results_df[El_i], 1) < 1 + 1e-9)
        filtered_df = results_df[cond]
        results_df2_list.append(filtered_df)
    
    # Combines all filtered compositions into a final DataFrame
    results_df2 = pd.concat(results_df2_list, axis=0, ignore_index=True)
    results_df2 = results_df2.reset_index(drop=True)
    
    return results_df2

In [6]:
# Fine tune and generate composition sections

# Defines resolution (Composition increments = 1/resolution, Ex: 1/200 = 0.005
# 200 compositions
resolution = 200

# Defines min and max weight percent for Ti
min_comp, max_comp = 0.48, 0.52

# Will generate 200 different compositions with elements "Ti" and "Other"
elements = ["Ti", "Other"]
num = len(elements)

# Generates compositions
df_compositions_Ti_ratio = comp_gen(num, resolution, elements, min_comp, max_comp)

# Divides weight percentages by 10 (200 was used to save computational time)
# Would have had to use 2000 resolution for increments of 0.0005
# Results in increments of 0.0005
df_compositions_Ti_ratio = df_compositions_Ti_ratio / 1

# Removes compositions outside the min and max weight percent
df_compositions_Ti_ratio = df_compositions_Ti_ratio[df_compositions_Ti_ratio['Ti'] >= min_comp]
df_compositions_Ti_ratio = df_compositions_Ti_ratio[df_compositions_Ti_ratio['Ti'] <= max_comp]
df_compositions_Ti = df_compositions_Ti_ratio['Ti']

# Saves compositions to a variable
columns_Ti = df_compositions_Ti_ratio['Ti']
df_compositions_Ti.to_excel('Ti.xlsx')
print('There are this many alloys in the space:', len(df_compositions_Ti))
pd.set_option('display.float_format', '{:.8f}'.format)
print(df_compositions_Ti.describe())
print(df_compositions_Ti)
if isinstance(df_compositions_Ti, pd.Series):
    df_compositions_Ti = df_compositions_Ti.to_frame()




# Same steps as Ti but for summation of Stabilizers
# Results in increments of 0.005 weight fraction
resolution = 200
min_comp, max_comp = 0.005, 0.05
elements = ["Stabilizers", "Other"]
num = len(elements)
df_compositions_stabilizers_ratio = comp_gen(num, resolution, elements, min_comp, max_comp)
df_compositions_stabilizers_ratio = df_compositions_stabilizers_ratio[df_compositions_stabilizers_ratio['Stabilizers'] >= min_comp]
df_compositions_stabilizers_ratio = df_compositions_stabilizers_ratio[df_compositions_stabilizers_ratio['Stabilizers'] <= max_comp]

# Results in 10 compositions
df_compositions_stabilizers_ratio.to_excel('Stabilizers.xlsx')
print('There are this many alloys in the space:', len(df_compositions_stabilizers_ratio))
pd.set_option('display.float_format', '{:.8f}'.format)
print(df_compositions_stabilizers_ratio.describe())
print(df_compositions_stabilizers_ratio)

# Resolution of 4 results in increments of 0.25
resolution = 4
min_comp, max_comp = 0.005, 0.05
elements = ['Pd', 'Pt', 'Al']
num = len(elements)
df_compositions_stabilizers = comp_gen(num, resolution, elements, min_comp, max_comp)

# Adds compositions with one element containing 1 weight fraction (100%) and the rest 0, repeated for each element
# Identity matrix
identity_matrix = np.eye(len(df_compositions_stabilizers.columns))
identity_matrix = pd.DataFrame(identity_matrix, columns=df_compositions_stabilizers.columns)
df_compositions_stabilizers = pd.concat([df_compositions_stabilizers, identity_matrix], ignore_index=True)

# Created compositions are multiplied for each composition in the "Stabilizers" and "Other" system
# Ex: 0.25Pd 0.75Al for 0.05Stabilizers => 0.0125 Pd 0.0375 Al
stabilizers_expanded = np.repeat(df_compositions_stabilizers_ratio['Stabilizers'].values, df_compositions_stabilizers.shape[0], axis=0)
results_tiled = np.tile(df_compositions_stabilizers.values, (df_compositions_stabilizers_ratio.shape[0], 1))
df_result = stabilizers_expanded[:, np.newaxis] * results_tiled

# Results in 150 compositions
df_result = pd.DataFrame(df_result, columns=df_compositions_stabilizers.columns)
df_compositions_stabilizers = df_result
columns_ree = df_compositions_stabilizers.columns
df_compositions_stabilizers.to_excel('Stabilizers2.xlsx')
print('There are this many alloys in the space:', len(df_compositions_stabilizers))
pd.set_option('display.float_format', '{:.8f}'.format)
print(df_compositions_stabilizers.describe())
print(df_compositions_stabilizers)





# Resolution of 50 results in increments of 0.02
resolution = 50
min_comp, max_comp = 0.01, 0.1

# System of the summation of dopants and "Other"
elements = ["Dopants", "Other"]
num = len(elements)
df_compositions_dopants_ratio = comp_gen(num, resolution, elements, min_comp, max_comp)
df_compositions_dopants_ratio = df_compositions_dopants_ratio[df_compositions_dopants_ratio['Dopants'] >= min_comp]
df_compositions_dopants_ratio = df_compositions_dopants_ratio[df_compositions_dopants_ratio['Dopants'] <= max_comp]

# Results in 5 compositions
df_compositions_dopants_ratio.to_excel('Dopants.xlsx')
print('There are this many alloys in the space:', len(df_compositions_dopants_ratio))
pd.set_option('display.float_format', '{:.8f}'.format)
print(df_compositions_dopants_ratio.describe())
print(df_compositions_dopants_ratio)

# Resolution of 4 results in increments of 25%
resolution = 4
min_comp, max_comp = 0.01, 0.1
elements = ['Cu', 'Hf', 'Zr', 'Fe', 'Co', 'Mn', 'Nb', 'V', 'Ta', 'Sn']

# Creates a maximum of 4 doping elements
num = 3
df_compositions_dopants = comp_gen(num, resolution, elements, min_comp, max_comp)

# Creates identity matrix like in the REE generation
identity_matrix = np.eye(len(df_compositions_dopants.columns))
identity_matrix = pd.DataFrame(identity_matrix, columns=df_compositions_dopants.columns)
df_compositions_dopants = pd.concat([df_compositions_dopants, identity_matrix], ignore_index=True)

# Created compositions are multiplied for each composition in the "Dopant" and "Other" system
dopants_expanded = np.repeat(df_compositions_dopants_ratio['Dopants'].values, df_compositions_dopants.shape[0], axis=0)
results_tiled = np.tile(df_compositions_dopants.values, (df_compositions_dopants_ratio.shape[0], 1))
df_result = dopants_expanded[:, np.newaxis] * results_tiled
df_result = pd.DataFrame(df_result, columns=df_compositions_dopants.columns)
df_compositions_dopants = df_result

# Compositions with dopants summing up to more than 0.1 weight percent are removed
df_compositions_dopants = df_compositions_dopants[df_compositions_dopants[elements].sum(axis=1) <= 0.1]

# Compositions with dopants summing up to less than 0.01 weight percent are removed
df_compositions_dopants = df_compositions_dopants[df_compositions_dopants[elements].sum(axis=1) >= 0.01]

# Results in 2525 compositions
columns_dopants = df_compositions_dopants.columns
df_compositions_dopants.to_excel('Dopants2.xlsx')
print('There are this many alloys in the space:', len(df_compositions_dopants))
pd.set_option('display.float_format', '{:.8f}'.format)
print(df_compositions_dopants.describe())
print(df_compositions_dopants)





# Displays number of compositions and elements
print(df_compositions_Ti.shape)
print(df_compositions_stabilizers.shape)
print(df_compositions_dopants.shape)

There are this many alloys in the space: 9
count   9.00000000
mean    0.50000000
std     0.01369306
min     0.48000000
25%     0.49000000
50%     0.50000000
75%     0.51000000
max     0.52000000
Name: Ti, dtype: float64
95    0.52000000
96    0.51500000
97    0.51000000
98    0.50500000
99    0.50000000
100   0.49500000
101   0.49000000
102   0.48500000
103   0.48000000
Name: Ti, dtype: float64
There are this many alloys in the space: 10
            Other  Stabilizers
count 10.00000000  10.00000000
mean   0.97250000   0.02750000
std    0.01513825   0.01513825
min    0.95000000   0.00500000
25%    0.96125000   0.01625000
50%    0.97250000   0.02750000
75%    0.98375000   0.03875000
max    0.99500000   0.05000000
         Other  Stabilizers
189 0.95000000   0.05000000
190 0.95500000   0.04500000
191 0.96000000   0.04000000
192 0.96500000   0.03500000
193 0.97000000   0.03000000
194 0.97500000   0.02500000
195 0.98000000   0.02000000
196 0.98500000   0.01500000
197 0.99000000   0.01000000

In [None]:
# Combine composition sections

# Creates a list of all elements
dfs = [df_compositions_Ti, df_compositions_stabilizers, df_compositions_dopants]
all_columns = ["Ni"]
for i in dfs:
    all_columns += [i.columns]
elements_backup = all_columns

# Saves all Ti compositions
result_rows = []
for _, b_row in tqdm(df_compositions_Ti.iterrows(), total=len(df_compositions_Ti), desc="Ti only"):
    result_rows.append(b_row)
result_df1 = pd.DataFrame(result_rows)
result_df1.reset_index(drop=True, inplace=True)


# Generates all possible combinations of Stabilizers and Dopant compositions
result_rows1 = []
for _, b_row in tqdm(df_compositions_stabilizers.iterrows(), total=len(df_compositions_stabilizers), desc="Stabilizers x Dopants"):
    for _, a_row in df_compositions_dopants.iterrows():
        result_rows1.append(pd.concat([b_row, a_row], axis=0))
result_df2 = pd.DataFrame(result_rows1)
result_df2.reset_index(drop=True, inplace=True)

# Generates all possible combinations of all compositions
result_rows2 = []
for _, b_row in tqdm(result_df2.iterrows(), total=len(result_df2), desc="Combined"):
    for _, a_row in result_df1.iterrows():
        result_rows2.append(pd.concat([b_row, a_row], axis=0))
result_df3 = pd.DataFrame(result_rows2)
result_df3.reset_index(drop=True, inplace=True)
print(result_df3)

Ti only: 100%|██████████| 9/9 [00:00<00:00, 11452.89it/s]
Stabilizers x Dopants: 100%|██████████| 150/150 [00:12<00:00, 12.06it/s]
Combined:  62%|██████▏   | 234147/378750 [01:13<00:38, 3780.21it/s]

In [15]:
# Calculate amount of nickel (balance) for each composition and add

# Creates a dataframe of all combinations
df_compositions3 = pd.DataFrame(result_df3.copy())
df_compositions3.reset_index(drop=True, inplace=True)
df_compositions3.drop_duplicates(inplace=True)

# Adds iron to the dataframe as the balance element
df_compositions3.insert(0, 'Ni', 1 - df_compositions3.sum(axis=1))

print(df_compositions3)

                Ni         Al         Pd         Pt         Co         Cu  \
0       0.41000000 0.00000000 0.01250000 0.03750000 0.00000000 0.00500000   
1       0.41500000 0.00000000 0.01250000 0.03750000 0.00000000 0.00500000   
2       0.42000000 0.00000000 0.01250000 0.03750000 0.00000000 0.00500000   
3       0.42500000 0.00000000 0.01250000 0.03750000 0.00000000 0.00500000   
4       0.43000000 0.00000000 0.01250000 0.03750000 0.00000000 0.00500000   
...            ...        ...        ...        ...        ...        ...   
3408745 0.39500000 0.00000000 0.00000000 0.00500000 0.00000000 0.00000000   
3408746 0.40000000 0.00000000 0.00000000 0.00500000 0.00000000 0.00000000   
3408747 0.40500000 0.00000000 0.00000000 0.00500000 0.00000000 0.00000000   
3408748 0.41000000 0.00000000 0.00000000 0.00500000 0.00000000 0.00000000   
3408749 0.41500000 0.00000000 0.00000000 0.00500000 0.00000000 0.00000000   

                Fe         Hf         Mn         Nb         Sn         Ta  

In [18]:
# Convert back to atomic/weight percent

# Defines a function that takes in a dataframe and converts back to atomic percent
def convert_to_atomic_percent(df, molar_masses):
    atomic_percent_df = pd.DataFrame()
    
    for column in df.columns:
        moles = df[column] / molar_masses[column]
        atomic_percent_df[column] = moles
    total_moles = atomic_percent_df.sum(axis=1)
    
    for column in df.columns:
        atomic_percent_df[column] = (atomic_percent_df[column] / total_moles)
        
    return atomic_percent_df

# This function takes in a DataFrame where each column represents an element, 
# and each row corresponds to the amounts of those elements in different compositions
# This function converts these values to weight percentages for each row
def convert_to_weight_percent(df, atomic_masses):
    weight_percent_df = pd.DataFrame()
    for column in df.columns:
        weight = df[column] * atomic_masses[column]
        weight_percent_df[column] = weight   
    total_weight = weight_percent_df.sum(axis=1)
    for column in df.columns:
        weight_percent_df[column] = (weight_percent_df[column] / total_weight)   
    return weight_percent_df

# Saves weight data for each element
df_data = pd.read_excel(PATH4, engine="openpyxl")

# Assigns atomic masses for each element to a variable
atomic_weights = pd.Series(df_data['Atomic Mass'].values, index=df_data['Element Symbol']).to_dict()

# Inputs compositions, converts the values to atomic percent, and assigns the dataset to a variable
df = pd.DataFrame(df_compositions3)
#df = convert_to_weight_percent(df, atomic_weights)
df = convert_to_atomic_percent(df, atomic_weights)
df_dict = df.to_dict(orient='records')



# Import Compositions (If already generated)

In [None]:
"""
# If compositions have already been generated but the model has changed, import them here

# Define the folder containing the dataset
comp_folder = os.path.join(BASE_PATH2, "Dataset")

# Load and combine all CSV files related to compositions
file_list = sorted(glob.glob(os.path.join(comp_folder, "compositions_*.csv")))
df = pd.concat([pd.read_csv(file) for file in file_list], ignore_index=True)
print(f"Combined DataFrame has {len(df)} rows")


df_compositions3 = df.copy()
df_dict = df.to_dict(orient='records')

# Load and combine all CSV files related to costs
file_list = sorted(glob.glob(os.path.join(comp_folder, "costs_*.csv")))
costs = pd.concat([pd.read_csv(file) for file in file_list], ignore_index=True)

# Convert the combined costs DataFrame into a NumPy array and flatten it into a 1D array
costs = costs.to_numpy()
costs = costs.ravel()
print(f"Combined DataFrame has {len(costs)} rows")
"""

'\n# If compositions have already been generated but the model has changed, import them here\n\n# Define the folder containing the dataset\ncomp_folder = os.path.join(BASE_PATH2, "Dataset")\n\n# Load and combine all CSV files related to compositions\nfile_list = sorted(glob.glob(os.path.join(comp_folder, "compositions_*.csv")))\ndf_saved = pd.concat([pd.read_csv(file) for file in file_list], ignore_index=True)\nprint(f"Combined DataFrame has {len(df_saved)} rows")\n\ndf = df_saved.copy()\ndf_compositions3 = df_saved.copy()\ndf_compositions2 = df_saved.to_dict(orient=\'records\')\n\n# Load and combine all CSV files related to costs\nfile_list = sorted(glob.glob(os.path.join(comp_folder, "costs_*.csv")))\ncosts = pd.concat([pd.read_csv(file) for file in file_list], ignore_index=True)\n\n# Convert the combined costs DataFrame into a NumPy array and flatten it into a 1D array\ncosts = costs.to_numpy()\ncosts = costs.ravel()\nprint(f"Combined DataFrame has {len(costs)} rows")\n'

# Property Calculations

In [19]:
# Turn compositions into strings for cost calculations

# Assigns the dataframe with compositions to a variable
compositions = df_dict

# Converts each row (composition) into a formula string without spaces
# Each composition is a dictionary where the key is the element and the value is its fraction
composition_strings = [
    "".join(f"{elem}{frac:.6f}" for elem, frac in comp.items())
    for comp in compositions
]

# Creates a new DataFrame with the composition strings and a placeholder 'target' column
df = pd.DataFrame({"formula": composition_strings})
df.insert(0, "target", [None] * len(df))

In [20]:
# Define function to calculate alloy cost

# Assigns price/kg for each element to a variable
element_costs = pd.Series(df_data['Price'].values, index=df_data['Element Symbol']).to_dict()

# Defines function to convert atomic fractions to weight percent and calculate alloy cost
def calculate_alloy_cost(composition):
    
    # Splits the composition string into individual element-fraction pairs
    elements = composition.split()
    atomic_fractions = {}

    # Extracts element symbols and their corresponding atomic fractions
    for el in elements:
        element = ''.join(filter(str.isalpha, el))
        atomic_fraction = float(el[len(element):]) if len(el) > len(element) else 1.0
        if element in atomic_weights:
            atomic_fractions[element] = atomic_fraction
            
    # Converts atomic fractions to weighted values using atomic weights
    weighted_values = {el: atomic_fractions[el] * atomic_weights[el] for el in atomic_fractions}

    # Sums total weight for normalization
    total_weight = sum(weighted_values.values())

    # Calculates weight fraction of each element
    weight_fractions = {el: weighted_values[el] / total_weight for el in atomic_fractions}

    # Computes total cost per kg by multiplying weight fractions with individual element costs
    cost_per_kg = sum(weight_fractions[el] * element_costs.get(el, 0) for el in weight_fractions)
    
    return cost_per_kg

# Converts compositions to strings, separating each element-fraction pair with a space
composition_strings = [
    " ".join(f"{elem}{frac:.6f}" for elem, frac in comp.items())
    for comp in compositions]

# Converts the list of composition strings to a pandas Series
composition_series = pd.Series(composition_strings)

# Stores the composition costs
costs = composition_series.apply(calculate_alloy_cost)

In [None]:
# Calculate target for the generated compositions using the trained model

# Predicted target values (random numbers for now), and compositions are stored
target_values = [random.randint(0, 100) for _ in range(len(df))]
compositions = df.iloc[:, 1]

In [22]:
# Save compositions into seperate csv files

# Defines the number of rows per file
chunk_size = 1_000_000

# Ensures compositions has the same column structure as df_compositions3
compositions = pd.DataFrame(df_dict, columns=df_compositions3.columns.tolist())

# Calculate how many chunks are needed based on the total number of rows
num_chunks = len(compositions) // chunk_size + int(len(compositions) % chunk_size != 0)

# Loops through each chunk and saves it as a separate CSV file
for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    df_chunk = compositions.iloc[start:end]
    filename = f"Dataset/compositions_{i+1:03}.csv"
    df_chunk.to_csv(os.path.join(BASE_PATH2, filename), index=False)
    print(f"Saved: {filename}")

Saved: Dataset/compositions_001.csv
Saved: Dataset/compositions_002.csv
Saved: Dataset/compositions_003.csv
Saved: Dataset/compositions_004.csv


In [None]:
# Save target values into seperate csv files

# Defines the number of rows per file
chunk_size = 1_000_000

# Calculate how many chunks are needed based on the total number of rows
num_chunks = len(target_values) // chunk_size + int(len(target_values) % chunk_size != 0)

# Loops through each chunk and saves it as a separate CSV file
for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    df_chunk = pd.DataFrame(target_values[start:end])
    filename = f"Dataset/target_values_{i+1:03}.csv"
    df_chunk.to_csv(os.path.join(BASE_PATH2, filename), index=False)
    print(f"Saved: {filename}")

Saved: Dataset/target_values_001.csv
Saved: Dataset/target_values_002.csv
Saved: Dataset/target_values_003.csv
Saved: Dataset/target_values_004.csv


In [24]:
# Save calculated costs into seperate csv files

# Defines the number of rows per file
chunk_size = 1_000_000

# Calculate how many chunks are needed based on the total number of rows
num_chunks = len(costs) // chunk_size + int(len(costs) % chunk_size != 0)

# Loops through each chunk and saves it as a separate CSV file
for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    df_chunk = pd.DataFrame(costs[start:end])
    filename = f"Dataset/costs_{i+1:03}.csv"
    df_chunk.to_csv(os.path.join(BASE_PATH2, filename), index=False)
    print(f"Saved: {filename}")

Saved: Dataset/costs_001.csv
Saved: Dataset/costs_002.csv
Saved: Dataset/costs_003.csv
Saved: Dataset/costs_004.csv
