In [1]:
!python -m ipykernel install --user --name=venv --display-name "Python (Generic)"


import pandas as pd
import sys
from pathlib import Path
# Add parent directory of the notebook to sys.path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))
from MatNexus import PaperCollector, TextProcessor, VecGenerator, VecVisualizer
import plotly.io as py
from sklearn.metrics.pairwise import cosine_similarity
import os
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Doc2Vec, Word2Vec
from gensim.models.doc2vec import TaggedDocument

from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
from pathlib import Path
import re

nltk.download('punkt', quiet=True)

Installed kernelspec venv in C:\Users\doaam\AppData\Roaming\jupyter\kernels\venv


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\doaam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:

# Word2Vec hyperparameters. They are passed explicitly to `fit` below, so
# editing a value here actually changes the trained model.
model_path = "HEA_electrocatalyst_iterative.model"
sg = 1             # Skip-gram (1) vs CBOW (0)
vector_size = 200  # Dimension of word vectors
hs = 1             # Use hierarchical softmax
window = 5         # Context window size
min_count = 1      # Minimum word frequency
workers = 4        # Number of worker threads

# Load the final selected documents (read once).
input_path = "final_selected_docs.csv"
processed_df = pd.read_csv(input_path)

# Generate the corpus from the processed DataFrame
corpus = VecGenerator.Corpus(processed_df)
sentences = corpus.sentences

# Create and train the Word2Vec model with the hyperparameters declared above.
# NOTE(review): assumes Word2VecModel.fit accepts these keyword names -
# confirm against the installed MatNexus version.
vec_generator = VecGenerator.Word2VecModel(sentences)
vec_generator.fit(
    sg=sg,
    vector_size=vector_size,
    hs=hs,
    window=window,
    min_count=min_count,
    workers=workers,
)

# Save the model to the specified path
vec_generator.save(model_path)

print(f"Word2Vec model saved to {model_path}")


Word2Vec model saved to HEA_electrocatalyst_iterative.model


In [3]:
def _read_edx_table(file_path):
    """Load one EDX export into a DataFrame, choosing the reader by extension.

    The extension is compared case-insensitively, so ``MAP.CSV`` is parsed as
    CSV instead of being handed to the Excel reader.
    """
    if str(file_path).lower().endswith(".csv"):
        return pd.read_csv(file_path)
    return pd.read_excel(file_path)


def _add_composition_column(df):
    """Strip bookkeeping columns, coerce to numeric and build ``Composition``.

    Returns:
        tuple: ``(frame, element_columns)``. ``frame`` has a positional
        ``index`` column and, when ``element_columns`` is non-empty, a
        ``Composition`` string column such as ``"Ag12.50Au40.00Pd47.50"``.
    """
    drop_cols = ["Index", "Spektrum", "x", "y", "Unnamed: 0"]
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])
    # Keep the original row position as an "index" column; it is reported in
    # the top-5 summary.
    df = df.reset_index()
    df = df.apply(pd.to_numeric, errors="coerce")

    # A column that is NaN everywhere after coercion held no numeric data, so
    # it must not contribute "<name>nan" to the formula string.
    element_columns = [
        col
        for col in df.select_dtypes(include="number").columns
        if col != "index" and df[col].notna().any()
    ]
    if element_columns:
        df["Composition"] = df[element_columns].apply(
            lambda row: "".join(f"{col}{row[col]:.2f}" for col in element_columns),
            axis=1,
        )
    return df, element_columns


def process_all_edx_folders_and_save_csv(base_dir, model_path, summary_output_file="Top5_Similarity_Summary.csv"):
    """Score every EDX table under ``base_dir`` against the word "resistance".

    Each immediate sub-folder of ``base_dir`` is scanned for ``.csv``/``.xlsx``
    files (case-insensitive). Files whose name contains ``_Similarity`` or
    ``_processed``, or "resistance" in any casing, are skipped. For each
    remaining file a composition string is built per row, turned into a vector
    with the loaded model, and compared to the "resistance" vector by cosine
    similarity.

    Side effects:
        * Writes ``<name>_Similarity.csv`` next to every processed file.
        * Writes ``summary_output_file`` into ``base_dir`` with the five
          highest-scoring rows of every processed file.
        * Prints progress and error messages.

    Args:
        base_dir: Directory whose sub-folders contain the EDX tables.
        model_path: Path of the saved Word2Vec model to load.
        summary_output_file: File name of the combined top-5 summary.

    Returns:
        None.
    """
    print(f"[INFO] Loading Word2Vec model from: {model_path}")
    model = VecGenerator.Word2VecModel.load(model_path)
    print("[INFO] Model loaded successfully.")

    # The reference vector is the same for every file, so look it up once.
    resistance_vec = VecGenerator.VectorOperations.get_vector("resistance", model)
    top5_all = []

    # Sorted listings make the processing order, and therefore the row order
    # of the summary file, reproducible across platforms.
    for folder in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        found_file = False

        for file in sorted(os.listdir(folder_path)):
            # Skip artefacts produced by earlier runs of this pipeline.
            if "_Similarity" in file or "_processed" in file:
                continue
            lowered = file.lower()
            if not lowered.endswith((".csv", ".xlsx")) or "resistance" in lowered:
                continue

            found_file = True
            file_path = os.path.join(folder_path, file)
            try:
                df = _read_edx_table(file_path)
            except Exception as e:
                print(f"[ERROR] Failed to read file: {e}")
                continue

            if df.empty:
                print("[WARNING] Empty DataFrame. Skipping.")
                continue

            df, element_columns = _add_composition_column(df)
            if not element_columns:
                print(f"[WARNING] No numeric element columns in {file_path}. Skipping.")
                continue

            print(df.columns)
            print("[INFO] Generating composition vectors...")
            try:
                # Vectors are kept out of the frame so it can be written to
                # CSV without a temporary column having to be dropped.
                composition_vectors = df["Composition"].apply(
                    lambda formula: VecGenerator.VectorOperations.generate_material_vector(formula, model)
                )
            except Exception as e:
                print(f"[ERROR] Vector generation failed: {e}")
                continue

            df["Similarity_to_Resistance"] = composition_vectors.apply(
                lambda vec: cosine_similarity([vec], [resistance_vec])[0][0]
            )

            top5 = df.nlargest(5, "Similarity_to_Resistance").copy()
            top5["SourceFile"] = file
            top5["Folder"] = folder
            top5_all.append(
                top5[["index", "Composition", "Similarity_to_Resistance", "SourceFile", "Folder"]]
            )

            output_file = os.path.join(folder_path, f"{os.path.splitext(file)[0]}_Similarity.csv")
            df.to_csv(output_file, index=False)

        if not found_file:
            print("[INFO] No EDX file found in this folder.")

    if top5_all:
        summary_df = pd.concat(top5_all, ignore_index=True)
        summary_path = os.path.join(base_dir, summary_output_file)
        summary_df.to_csv(summary_path, index=False)
    else:
        print("[INFO] No similarity data was calculated to summarize.")


In [4]:

# Locate the raw resistance data: two directory levels above the notebook's
# working directory, under data/raw/resistance_raw.
notebook_dir = Path.cwd().resolve()
base_dir = notebook_dir.parents[1]
resistance_raw_path = base_dir.joinpath("data", "raw", "resistance_raw")

# Score every EDX table found there with the previously saved model.
process_all_edx_folders_and_save_csv(
    resistance_raw_path,
    "HEA_electrocatalyst_iterative.model",
)


[INFO] Loading Word2Vec model from: HEA_electrocatalyst_iterative.model
[INFO] Model loaded successfully.
Index(['index', 'Ag', 'Au', 'Pd', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Ag', 'Au', 'Pd', 'Pt', 'Rh', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Au', 'Pd', 'Pt', 'Rh', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Au', 'Pd', 'Pt', 'Rh', 'Ru', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Ru', 'Rh', 'Pd', 'Ir', 'Pt', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Au', 'Cu', 'Pd', 'Pt', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Ag', 'Au', 'Pd', 'Pt', 'Composition'], dtype='object')
[INFO] Generating composition vectors...
Index(['index', 'Ag', 'Au', 'Cu', 'Pd', 'Pt', 'Composition'], dtype='object')
[INFO] Genera