### Load dataset

In [1]:
from datasets import load_dataset
path= "/work/so87pot/material_db/matbench_text/test_matbench_dielectric_4.json"
ds = load_dataset("json", data_files=path,split="train")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
ds

Dataset({
    features: ['cif_p1', 'cif_symmetrized', 'cif_bonding', 'slice', 'composition', 'crystal_llm_rep', 'robocrys_rep', 'wycoff_rep'],
    num_rows: 952
})

In [4]:
dataset = ds.train_test_split(shuffle=True, test_size=0.2, seed=42)

In [5]:
dataset 

DatasetDict({
    train: Dataset({
        features: ['cif_p1', 'cif_symmetrized', 'cif_bonding', 'slice', 'composition', 'crystal_llm_rep', 'robocrys_rep', 'wycoff_rep'],
        num_rows: 3048
    })
    test: Dataset({
        features: ['cif_p1', 'cif_symmetrized', 'cif_bonding', 'slice', 'composition', 'crystal_llm_rep', 'robocrys_rep', 'wycoff_rep'],
        num_rows: 763
    })
})

In [16]:
import pandas as pd
# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(ds)

# Filter only unique rows based on 'material_id'
unique_df = df.drop_duplicates(subset='cif_p1')

# Convert the unique DataFrame back to a dictionary
unique_data = unique_df.to_dict('records')

print("Number of unique rows based on slice:", len(unique_data))

Number of unique rows based on slice: 2032198


### Remove duplicates based on Nomad Material ID

In [4]:
import json

# Read the JSON file
with open('/work/so87pot/material_db/structllm_dataset/combined.json', 'r') as file:
    data = json.load(file)

# Create a dictionary to store unique entries based on material_id
unique_entries = {}

# Iterate over the data and filter out duplicates
for entry in data:
    material_id = entry['material_id']
    if material_id not in unique_entries:
        unique_entries[material_id] = entry

# Convert the dictionary values back to a list
filtered_data = list(unique_entries.values())

# Write the filtered data back to a new JSON file
with open('/work/so87pot/material_db/structllm_dataset/uique_combined.json', 'w') as file:
    json.dump(filtered_data, file, indent=4)


### Filter entries with None in any of representations

In [8]:
from datasets import load_dataset
import json

# Filter out entries where any of the specified fields is None
filtered_dataset = [entry for entry in ds if all(entry[field] is not None for field in ['cif_p1', 'slice', 'cif_symmetrized', 'crystal_llm_rep'])]

# Save the filtered dataset back to a JSON file
output_file_path = "/work/so87pot/material_db/structllm_dataset/filtered_data.json"
with open(output_file_path, "w") as file:
    json.dump(filtered_dataset, file, indent=4)


In [None]:
filtered_dataset = ds.filter(lambda example: 
    example['cif_p1'] is not None and 
    example['slice'] is not None and 
    example['cif_symmetrized'] is not None and 
    example['crystal_llm_rep'] is not None)


### Train validation split with all duplicate entries (slice) in one fold

In [18]:
from sklearn.model_selection import train_test_split
from collections import Counter
import json

# Function to create a mask for non-unique entries in specified fields
def create_non_unique_mask(dataset, fields):
    counts = Counter()
    for example in dataset:
        key = tuple(example[field] for field in fields)
        counts[key] += 1
    mask = [counts[tuple(example[field] for field in fields)] > 1 for example in dataset]
    return mask

# Load the dataset
path= "/work/so87pot/material_db/structllm_dataset/filtered_data.json"
dataset = load_dataset("json", data_files=path,split="train")

# Specify the fields for which non-unique entries should be grouped together
fields_to_group = ['slice', 'cif_p1', 'cif_symmetrized', 'crystal_llm_rep']

# Create a mask for non-unique entries in specified fields
non_unique_mask = create_non_unique_mask(dataset, fields_to_group)

# Split the dataset into train and test while maintaining the distribution of non-unique entries
train_indices, test_indices = train_test_split(range(len(dataset)), test_size=0.01, stratify=non_unique_mask)

# Create train and test splits
train_set = [dataset[i] for i in train_indices]
test_set = [dataset[i] for i in test_indices]

# Save train and test splits as JSON files
train_output_file = "/work/so87pot/material_db/structllm_dataset/train.json"
test_output_file = "/work/so87pot/material_db/structllm_dataset/test.json"

with open(train_output_file, "w") as train_file:
    json.dump(train_set, train_file, indent=4)

with open(test_output_file, "w") as test_file:
    json.dump(test_set, test_file, indent=4)


In [None]:
from sklearn.model_selection import train_test_split
from datasets import  DatasetDict

train_ds, val_ds = train_test_split(ds, test_size=0.2, random_state=42)

dataset_dict = DatasetDict({"train": train_ds, "validation": val_ds})

### Preapare training set of different sizes

In [5]:
from datasets import load_dataset
import json

path = "/work/so87pot/material_db/structllm_dataset/train.json"
ds = load_dataset("json", data_files=path, split="train")

# Function to sample structures and save as JSON file
def sample_and_save_dataset(dataset, sample_size, output_file):
    sampled_data = dataset.select(range(sample_size))
    sampled_data = [dict(entry) for entry in sampled_data]  # Convert Dataset to list of dictionaries
    with open(output_file, "w") as file:
        json.dump(sampled_data, file, indent=4)

# Specify sizes for the datasets
dataset_sizes = [30, 100, 300]  # Add more sizes as needed

# Sample datasets of different sizes
for size in dataset_sizes:
    # Adjust train_size parameter to control the size of the sample
    size *= 1000  # Convert size from K to actual number of examples
    sampled_data = ds.shuffle(seed=42).select(range(size))
    output_file = f"/work/so87pot/material_db/structllm_dataset/train_{size}K.json"
    sample_and_save_dataset(sampled_data, size, output_file)
    print(f"Sampled dataset of size {size}K saved to {output_file}")


Sampled dataset of size 30000K saved to /work/so87pot/material_db/structllm_dataset/train_30000K.json
Sampled dataset of size 100000K saved to /work/so87pot/material_db/structllm_dataset/train_100000K.json
Sampled dataset of size 300000K saved to /work/so87pot/material_db/structllm_dataset/train_300000K.json


### APPENDIX - Unique entry exploration

In [None]:
filtered_dataset = ds.filter(lambda example: example['robocrys_rep'] is not None)

In [22]:
import pandas as pd
# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(ds)

# Filter only unique rows based on 'material_id'
unique_df = df.drop_duplicates(subset='cif_symmetrized')

# Convert the unique DataFrame back to a dictionary
unique_data = unique_df.to_dict('records')

print("Number of unique rows based on material_id:", len(unique_data))

Number of unique rows based on material_id: 83934


In [14]:
import pandas as pd

# Assuming ds is a pandas DataFrame
# If not, you can convert it to a DataFrame first
df = ds.to_pandas()

# Find non-unique material IDs
non_unique_ids = df[df.duplicated('material_id')]
#non_unique_ids = df[df.duplicated('material_id')]['material_id'].unique()

# Print non-unique material IDs
if len(non_unique_ids) > 0:
    # print("Non-unique material IDs:")
    # for id in non_unique_ids:
    #     print(id)
    print("Number of non-unique material IDs:", len(non_unique_ids))
else:
    print("All material IDs are unique.")


Number of non-unique material IDs: 2804


In [16]:
non_unique_ids.to_csv("non_unique_material_ids.csv")

In [53]:
non_unique_ids['material_id']

626      RrZXyLGk6Xk9iSFJRG71uGXBRszP
1287     Ttw5LXNw65Ps8ijvGKPOEq_aN-5a
1684     ts__UNut7x07NFC7lEKbQayiMrja
1979     EMhNSVY_nmSbWKfOxVKWg3vKbCEv
2101     RrZXyLGk6Xk9iSFJRG71uGXBRszP
                     ...             
86403    RBwb2zfiwPJAsxt99eyrlhUXO1Pr
86426    8uNdpE6Zvv65UMOSsXYae31YBt6O
86427    6Up0BfVeVWFaI0duWEVMUwCLFQIh
86451    2WotedHWs4tcLMIa8WMgB9h3eEUJ
86463    GCytew0lCW2hgMsj132JgZ4djYmi
Name: material_id, Length: 2804, dtype: object

### Handle JSON Decoding error by re-writing the json files in correct format

In [1]:
from typing import List, Dict
import json
def read_json_lines(json_file: str) -> List[Dict]:
    """
    Read JSON data from a file with each line containing a JSON object.

    Args:
        json_file (str): The path to the JSON file.

    Returns:
        List[Dict]: A list of dictionaries containing the JSON data.
    """
    data = []
    with open(json_file, 'r') as file:
        for line in file:
            try:
                json_obj = json.loads(line)
                data.append(json_obj)
            except json.JSONDecodeError:
                print(f"Ignoring invalid JSON object: {line}")
    return data

def save_to_json(data, file_path):
    with open(file_path, 'w') as file:
        json.dump(data, file)
        

In [2]:
import os

def process_json_files(input_dir: str, output_dir: str):
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Iterate over all files in the input directory
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):  # Check if the file is a JSON file
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            
            # Read JSON data from the input file
            data = read_json_lines(input_path)
            
            # Save the data to a new JSON file in the output directory
            save_to_json(data, output_path)
            print(f"Processed: {input_path}")

# Example usage:
input_directory = '/path/to/input_directory'
output_directory = '/path/to/output_directory'
process_json_files(input_directory, output_directory)


In [3]:
def prep(data, filepaths):
    for data, path in zip(data, filepaths):
        data = read_json_lines(data)
        save_to_json(data, path)


prep(data,filepaths)

### Save data of one representation as CSV (to train BPE)

In [None]:
import pandas as pd

# Assuming ds is your dataset loaded using load_dataset()

# Convert the 'robocrys_rep' column to a pandas DataFrame
robocrys_df = pd.DataFrame(filtered_dataset['robocrys_rep'], columns=['robocrys_rep'])

# Save the DataFrame to a CSV file
robocrys_df.to_csv('/work/so87pot/material_db/structllm_dataset/robocrys_4.csv', index=False)


### Combine JSON files to one DATASET split

In [4]:
import json
import os

def combine_json_files(folder_path):
    combined_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                data = json.load(file)
                combined_data.extend(data)
    return combined_data

folder_path = '/work/so87pot/material_db/structllm_dataset'
combined_json = combine_json_files(folder_path)

# Write the combined JSON data to a new file
output_file_path = '/work/so87pot/material_db/structllm_dataset/combined.json'
with open(output_file_path, 'w') as output_file:
    json.dump(combined_json, output_file, indent=4)

print("Combined JSON data has been written to:", output_file_path)


Combined JSON data has been written to: /work/so87pot/material_db/structllm_dataset/combined.json
