In [29]:
import rdkit
from rdkit import Chem
import pandas as pd
import json
import ast


In [25]:
df = pd.read_csv('MatDX_EF.csv')
df['formation_energy'][0]

"{'reference': {'Ni': 'https://nomad-lab.eu/prod/rae/gui/entry/id/6ITh58J5QcGWUDsmpR9t2A/DW3mGBiIv6M10SVC39rmNQ3rZWvj', 'Ta': 'https://nomad-lab.eu/prod/rae/gui/entry/id/mAAfBZKmTQWDVL1ZPRsrVQ/aiinBOOtWpDnw1fT1PNtrrjHAAiJ'}, 'value_per_atom': 0.10174024974414664, 'value': 1.0174024974414664}"

In [12]:
unique_elements = set()
for formula in df['formula']:
    # Extract elements based on uppercase letters followed by lowercase letters or numbers
    elements = [part for part in formula if part.isalpha()]
    unique_elements.update(elements)

# Count unique elements and unique space groups
num_unique_elements = len(unique_elements)
num_unique_space_groups = df['space_group'].nunique()

num_unique_formula = df['formula'].nunique()

num_unique_elements, num_unique_space_groups, num_unique_formula

(35, 78, 3282)

In [13]:
# Sample code to identify how many examples in 'df' have the same 'formula' but different 'space_group'

# Group by 'formula' and check unique 'space_group' counts for each group
duplicates_with_different_space_group = df.groupby('formula')['space_group'].nunique()

# Filter to find cases where there is more than one unique 'space_group' for the same 'formula'
duplicates_count = (duplicates_with_different_space_group > 1).sum()

duplicates_count


np.int64(687)

In [23]:
duplicates_with_different_space_group = df.groupby('formula')['space_group'].nunique()

# Filter for formulas with more than one unique 'space_group'
duplicates = duplicates_with_different_space_group[duplicates_with_different_space_group > 1]

# Display each formula and the count of unique space groups
# for formula, unique_count in duplicates.items():
#     print(f"Formula: {formula}, Unique Space Groups: {unique_count}")

len(duplicates)

687

In [27]:
# Count occurrences of each (formula, space_group) pair to find exact duplicates
duplicate_pairs = df.groupby(['formula', 'space_group']).size()

# Filter for pairs that occur more than once
exact_duplicates = duplicate_pairs[duplicate_pairs > 1]

# Display the result
exact_duplicates_df = exact_duplicates.reset_index(name='count')
exact_duplicates_df


Unnamed: 0,formula,space_group,count
0,Ac4Br6,R-3c,2
1,Ac4I6,R-3c,2
2,AcAl3,I4/mmm,2
3,AcMn,P4/mmm,2
4,Ag2Al,I4/mmm,2
...,...,...,...
651,WBr,Cmmm,2
652,WP3,Fm-3m,2
653,WSb,Pm-3m,4
654,YbBr,Fm-3m,2


In [36]:
df = df.reset_index().rename(columns={'index': 'original_index'})

# Count occurrences of each (formula, space_group) pair to find exact duplicates
duplicate_pairs = df.groupby(['formula', 'space_group']).size()
exact_duplicates = duplicate_pairs[duplicate_pairs > 1].reset_index(name='count')

# Filter the original DataFrame to include only exact duplicate pairs and keep original indices
duplicate_df = df.merge(exact_duplicates[['formula', 'space_group']], on=['formula', 'space_group'])


duplicate_df['formation_energy'] = duplicate_df['formation_energy'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Extract 'value' from the 'formation_energy' dictionary
duplicate_df['formation_energy_value'] = duplicate_df['formation_energy'].apply(lambda x: x.get('value') if isinstance(x, dict) else None)

# Group by 'formula' and 'space_group' to count unique 'formation_energy_value'
value_consistency = duplicate_df.groupby(['formula', 'space_group'])['formation_energy_value'].nunique()
different_values_df = value_consistency[value_consistency > 1].reset_index(name='unique_value_count')

# Count datapoints for each (formula, space_group) in duplicate_df and include original indices
datapoint_counts = duplicate_df.groupby(['formula', 'space_group']).agg({'original_index': list, 'formation_energy_value': 'size'}).reset_index()
datapoint_counts = datapoint_counts.rename(columns={'formation_energy_value': 'datapoint_count'})

# Merge with the unique_value_count to show the count of unique formation energy values per pair
merged_counts = datapoint_counts.merge(different_values_df, on=['formula', 'space_group'], how='left').fillna(0)
merged_counts['unique_value_count'] = merged_counts['unique_value_count'].astype(int)

# Display the final result
merged_counts

Unnamed: 0,formula,space_group,original_index,datapoint_count,unique_value_count
0,Ac4Br6,R-3c,"[585, 3834]",2,2
1,Ac4I6,R-3c,"[14, 1021]",2,2
2,AcAl3,I4/mmm,"[1840, 1967]",2,2
3,AcMn,P4/mmm,"[82, 255]",2,2
4,Ag2Al,I4/mmm,"[572, 1002]",2,2
...,...,...,...,...,...
651,WBr,Cmmm,"[3046, 4520]",2,2
652,WP3,Fm-3m,"[1934, 2060]",2,2
653,WSb,Pm-3m,"[244, 1098, 2117, 2460]",4,4
654,YbBr,Fm-3m,"[3119, 3129]",2,2


In [34]:
duplicate_df

Unnamed: 0,formula,space_group,structure,id,formation_energy,formation_energy_value
0,Ir2Na2,Imma,"[{'data': {'a': [-1.36776653e-10, 2.60817299e-...",PN_BAO7ZSBZZCMA7PH7QMR7WLDI72GOELND,{'reference': {'Na': 'https://nomad-lab.eu/pro...,1.888520
1,Sr6Br4,R-3c,"[{'data': {'a': [-7.710824190000001e-10, 0, 0]...",PN_ZYAHBKLTYXJOLYLD5YFTVBNOSDYIGR6J,{'reference': {'Br': 'https://nomad-lab.eu/pro...,-11.913768
2,InSn5,Cm,"[{'data': {'a': [1.68095558e-10, 6.34838737000...",PN_7OK3KACOB6F3RRLJTWVVBYDZFINCNOIS,{'reference': {'In': 'https://nomad-lab.eu/pro...,0.002133
3,Ga2Sr,P6/mmm,"[{'data': {'a': [4.35579e-10, -3.7e-15, 0], 'b...",PN_ZDKNO2QVSUX4H46TI7XIY2SR75JR6I4L,{'reference': {'Ga': 'https://nomad-lab.eu/pro...,-1.601563
4,Pd2Ta2,P4/nmm,"[{'data': {'a': [0, -2.3350759e-10, -2.3350759...",PN_2THSMK54QTI4S5CDHJ5HHAJ2UZJXQI7E,{'reference': {'Pd': 'https://nomad-lab.eu/pro...,-1.429038
...,...,...,...,...,...,...
1478,Cu2W,Immm,"[{'data': {'a': [3.75263798e-10, 0, 0], 'b': [...",PN_G2UCKGHAVQE4P75WPPF56HJPPJPA4M5K,{'reference': {'Cu': 'https://nomad-lab.eu/pro...,2.725505
1479,Ag4Al2,Cmcm,"[{'data': {'a': [1.47221306e-10, -7.45385031e-...",PN_HONKWOZF4SL4XM27MTXHPOTWF3AKINEF,{'reference': {'Ag': 'https://nomad-lab.eu/pro...,-0.415697
1480,Ag4Al2,Cmcm,"[{'data': {'a': [1.48748225e-10, -7.23077817e-...",PN_LCUQPQWY4DLOZ4W5V3LWZ2U65NC6WVIT,{'reference': {'Ag': 'https://nomad-lab.eu/pro...,-0.290541
1481,Ba2Ir2,Cmcm,"[{'data': {'a': [1.5932730200000001e-10, -6.05...",PN_JD4XZSKWEZDKH7L7R5UWEUZ5VOW3KUPB,{'reference': {'Ir': 'https://nomad-lab.eu/pro...,2.163858


In [None]:
# JSON 문자열을 파이썬 딕셔너리로 변환
data = json.loads(data_str.replace("'", "\""))[0]['data']

# CIF 파일 포맷 생성
cif_content = "data_generated_structure\n\n"

# 격자 벡터 정보 추가
cif_content += f"_cell_length_a    {abs(data['a'][0]):.6f}\n"
cif_content += f"_cell_length_b    {abs(data['b'][1]):.6f}\n"
cif_content += f"_cell_length_c    {abs(data['c'][2]):.6f}\n"
cif_content += "_cell_angle_alpha   90.0\n"
cif_content += "_cell_angle_beta    90.0\n"
cif_content += "_cell_angle_gamma   120.0\n\n"

# 격자 정보의 대칭성 정보 (임시로 P1으로 지정)
cif_content += "loop_\n"
cif_content += "_symmetry_equiv_pos_as_xyz\n"
cif_content += "  'x, y, z'\n\n"

# 원자 정보 추가
cif_content += "loop_\n"
cif_content += "_atom_site_label\n"
cif_content += "_atom_site_type_symbol\n"
cif_content += "_atom_site_fract_x\n"
cif_content += "_atom_site_fract_y\n"
cif_content += "_atom_site_fract_z\n"

# 원자 좌표를 CIF에 추가
for atom in data['atoms']:
    element = atom['element']
    x = atom['x'] / abs(data['a'][0])
    y = atom['y'] / abs(data['b'][1])
    z = atom['z'] / abs(data['c'][2])
    cif_content += f"{element} {element} {x:.6f} {y:.6f} {z:.6f}\n"

# 생성된 CIF 내용 출력 또는 파일로 저장
with open("generated_structure.cif", "w") as file:
    file.write(cif_content)

print("CIF file has been created: generated_structure.cif")