# 1. Read original three csv files into DataFrames

In [24]:
import pandas as pd
# Load the three raw ID tables (positive, negative, candidate) from the
# working directory, one DataFrame per file.
positive_pre, negative_pre, candidate_pre = (
    pd.read_csv(csv_name)
    for csv_name in ('positive_pre.csv', 'negative_pre.csv', 'candidate_pre.csv')
)

In [25]:
# Display positive_pre as the cell output to inspect the loaded IDs.
positive_pre

Unnamed: 0,ID
0,mp-754958
1,mp-1080749
2,mp-36447
3,mp-546936
4,mp-755882
...,...
99,mp-1043484
100,mp-1106201
101,mp-542850
102,mp-560090


In [26]:
# Display negative_pre as the cell output to inspect the loaded IDs.
negative_pre

Unnamed: 0,ID
0,mp-1080351
1,mp-752798
2,mp-7739
3,mp-1518570
4,mp-2228371
...,...
21687,mp-755191
21688,mp-1211060
21689,mp-21330
21690,mp-1201549


In [27]:
# Display candidate_pre as the cell output to inspect the loaded IDs.
candidate_pre

Unnamed: 0,ID
0,mp-1228140
1,mp-1344634
2,mp-686442
3,mp-1195162
4,mp-1022188
...,...
45281,mp-753206
45282,mp-772964
45283,mp-556905
45284,mp-2230185


# 2. Check if there are common materials in positive_pre and negative_pre

In [28]:
# Inner-join the two tables on the shared 'ID' column: only IDs that occur
# in both positive_pre and negative_pre survive the join.
common_ids = pd.merge(positive_pre, negative_pre, how='inner', on='ID')

# Report the overlap.
print("\nCommon entries in 'ID' column of both DataFrames:")
print(common_ids)


Common entries in 'ID' column of both DataFrames:
           ID
0    mp-18745
1    mp-20612
2    mp-20970
3    mp-22973
4     mp-4233
5  mp-1018739
6    mp-28416
7  mp-1205500


#### Surprisingly, there are 8 materials that are claimed to be positive by Yanase's 2018 paper but that also appear in negative_pre. The explanation I can think of for now is this: supposing our categories for screening out negative materials are right, these 8 materials fall into negative_pre, yet due to some complicated symmetry properties they turn out to be positive after all. Maybe we should look at Yanase's paper more closely. (2025.01.10 note: another possible reason is an accuracy error.)
#### But for now, let us just exclude these 8 materials from negative_pre, and keep them in the positive_pre.
#### Also, next, let us see specifically under which category these 8 materials end up in negative_pre. There are two main categories: 1) an odd number of magnetic atoms in the primitive unit cell; 2) an even number of magnetic atoms, and then, further, three conditions (space inversion symmetry is present, there are two magnetic atoms in the primitive unit cell, and both magnetic atoms are at inversion centers).

In [29]:
# data_process.ipynb already produced the two intermediate files read here:
#   odd_atoms.csv   - materials with an odd number of magnetic atoms in the
#                     primitive unit cell;
#   con3_result.csv - materials remaining after the even-number-of-magnetic-
#                     atoms screen and the three further conditions.
# Both files must be present in the local folder.
odd_atoms, con3_result = (
    pd.read_csv(csv_name) for csv_name in ('odd_atoms.csv', 'con3_result.csv')
)

In [30]:
# Which positives were flagged by the odd-number-of-magnetic-atoms screen?
# The inner join on 'ID' keeps only IDs present in both tables.
common_ids = pd.merge(positive_pre, odd_atoms, how='inner', on='ID')

# Report the overlap (header line and table, each on its own line).
print("\nCommon entries in 'ID' column of both DataFrames:", common_ids, sep="\n")


Common entries in 'ID' column of both DataFrames:
           ID
0    mp-28416
1  mp-1205500


In [31]:
# Which positives were flagged by the con3_result screen?
# The inner join on 'ID' keeps only IDs present in both tables.
common_ids = pd.merge(positive_pre, con3_result, how='inner', on='ID')

# Report the overlap (header line and table, each on its own line).
print("\nCommon entries in 'ID' column of both DataFrames:", common_ids, sep="\n")


Common entries in 'ID' column of both DataFrames:
           ID
0    mp-18745
1    mp-20612
2    mp-20970
3    mp-22973
4     mp-4233
5  mp-1018739


# 3. Exclude the 8 materials from negative_pre to obtain negative_pre_8excluded

In [32]:
# Flag every negative_pre row whose ID also appears in positive_pre, then
# keep only the unflagged rows.
in_positive = negative_pre['ID'].isin(positive_pre['ID'])
negative_pre_8excluded = negative_pre.loc[~in_positive]
negative_pre_8excluded

Unnamed: 0,ID
0,mp-1080351
1,mp-752798
2,mp-7739
3,mp-1518570
4,mp-2228371
...,...
21687,mp-755191
21688,mp-1211060
21689,mp-21330
21690,mp-1201549


In [33]:
# Sanity check: after the exclusion, an inner join on 'ID' should find no
# material shared between positive_pre and negative_pre_8excluded.
common_ids = pd.merge(positive_pre, negative_pre_8excluded, how='inner', on='ID')

# Report the (expected empty) overlap.
print("\nCommon entries in 'ID' column of both DataFrames:", common_ids, sep="\n")


Common entries in 'ID' column of both DataFrames:
Empty DataFrame
Columns: [ID]
Index: []


# 4. Exclude any materials in candidate_pre that are in positive_pre

In [34]:
# IDs shared by positive_pre and candidate_pre (inner join on 'ID');
# the result is displayed as the cell output.
common_ids = pd.merge(positive_pre, candidate_pre, how='inner', on='ID')
common_ids

Unnamed: 0,ID
0,mp-754958
1,mp-1080749
2,mp-36447
3,mp-546936
4,mp-755882
...,...
64,mp-559680
65,mp-19367
66,mp-1043484
67,mp-1106201


In [35]:
# Flag candidate_pre rows whose ID is already listed in positive_pre and
# drop them; the remaining rows are called `candidate` from here on.
already_positive = candidate_pre['ID'].isin(positive_pre['ID'])
candidate = candidate_pre.loc[~already_positive]
candidate

Unnamed: 0,ID
0,mp-1228140
1,mp-1344634
2,mp-686442
3,mp-1195162
4,mp-1022188
...,...
45281,mp-753206
45282,mp-772964
45283,mp-556905
45284,mp-2230185


# 5. Randomly split positive_pre into positive and test_positive

In [36]:
import pandas as pd

# Number of rows held out as test_positive.
num_rows_test_positive = 14

# A full-fraction sample with a fixed seed is a reproducible permutation of
# the rows; the index is then rebuilt so it runs 0..len-1 in shuffled order.
positive_pre_shuffled = positive_pre.sample(frac=1, random_state=666)
positive_pre_shuffled = positive_pre_shuffled.reset_index(drop=True)

# The leading rows of the shuffled table become the test set and
# everything after them stays in `positive`.
test_positive = positive_pre_shuffled.iloc[:num_rows_test_positive]
positive = positive_pre_shuffled.iloc[num_rows_test_positive:]

In [37]:
# Display `positive`: the shuffled rows left after the test_positive rows.
positive

Unnamed: 0,ID
14,mp-4340
15,mp-1080749
16,mp-569858
17,mp-19399
18,mp-3579
...,...
99,mp-22417
100,mp-31513
101,mp-2719
102,mp-1018800


In [38]:
# Display `test_positive`: the leading rows of the shuffled positives.
test_positive

Unnamed: 0,ID
0,mp-567240
1,mp-20128
2,mp-1106201
3,mp-30409
4,mp-21131
5,mp-19857
6,mp-8613
7,mp-29207
8,mp-746030
9,mp-28416


#### Note: the randomness comes from positive_pre_shuffled. After this shuffle, the first 14 rows are taken as test_positive and the remaining rows are positive.

# 6. Randomly split negative_pre_8excluded into negative and test_negative

In [39]:
import pandas as pd

# Number of rows held out as test_negative.
num_rows_test_negative = 83

# A full-fraction sample with a fixed seed is a reproducible permutation of
# the rows; the index is then rebuilt so it runs 0..len-1 in shuffled order.
negative_pre_8excluded_shuffled = negative_pre_8excluded.sample(frac=1, random_state=666)
negative_pre_8excluded_shuffled = negative_pre_8excluded_shuffled.reset_index(drop=True)

# The leading rows of the shuffled table become the test set and
# everything after them stays in `negative`.
test_negative = negative_pre_8excluded_shuffled.iloc[:num_rows_test_negative]
negative = negative_pre_8excluded_shuffled.iloc[num_rows_test_negative:]

In [40]:
# Display `negative`: the shuffled rows left after the test_negative rows.
negative

Unnamed: 0,ID
83,mp-1183723
84,mp-1519221
85,mp-1184914
86,mp-2470
87,mp-2210754
...,...
21679,mp-1217954
21680,mp-1518903
21681,mp-10209
21682,mp-1223176


In [41]:
# Display `test_negative`: the leading rows of the shuffled negatives.
test_negative

Unnamed: 0,ID
0,mp-1034697
1,mp-1028272
2,mp-689925
3,mp-1233009
4,mp-1018084
...,...
78,mp-2714651
79,mp-545756
80,mp-772554
81,mp-1520168


# 7. Save DataFrames to csv files in save_files

In [42]:
# Write the five final tables to CSV files under save_files/.
import os

# to_csv does not create missing parent directories, so make sure the
# output folder exists first (no-op if it is already there).
os.makedirs('save_files', exist_ok=True)

# index=False: write only the data columns, not the DataFrame index.
positive.to_csv('save_files/positive.csv', index=False)
test_positive.to_csv('save_files/test_positive.csv', index=False)
negative.to_csv('save_files/negative.csv', index=False)
test_negative.to_csv('save_files/test_negative.csv', index=False)
candidate.to_csv('save_files/candidate.csv', index=False)

# Appendix 1. Sample datasets

In [52]:
# Draw small, reproducible samples of the 'ID' column of each final table
# and write them to CSV files under save_files_sample/.
import os

# Fixed seed so the same IDs are drawn on every run; the index of each
# sampled Series is rebuilt to run from 0.
sample_positive = positive['ID'].sample(n=10, random_state=666).reset_index(drop=True)
sample_negative = negative['ID'].sample(n=30, random_state=666).reset_index(drop=True)
sample_candidate = candidate['ID'].sample(n=44, random_state=666).reset_index(drop=True)

# to_csv does not create missing parent directories, so make sure the
# output folder exists first (no-op if it is already there).
os.makedirs('save_files_sample', exist_ok=True)

# index=False: write only the sampled IDs, not the index.
sample_positive.to_csv('save_files_sample/sample_positive.csv', index=False)
sample_negative.to_csv('save_files_sample/sample_negative.csv', index=False)
sample_candidate.to_csv('save_files_sample/sample_candidate.csv', index=False)