In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Directory that holds the dataset's metadata table (and where the split
# metadata files are written later in the notebook).
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
dir_path = Path("C:/Users/abcd2/Datasets/2022_icml_lens_sim/geoff_30000")

# Join with the pathlib operator instead of formatting the Path into a string.
df_meta = pd.read_csv(dir_path / "metadata.csv")
print(df_meta.shape)

# Keep only the file name of each image. `img_path` stores "/"-separated paths
# from the machine that produced the data, so take everything after the last
# "/" rather than relying on the file name being exactly 13 characters long.
df_meta['img_name'] = df_meta['img_path'].str.rsplit("/", n=1).str[-1]
print(df_meta.shape)

df_meta.head(2)

(30000, 18)
(30000, 19)


Unnamed: 0,img_path,theta_E,gamma,center_x,center_y,e1,e2,gamma_ext,psi_ext,source_x,source_y,source_n_sersic,source_R_sersic,sersic_source_e1,sersic_source_e2,lens_light_n_sersic,lens_light_R_sersic,num_quasars,img_name
0,/content/drive/MyDrive/Lensing_Sim_Data/dev_25...,0.935763,2.024451,0.027409,-0.019474,0.078762,0.111293,0.018954,2.066616,-0.140048,0.104141,0.517302,0.198462,0.0,0.5,3.987202,0.931056,2.0,X_0000000.npy
1,/content/drive/MyDrive/Lensing_Sim_Data/dev_25...,1.425321,1.942201,-0.000247,-0.003383,0.056564,0.043854,0.013171,0.562552,0.156502,-0.041339,0.57878,0.188306,0.0,0.5,4.019903,1.096789,2.0,X_0000001.npy


In [3]:
# The metadata rows are ordered by PSF: there are 6 kinds of PSF stored as
# consecutive groups (psf1: rows 0-4999, psf2: rows 5000-9999, etc.).
# Each group is split into train/test on its own so that every PSF contributes
# the same train/test proportion.
n_chunks = 6

# Slice the frame positionally instead of calling np.array_split on a DataFrame:
# array_split goes through DataFrame.swapaxes, which is deprecated in
# pandas >= 2.1. The boundaries reproduce array_split's partition (the first
# `len(df_meta) % n_chunks` chunks receive one extra row).
chunk_size, n_larger_chunks = divmod(len(df_meta), n_chunks)
df_chunks = []
chunk_start = 0
for i in range(n_chunks):
    chunk_stop = chunk_start + chunk_size + (1 if i < n_larger_chunks else 0)
    df_chunks.append(df_meta.iloc[chunk_start:chunk_stop])
    chunk_start = chunk_stop

random_state = 0
test_size = 0.2
df_test_list = []
df_train_list = []

print(f"Split into {n_chunks} dfs\n")
for i, df_chunk in enumerate(df_chunks):
    print(f"index of the chunk {i} = {df_chunk.index}")

    # The same seed is passed for every chunk, which keeps the split reproducible.
    _df_train, _df_test = train_test_split(df_chunk, test_size=test_size, random_state=random_state)
    df_train_list.append(_df_train)
    df_test_list.append(_df_test)
    print(f"    train samples = {_df_train.shape[0]}")
    print(f"    test samples = {_df_test.shape[0]}\n")


print("Concatenate the chunks")
df_train = pd.concat(df_train_list)
df_test = pd.concat(df_test_list)

# Sanity checks: every row lands in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df_meta)
assert df_train.index.intersection(df_test.index).empty

Split into 6 dfs

index of the chunk 0 = RangeIndex(start=0, stop=5000, step=1)
    train samples = 4000
    test samples = 1000

index of the chunk 1 = RangeIndex(start=5000, stop=10000, step=1)
    train samples = 4000
    test samples = 1000

index of the chunk 2 = RangeIndex(start=10000, stop=15000, step=1)
    train samples = 4000
    test samples = 1000

index of the chunk 3 = RangeIndex(start=15000, stop=20000, step=1)
    train samples = 4000
    test samples = 1000

index of the chunk 4 = RangeIndex(start=20000, stop=25000, step=1)
    train samples = 4000
    test samples = 1000

index of the chunk 5 = RangeIndex(start=25000, stop=30000, step=1)
    train samples = 4000
    test samples = 1000

Concatenate the chunks


In [4]:
# Quick look at the training split: (rows, columns), then its first two rows.
print(df_train.shape)
df_train.iloc[:2]

(24000, 19)


Unnamed: 0,img_path,theta_E,gamma,center_x,center_y,e1,e2,gamma_ext,psi_ext,source_x,source_y,source_n_sersic,source_R_sersic,sersic_source_e1,sersic_source_e2,lens_light_n_sersic,lens_light_R_sersic,num_quasars,img_name
2913,/content/drive/MyDrive/Lensing_Sim_Data/dev_25...,1.532587,2.362619,-0.014236,0.054528,0.3403,-0.127984,0.01269,3.234563,-0.228248,-0.148984,0.515455,0.203373,0.0,0.5,4.025867,1.010337,2.0,X_0002913.npy
3275,/content/drive/MyDrive/Lensing_Sim_Data/dev_25...,1.11734,1.983487,0.034227,0.004372,-0.138838,-0.246106,0.011625,1.948634,0.079322,-0.028258,0.564534,0.203465,0.0,0.5,3.980787,1.011608,4.0,X_0003275.npy


In [5]:
# Quick look at the held-out split: (rows, columns), then its first two rows.
print(df_test.shape)
df_test.iloc[:2]

(6000, 19)


Unnamed: 0,img_path,theta_E,gamma,center_x,center_y,e1,e2,gamma_ext,psi_ext,source_x,source_y,source_n_sersic,source_R_sersic,sersic_source_e1,sersic_source_e2,lens_light_n_sersic,lens_light_R_sersic,num_quasars,img_name
398,/content/drive/MyDrive/Lensing_Sim_Data/dev_25...,1.372771,2.028347,0.015123,0.023262,0.127864,-0.070566,0.013856,2.766934,-0.117461,-0.076908,0.502954,0.201018,0.0,0.5,4.23445,0.927794,2.0,X_0000398.npy
3833,/content/drive/MyDrive/Lensing_Sim_Data/dev_25...,0.619912,2.028633,0.004177,-0.022132,0.301019,-0.237736,0.02359,0.458588,-0.032704,-0.060639,0.530509,0.202213,0.0,0.5,3.914122,0.995293,4.0,X_0003833.npy


In [6]:
# Write the two splits next to the original metadata file. The row index is
# dropped (index=False), so only the columns are stored.
# Paths are joined with the pathlib operator instead of string formatting.
df_train.to_csv(dir_path / "metadata_train.csv", index=False)
df_test.to_csv(dir_path / "metadata_test.csv", index=False)