In [1]:

import pandas as pd
# Notebook display configuration: show every column/row and never wrap or
# truncate cell contents when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
# None means "no width limit". The old sentinel -1 has been deprecated since
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

import numpy as np
import pickle

import os
import sys

# Put the sibling ../src directory on the import path (once) so that modules
# living there can be imported from this notebook.
module_path = os.path.abspath('../src')
already_importable = any(entry == module_path for entry in sys.path)
if not already_importable:
    sys.path.append(module_path)

  pd.set_option('max_colwidth', -1)


In [2]:
# Load the pickled relation table used by every later cell.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only point this at an artifact produced by a trusted pipeline.
counts_pickle_path = '../data/counts_1m_trim.pkl'
with open(counts_pickle_path, mode='rb') as pickle_file:
    counts_1m = pickle.load(pickle_file)


In [3]:
# Sanity check of the loaded table's dimensions before it is sliced below.
counts_1m.shape

(247160, 17)

In [4]:
# First-hop edges: keep only the columns that describe source -> cui_2, then
# shuffle all rows with a fixed seed so the chunks below are random but
# reproducible.
first_hop_cols = ['source_cui','source_text','rel','cui_2','cui_2_text','rel_type']
large_df = (
    counts_1m[first_hop_cols]
    .copy()
    .sample(frac = 1, random_state = 150)
    .reset_index(drop = True)
)

# Second-hop lookup table: rows whose rel_type is 'RO', with the columns
# renamed so that the edge's source lines up with the first hop's `cui_2`
# and its target becomes the third node of the path.
hop2 = counts_1m.loc[
    counts_1m.rel_type == 'RO',
    ['source_cui','rel','cui_2','cui_2_text'],
].copy()
hop2.columns = ['cui_2','rel_2','cui_3','cui_3_text']

# Cut the shuffled frame into `num_chunks` consecutive slices keyed
# 'df1'..'dfN'. Every slice has `chunk_size` rows except the last one, which
# also takes whatever remainder the integer division left over.
num_chunks = 5
chunk_size = len(large_df) // num_chunks
chunk_bounds = [k * chunk_size for k in range(num_chunks)] + [len(large_df)]
small_dataframes = {
    f'df{k+1}': large_df.iloc[chunk_bounds[k]:chunk_bounds[k + 1]]
    for k in range(num_chunks)
}

# Expand every first-hop chunk into two-hop paths (source -> cui_2 -> cui_3)
# and attach a space-joined text rendering of each path.
# NOTE: the entries of `small_dataframes` are overwritten with the joined
# frames, so this code is not idempotent; rebuild the chunks before running
# it a second time.
path_text_cols = ['source_text', 'rel', 'cui_2_text', 'rel_2', 'cui_3_text']

# Iterate over the chunks that actually exist instead of a hard-coded count.
for df_name in list(small_dataframes):
    small_df = small_dataframes[df_name]
    print(f"Dataframe {df_name}:")
    print(f"Shape: {small_df.shape}")
    # Inner join on the shared middle node: one output row per
    # (first-hop edge, second-hop edge) pair.
    small_df = small_df.merge(hop2, on = 'cui_2')
    # 1st and 3rd node cannot be the same.
    # .copy() detaches the filtered frame so that adding a column below does
    # not raise SettingWithCopyWarning.
    small_df = small_df[small_df.source_cui != small_df.cui_3].copy()
    # Build the path text column-wise; str() is applied to every piece, so
    # the resulting strings match a row-by-row ' '.join(...) while avoiding
    # DataFrame.apply(axis=1) over millions of rows.
    small_df['whole_text'] = [
        ' '.join(map(str, parts))
        for parts in zip(*(small_df[col] for col in path_text_cols))
    ]
    small_dataframes[df_name] = small_df
    print(f"Shape after join: {small_df.shape}")

Dataframe df1:
Shape: (49432, 6)
Shape after join: (4476067, 10)
Dataframe df2:
Shape: (49432, 6)
Shape after join: (4476641, 10)
Dataframe df3:
Shape: (49432, 6)
Shape after join: (4543464, 10)
Dataframe df4:
Shape: (49432, 6)
Shape after join: (4374090, 10)
Dataframe df5:
Shape: (49432, 6)
Shape after join: (4435315, 10)


## Even samples from batches 1–5 (up to 3 random paths per CUI at each path position)

In [16]:
# Function to randomly select up to `n` rows (3 by default) from each group
def select_random_rows(group, n=3, random_state=42):
    """Return a reproducible random subset of at most ``n`` rows of ``group``.

    Intended for ``DataFrame.groupby(...).apply``; extra arguments can be
    forwarded, e.g. ``.apply(select_random_rows, n=5)``.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group.
    n : int, default 3
        Number of rows to draw when the group has at least that many.
    random_state : int, default 42
        Seed handed to ``DataFrame.sample``. The same seed is used for every
        group, so groups of equal size are sampled at the same positions.

    Returns
    -------
    pandas.DataFrame
        ``n`` sampled rows, or ``group`` itself (unchanged, not copied) when
        it has fewer than ``n`` rows.
    """
    if len(group) < n:
        return group
    return group.sample(n, random_state=random_state)
# Even sampling across all chunks: for each chunk keep up to 3 random paths
# per distinct value of every path position (source_cui, cui_2, cui_3), then
# drop paths that were picked more than once (same `whole_text`).
group_cols = ['source_cui', 'cui_2', 'cui_3']

# Collect the per-chunk results and concatenate once at the end instead of
# growing a DataFrame inside the loop (which also avoids concatenating with
# an empty frame).
per_chunk_samples = []
for df_key in small_dataframes:
    # No defensive copy needed: groupby/apply below never mutates the chunk.
    chunk_paths = small_dataframes[df_key]
    picks = [
        chunk_paths
        .groupby(col, group_keys=False)
        .apply(select_random_rows)
        .reset_index(drop=True)
        for col in group_cols
    ]
    chunk_sample = (
        pd.concat(picks)
        .drop_duplicates('whole_text', keep = 'first')
        .reset_index(drop = True)
    )
    per_chunk_samples.append(chunk_sample)

# pd.concat raises on an empty list, so fall back to an empty frame.
if per_chunk_samples:
    df = pd.concat(per_chunk_samples, ignore_index = True)
else:
    df = pd.DataFrame()

In [19]:
# Report the size of the combined sample before and after removing paths
# whose text already appeared in an earlier chunk (first occurrence wins).
rows_cols_before = df.shape
print(rows_cols_before)
df = df.drop_duplicates('whole_text', keep = 'first').reset_index(drop = True)
rows_cols_after = df.shape
print(rows_cols_after)

(1255340, 10)
(1255340, 10)


In [21]:
# Split the de-duplicated sample into three consecutive, non-overlapping
# parts. Label-based `.loc[a:b]` slices include BOTH endpoints, which put
# each boundary row into two neighbouring parts; positional `.iloc` slices
# are half-open, so every row lands in exactly one part.
# The cut points are derived from the frame length instead of being
# hard-coded; the last part takes the remainder of the integer division.
part_size = len(df) // 3
sample_1 = df.iloc[:part_size]
sample_2 = df.iloc[part_size:2 * part_size]
sample_3 = df.iloc[2 * part_size:]

In [23]:
# Persist the three parts of the even sample, one CSV per part, without the
# index column.
even_batch_outputs = (
    (sample_1, "../data/1mm_even_batches/1mm_even_1.csv"),
    (sample_2, "../data/1mm_even_batches/1mm_even_2.csv"),
    (sample_3, "../data/1mm_even_batches/1mm_even_3.csv"),
)
for part_frame, csv_path in even_batch_outputs:
    part_frame.to_csv(csv_path, index = False)

### Batch 2

In [5]:

# df = small_dataframes['df2']

# sample_1 = df.sample(30000, random_state = 110)
# sample_2 = df[~df.index.isin(sample_1.index)].sample(30000, random_state = 111)
# indices = list(sample_1.index) + list(sample_2.index)
# sample_3 = df[~df.index.isin(indices)].sample(30000, random_state = 112)
# indices = list(sample_1.index) + list(sample_2.index) + list(sample_3.index)
# sample_4 = df[~df.index.isin(indices)].sample(30000, random_state = 113)

# sample_1.to_csv('../data/1mm_final_trim/1mm_batch_2_1.csv', index = False)
# sample_2.to_csv('../data/1mm_final_trim/1mm_batch_2_2.csv', index = False)
# sample_3.to_csv('../data/1mm_final_trim/1mm_batch_2_3.csv', index = False)
# sample_4.to_csv('../data/1mm_final_trim/1mm_batch_2_4.csv', index = False)

## Samples from batch 3

In [6]:
# Draw four disjoint random samples of 50,000 two-hop paths from chunk 3.
# After each draw the chosen rows are removed from the pool (by index label),
# so a later sample can never repeat a row of an earlier one. Each draw uses
# its own fixed seed.
df = small_dataframes['df3']

sample_1 = df.sample(50000, random_state = 110)
remaining = df.drop(index = sample_1.index)

sample_2 = remaining.sample(50000, random_state = 111)
remaining = remaining.drop(index = sample_2.index)

sample_3 = remaining.sample(50000, random_state = 112)
remaining = remaining.drop(index = sample_3.index)

sample_4 = remaining.sample(50000, random_state = 113)

# Index labels consumed by the first three samples.
indices = [label for drawn in (sample_1, sample_2, sample_3) for label in drawn.index]

# Write each of the four batch-3 samples to its own CSV (index omitted).
batch_3_outputs = (
    (sample_1, '../data/1mm_final_trim/1mm_batch_3_1.csv'),
    (sample_2, '../data/1mm_final_trim/1mm_batch_3_2.csv'),
    (sample_3, '../data/1mm_final_trim/1mm_batch_3_3.csv'),
    (sample_4, '../data/1mm_final_trim/1mm_batch_3_4.csv'),
)
for sample_frame, csv_path in batch_3_outputs:
    sample_frame.to_csv(csv_path, index = False)

In [None]:
# df = pd.read_csv('../data/1mm_batches/1mm_batch_df2.csv')

# sample_1 = df.sample(12000, random_state = 123)
# sample_2 = df[~df.index.isin(sample_1.index)].sample(12000, random_state = 123)
# indices = list(sample_1.index) + list(sample_2.index)
# sample_3 = df[~df.index.isin(indices)].sample(12000, random_state = 123)

# sample_1.to_csv('../data/1mm_batches/sub_batch_1/1mm_batch_df2_1.csv', index = False)
# sample_2.to_csv('../data/1mm_batches/sub_batch_1/1mm_batch_df2_2.csv', index = False)
# sample_3.to_csv('../data/1mm_batches/sub_batch_1/1mm_batch_df2_3.csv', index = False)

### Sample from batch 3

In [None]:
# df = pd.read_csv('../data/1mm_batches/1mm_batch_df3.csv')
# df = df[(df.rel != 'do not code with') & (df.rel != 'do not code with')].reset_index(drop = True)

# sample_1 = df.sample(30000, random_state = 123)
# sample_2 = df[~df.index.isin(sample_1.index)].sample(30000, random_state = 123)
# indices = list(sample_1.index) + list(sample_2.index)
# sample_3 = df[~df.index.isin(indices)].sample(30000, random_state = 123)

# sample_1.to_csv('../data/1mm_batches/sub_batch_1/1mm_batch_df3_1.csv', index = False)
# sample_2.to_csv('../data/1mm_batches/sub_batch_1/1mm_batch_df3_2.csv', index = False)
# sample_3.to_csv('../data/1mm_batches/sub_batch_1/1mm_batch_df3_3.csv', index = False)