In [1]:

import pandas as pd
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('max_colwidth', -1)

import numpy as np
import pickle

import os
import sys

module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from tqdm import tqdm
from utils import *

In [3]:
# Load the combined titles table. Later cells read its `source_cui`, `cui_2`, `cui_3`,
# `relation 1`, `relation 2` and `result` columns.
df = pd.read_csv('../data/processed/combined_1mm_titles_v2.csv')

In [4]:
# Load the pickled bundle of dictionaries and keep its 'desc_dict' entry,
# which the next cell indexes by CUI.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts you trust.
with open('../data/all_dicts.pkl', mode='rb') as dict_file:
    all_dicts = pickle.load(dict_file)

desc_dict = all_dicts['desc_dict']

In [5]:
# Attach the desc_dict entry for each of the three CUI columns.
# Direct indexing means an unknown CUI raises KeyError rather than producing NaN.
for text_col, cui_col in (
    ('source_text', 'source_cui'),
    ('cui_2_text', 'cui_2'),
    ('cui_3_text', 'cui_3'),
):
    df[text_col] = df[cui_col].apply(lambda cui: desc_dict[cui])

In [6]:
import unidecode 
def get_point(n):
    """Normalise a raw text string into plain, single-spaced ASCII.

    Parentheses and double quotes are removed, commas and underscores are
    turned into spaces, every run of spaces is collapsed to one space, the
    text is transliterated with ``unidecode`` and surrounding whitespace is
    stripped.

    Args:
        n: The raw string to clean.

    Returns:
        The cleaned string.
    """
    n = n.strip()
    for unwanted in ('(', ')', '"'):
        n = n.replace(unwanted, '')
    for separator in (',', '_'):
        n = n.replace(separator, ' ')

    # A single replace('  ', ' ') pass only halves a run of spaces (three spaces
    # become two), so repeat until no double space is left.
    while '  ' in n:
        n = n.replace('  ', ' ')

    n = unidecode.unidecode(n)

    # Strip again: removing punctuation or transliterating can expose whitespace
    # at the edges (e.g. '(x )' -> 'x ').
    return n.strip()

In [7]:
# Run get_point over each looked-up text column to produce its *_clean counterpart.
for clean_col, text_col in (
    ('source_clean', 'source_text'),
    ('cui_2_clean', 'cui_2_text'),
    ('cui_3_clean', 'cui_3_text'),
):
    df[clean_col] = df[text_col].apply(get_point)

In [8]:
def _format_source(row):
    """Render the model input: the three cleaned terms, a ``</s>`` separator, then `result`."""
    head = row['source_clean']
    middle = row['cui_2_clean']
    tail = row['cui_3_clean']
    return f"['{head}', '{middle}','{tail}'] </s> {row['result']}"


def _format_target(row):
    """Render the model target: two [term, relation, term] triples chained through cui_2."""
    head = row['source_clean']
    middle = row['cui_2_clean']
    tail = row['cui_3_clean']
    first_relation = row['relation 1']
    second_relation = row['relation 2']
    return f"[['{head}', '{first_relation}', '{middle}'], ['{middle}', '{second_relation}', '{tail}']]"


# Row-wise rendering of the text pair that is exported later in the notebook.
df['source'] = df.apply(_format_source, axis=1)
df['target'] = df.apply(_format_target, axis=1)

In [9]:
# Clean both relation labels with get_point, then join them into one key per row.
for clean_col, raw_col in (
    ('rel_1_clean', 'relation 1'),
    ('rel_2_clean', 'relation 2'),
):
    df[clean_col] = df[raw_col].apply(get_point)

df['rel_combine'] = df.apply(
    lambda row: ' '.join((row['rel_1_clean'], row['rel_2_clean'])),
    axis=1,
)

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed makes the set of
# rows flagged as non-English reproducible across runs.
DetectorFactory.seed = 0

# Index labels of rows whose `source` text is detected as a language other than English.
non_english_rows = []

for index, text in df['source'].items():
    try:
        if detect(text) != 'en':
            non_english_rows.append(index)
    except LangDetectException:
        # Detection can fail (e.g. text with no usable features). Such rows are
        # kept, exactly as before; only langdetect's own error is swallowed so
        # that KeyboardInterrupt and genuine bugs still surface.
        continue

In [26]:
# Drop the rows flagged as non-English and renumber the index from 0.
# NOTE(review): the labels in non_english_rows refer to the index *before* this reset,
# so re-running this cell alone would drop unrelated rows.
is_non_english = df.index.isin(non_english_rows)
df = df.loc[~is_non_english].reset_index(drop=True)

In [27]:
import math

from sklearn.model_selection import train_test_split

# Column whose values are used as stratification labels, and the column handed
# to the splitter as the data to partition.
feature_column = 'rel_combine'
text_column = 'result'


def _split_train_val_test(frame, stratify=False, holdout_size=0.2, seed=42):
    """Split `frame` into train / validation / test parts.

    A first split sets aside `holdout_size` of the rows (stratified on
    `feature_column` when `stratify` is true); the held-out rows are then cut
    in half, without stratification, into validation and test. Each returned
    part keeps the row order of `frame`.

    Args:
        frame: DataFrame with a unique index and the `text_column` /
            `feature_column` columns.
        stratify: Whether the first split is stratified on `feature_column`.
        holdout_size: Fraction of rows set aside for validation + test.
        seed: Random state used for both splits.

    Returns:
        A `(train, val, test)` tuple of DataFrames. When `frame` is too small
        to give every part at least one row, all rows go to `train` and the
        other two parts are empty.
    """
    n_rows = len(frame)
    # Same rounding train_test_split applies to a float test_size.
    n_holdout = math.ceil(holdout_size * n_rows)
    if n_holdout < 2 or n_rows - n_holdout < 1:
        # train_test_split would raise here (empty train or empty val/test side).
        return frame.copy(), frame.iloc[0:0].copy(), frame.iloc[0:0].copy()

    train_text, holdout_text, _, holdout_labels = train_test_split(
        frame[text_column],
        frame[feature_column],
        test_size=holdout_size,
        stratify=frame[feature_column] if stratify else None,
        random_state=seed,
    )
    val_text, test_text, _, _ = train_test_split(
        holdout_text,
        holdout_labels,
        test_size=0.5,
        random_state=seed,
    )

    # Select by index membership so each part preserves the original row order.
    return (
        frame[frame.index.isin(train_text.index)],
        frame[frame.index.isin(val_text.index)],
        frame[frame.index.isin(test_text.index)],
    )


# A relation pair can only be stratified on if it occurs more than once.
split_dict = dict(df.rel_combine.value_counts() > 1)
df['strat_split'] = df['rel_combine'].map(split_dict)
is_stratifiable = df['strat_split'].astype(bool)

# Separate frames: pairs seen at least twice vs. pairs seen exactly once.
df_str = df[is_stratifiable].reset_index(drop=True)
df_nostr = df[~is_stratifiable].reset_index(drop=True)

df_train1, df_val1, df_test1 = _split_train_val_test(df_str, stratify=True)
df_train2, df_val2, df_test2 = _split_train_val_test(df_nostr)

df_train = pd.concat([df_train1, df_train2])
df_test = pd.concat([df_test1, df_test2])
df_val = pd.concat([df_val1, df_val2])

# Every row must land in exactly one split.
assert len(df_train) + len(df_val) + len(df_test) == len(df)


In [28]:
# Combine the two partial splits of each kind into the final frames
# (this repeats the concatenation that ends the previous cell).
df_train, df_test, df_val = (
    pd.concat([first_part, second_part])
    for first_part, second_part in (
        (df_train1, df_train2),
        (df_test1, df_test2),
        (df_val1, df_val2),
    )
)

### Export to JSON

In [29]:
# Sanity check of the training split: displays (number of rows, number of columns).
df_train.shape

(10706, 23)

In [31]:
import json

# Size of the small subsets written to disk.
# The previous `.loc[:1000]` / `.loc[:100]` slices were label-inclusive and
# therefore exported 1001 / 101 / 101 rows; `.head(n)` yields exactly n.
N_TRAIN_ROWS = 1000
N_EVAL_ROWS = 100


def _write_jsonl(frame, path):
    """Write `frame` to `path` as UTF-8 JSON Lines: one JSON object per row.

    Non-ASCII characters are written as-is (`ensure_ascii=False`).
    """
    with open(path, 'w', encoding='utf-8') as jsonl_file:
        for record in frame.to_dict(orient='records'):
            jsonl_file.write(json.dumps(record, ensure_ascii=False))
            jsonl_file.write('\n')


# Only the text pair is exported. Note that these are the *first* rows of each
# split in their existing order, not a random sample.
train_json = df_train[['source', 'target']].head(N_TRAIN_ROWS).reset_index(drop=True)
val_json = df_val[['source', 'target']].head(N_EVAL_ROWS).reset_index(drop=True)
test_json = df_test[['source', 'target']].head(N_EVAL_ROWS).reset_index(drop=True)

data_dir = '../data/modelling/'
# Create the output directory if needed so the export also works on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)

train_file = os.path.join(data_dir, 'strain_v1.json')
val_file = os.path.join(data_dir, 'sval_v1.json')
test_file = os.path.join(data_dir, 'stest_v1.json')

for subset, path in (
    (train_json, train_file),
    (val_json, val_file),
    (test_json, test_file),
):
    _write_jsonl(subset, path)

