In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

## 1. Merge files 

In [None]:
# Merge every quarterly FAERS archive found in `zip_folder` into one DataFrame
# per table type (DRUG, DEMO, THER, REAC, OUTC, INDI, RPSR).
zip_folder = './data'
# Sorted because os.listdir() returns entries in arbitrary order; sorting keeps
# the concatenation order (and thus row order) reproducible between runs.
# The extension check is case-insensitive so '.ZIP' archives are not skipped.
zip_files = sorted(f for f in os.listdir(zip_folder) if f.lower().endswith('.zip'))

# One list of per-archive frames for each table type.
data_frames = {
    'DRUG': [],
    'DEMO': [],
    'THER': [],
    'REAC': [],
    'OUTC': [],
    'INDI': [],
    'RPSR': []
}

# key -> list of (member name, row count), used for the merge statistics table.
file_records = {}

for zip_file in tqdm(zip_files, desc='Processing zip files'):
    with zipfile.ZipFile(os.path.join(zip_folder, zip_file), 'r') as zf:

        # Case-insensitive extension check: the table key is matched against the
        # lowercased name, so the extension must be treated the same way.
        txt_members = [name for name in zf.namelist() if name.lower().endswith('.txt')]

        for key in data_frames.keys():
            relevant_members = [name for name in txt_members if key.lower() in name.lower()]
            if not relevant_members:
                continue

            # Open each member only while it is being parsed so the handle is
            # closed even when read_table raises.
            file_dfs = []
            for member_name in relevant_members:
                with zf.open(member_name, 'r') as member:
                    file_dfs.append(
                        pd.read_table(member, sep='$', low_memory=False, encoding='ISO-8859-1')
                    )
            data_frames[key].append(pd.concat(file_dfs))

            file_records.setdefault(key, []).extend(
                (member_name, len(df)) for member_name, df in zip(relevant_members, file_dfs)
            )

# Fail early with a readable message instead of pandas' "No objects to concatenate".
missing_tables = [key for key, frames in data_frames.items() if not frames]
if missing_tables:
    raise ValueError(
        f"No .txt files found in the archives under {zip_folder!r} for: {', '.join(missing_tables)}"
    )

drug = pd.concat(data_frames['DRUG'], ignore_index=True)
demo = pd.concat(data_frames['DEMO'], ignore_index=True)
ther = pd.concat(data_frames['THER'], ignore_index=True)
reac = pd.concat(data_frames['REAC'], ignore_index=True)
outc = pd.concat(data_frames['OUTC'], ignore_index=True)
indi = pd.concat(data_frames['INDI'], ignore_index=True)
rpsr = pd.concat(data_frames['RPSR'], ignore_index=True)

# Release the per-archive frames; only the merged tables are needed from here on.
del data_frames

In [None]:
# Flatten file_records into one [group, filename, record] row per source file.
table_data = [
    [key, file_name, record_count]
    for key, records in file_records.items()
    for file_name, record_count in records
]

table = pd.DataFrame(table_data, columns=['group', 'filename', 'record'])
#table.to_csv('./merge_statistics.csv',index=False)

In [None]:
# Total number of rows read per table type.
summary_table = table.groupby('group', as_index=False)['record'].sum()
#summary_table

## Delete

In [None]:
# Collect the case IDs listed in the 'deleted/' folder of each archive.
deleted_case_ids = []
# zip file name -> number of IDs read from that archive's deleted files.
deleted_case_counts = {}

for zip_file in tqdm(zip_files, desc='Processing ZIP files'):
    with zipfile.ZipFile(os.path.join(zip_folder, zip_file), 'r') as zf:

        # Look for files under 'deleted/' directly: archives do not always
        # contain an explicit directory entry, and a directory entry itself
        # (name ending in '/') holds no IDs.
        deleted_members = [
            name for name in zf.namelist()
            if name.lower().startswith('deleted/') and not name.endswith('/')
        ]
        if not deleted_members:
            continue

        # Accumulate per archive so the count covers every deleted file in it,
        # not only the last one read.
        zip_case_ids = []
        for member_name in deleted_members:
            with zf.open(member_name, 'r') as txt_file:
                for line in txt_file:
                    # 'utf-8-sig' also strips a leading BOM, which would
                    # otherwise make the later int() conversion fail.
                    case_id = line.decode('utf-8-sig').strip()
                    if case_id:  # skip blank lines
                        zip_case_ids.append(case_id)

        deleted_case_ids.extend(zip_case_ids)
        deleted_case_counts[zip_file] = len(zip_case_ids)


# The same case can be listed in several quarters; keep each ID once.
deleted_case_ids = list(set(deleted_case_ids))

# "\n" (not "\r"): the original "\remove" literal was parsed as a carriage return.
print("\nRemove case ID:")
for zip_file, count in deleted_case_counts.items():
    print(f"ZIP file: {zip_file}, remove case IDs: {count}")

In [None]:
#pickle.dump(deleted_case_ids, open('./deleted_case_ids.pkl', 'wb'))

## 2. Deduplication
### 2-1. Retain the latest report version for each case

In [None]:
# Order by caseid and caseversion, both descending, so the highest version of
# each case comes first; then renumber the rows.
demo = (
    demo
    .sort_values(by=['caseid', 'caseversion'], ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Row count vs. number of distinct cases before dropping older versions.
print(len(demo), demo['caseid'].nunique(), sep='\n')

In [None]:
# Keep the first row of every caseid (rows were sorted so that the highest
# caseversion comes first), then renumber.
demo = demo.drop_duplicates(subset='caseid', keep='first').reset_index(drop=True)

# Both numbers should now be equal: one row per case.
print(len(demo), demo['caseid'].nunique(), sep='\n')

In [None]:
# Bookkeeping: number of distinct caseid values currently in demo.
temp = demo['caseid'].nunique()

### 2-2. Remove erroneous cases

In [None]:
# Drop empty strings, then convert the remaining case IDs to int so they can be
# compared with demo['caseid'].
int_converted_deleted_case_ids = list(map(int, filter(None, deleted_case_ids)))

In [None]:
# Bookkeeping: how many deleted case IDs were collected.
temp = len(int_converted_deleted_case_ids)

In [None]:
# Bookkeeping: how many demo rows belong to a deleted case.
temp = demo[demo['caseid'].isin(int_converted_deleted_case_ids)].shape[0]

In [None]:
# Remove every report whose caseid appears in the deleted-case list.
is_deleted_case = demo['caseid'].isin(int_converted_deleted_case_ids)
demo = demo[~is_deleted_case].reset_index(drop=True)

In [None]:
# Bookkeeping: number of distinct caseid values left after removing deleted cases.
temp = demo['caseid'].nunique()

### 2-3. Deduplication
- Reports are treated as duplicates when they share the same reporting country, gender, event date, age, adverse events, and prescribed drugs.

#### 2-3-1. Age conversion

In [None]:
# Show (rows, columns) of demo before the age conversion.
demo.shape

In [None]:
# Keep only reports whose age parses as a number.
demo = demo[pd.to_numeric(demo['age'], errors='coerce').notnull()].reset_index(drop=True)

# Convert the age to float ONCE and use that everywhere below. The raw column
# can be object dtype (it may hold numeric strings), in which case rounding or
# comparing the raw values would fail or leave strings in age_yr.
age = pd.to_numeric(demo['age'], errors='coerce').astype(float)

# Number of age_cod units per year.
age_unit_divisors = {
    'YR': 1, 'YEAR': 1,
    'MON': 12,
    'WK': 52, 'WEEK': 52,
    'DY': 365, 'DAY': 365,
    'HR': 8760, 'HOUR': 8760,
}
divisor = demo['age_cod'].map(age_unit_divisors)  # NaN for 'DEC' and unknown units

# 'DEC' (decades): values <= 10 are multiplied by 10; larger values are kept
# unchanged, as in the original rule.
is_decade = demo['age_cod'].eq('DEC')
decade_years = np.where(age <= 10, age * 10, age).round(2)

# Every other known unit: divide by units-per-year and round to 2 decimals.
unit_years = (age / divisor).round(2)

# Rows with a missing or unrecognised age_cod fall back to the numeric age as is.
demo['age_yr'] = np.select(
    [is_decade, divisor.notna()],
    [decade_years, unit_years],
    default=age,
)

In [None]:
# Bookkeeping: number of distinct caseid values after the age conversion.
temp = demo['caseid'].nunique()

In [None]:
# Keep only reports with a strictly positive age in years.
has_positive_age = demo['age_yr'] > 0
demo = demo[has_positive_age].reset_index(drop=True)

In [None]:
# Remove age outliers with the 1.5 * IQR rule.
Q1, Q3 = (demo['age_yr'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_bound)
print(upper_bound)

# between() is inclusive on both ends, i.e. lower_bound <= age_yr <= upper_bound.
within_bounds = demo['age_yr'].between(lower_bound, upper_bound)
demo = demo[within_bounds].reset_index(drop=True)

In [None]:
# Bookkeeping: number of distinct caseid values after the age outlier filter.
temp = demo['caseid'].nunique()

In [None]:
# Restrict the other tables to the reports (primaryid) still present in demo.
pid_set = set(demo['primaryid'])

drug = drug.loc[drug['primaryid'].isin(pid_set)].reset_index(drop=True)
ther = ther.loc[ther['primaryid'].isin(pid_set)].reset_index(drop=True)
reac = reac.loc[reac['primaryid'].isin(pid_set)].reset_index(drop=True)
outc = outc.loc[outc['primaryid'].isin(pid_set)].reset_index(drop=True)
indi = indi.loc[indi['primaryid'].isin(pid_set)].reset_index(drop=True)
rpsr = rpsr.loc[rpsr['primaryid'].isin(pid_set)].reset_index(drop=True)

#### 2-3-2. Deduplication
- Reports with the same reporting country, gender, event date, age, adverse events, and prescribed drugs are treated as duplicates; only the one with the highest case ID is kept.

In [None]:
# Give every distinct preferred term (pt) an integer code starting at 1.
unique_pts = reac['pt'].unique()
pt_to_int_dict = dict(zip(unique_pts, range(1, len(unique_pts) + 1)))

def map_pt_to_int(group):
    """Return the integer codes of the terms in ``group`` (0 for unknown terms)."""
    codes = []
    for term in group:
        codes.append(pt_to_int_dict.get(term, 0))
    return codes

# One row per caseid holding the list of its term codes.
reac_grouped = reac.groupby('caseid')['pt'].apply(map_pt_to_int).reset_index()
reac_grouped.columns = ['caseid', 'pt_int_values']

# Deduplicate and sort the codes so equal term sets compare equal.
reac_grouped['pt_int_values'] = [
    sorted(set(codes)) for codes in reac_grouped['pt_int_values']
]

In [None]:
# Use drugname, falling back to prod_ai where drugname is missing; cast to str.
drug['new_drug'] = drug['drugname'].fillna(drug['prod_ai']).astype(str)

# Give every distinct drug label an integer code starting at 1.
unique_drugs = drug['new_drug'].unique()
drugname_to_int_dict = dict(zip(unique_drugs, range(1, len(unique_drugs) + 1)))

def map_drug_to_int(group):
    """Return the integer codes of the drug labels in ``group`` (0 for unknown labels)."""
    codes = []
    for label in group:
        codes.append(drugname_to_int_dict.get(label, 0))
    return codes

# One row per caseid holding the list of its drug codes.
drug_grouped = drug.groupby('caseid')['new_drug'].apply(map_drug_to_int).reset_index()
drug_grouped.columns = ['caseid', 'drug_int_values']

# Deduplicate and sort the codes so equal drug sets compare equal.
drug_grouped['drug_int_values'] = [
    sorted(set(codes)) for codes in drug_grouped['drug_int_values']
]

In [None]:
# Attach each case's drug codes and reaction codes to its demographic fields
# (inner joins on caseid).
df_merged = (
    demo[['primaryid', 'caseid', 'age_yr', 'reporter_country', 'sex', 'event_dt']]
    .merge(drug_grouped, on='caseid')
    .merge(reac_grouped, on='caseid')
)

In [None]:
def list_to_string(lst):
    """Join the items of ``lst`` into a single string separated by ', '."""
    return ', '.join(str(item) for item in lst)

# Lists are not hashable; turn the code lists into strings so they can be used
# as groupby keys.
df_merged['drug_int_values'] = df_merged['drug_int_values'].map(list_to_string)
df_merged['pt_int_values'] = df_merged['pt_int_values'].map(list_to_string)

In [None]:
# Replace missing key values with sentinels so rows with a missing country, sex
# or event date still take part in the duplicate grouping (groupby drops NaN
# keys by default).
# Done as one DataFrame-level fillna with reassignment: calling
# fillna(inplace=True) on df_merged['col'] acts on a temporary Series and does
# not update df_merged when pandas Copy-on-Write is active.
df_merged = df_merged.fillna({
    'reporter_country': 'NAN',
    'sex': 'NAN',
    'event_dt': 99999999,
})

In [None]:
# Collect the caseids sharing each combination of the duplicate-defining fields.
duplicate_key_columns = ['age_yr', 'reporter_country', 'sex', 'event_dt', 'drug_int_values', 'pt_int_values']
grouped_df = df_merged.groupby(duplicate_key_columns)['caseid'].agg(list).reset_index()

# Combinations shared by more than one case are duplicate groups.
has_multiple_cases = grouped_df['caseid'].map(len) > 1
duplicated_combinations = grouped_df[has_multiple_cases]

In [None]:
# Within each duplicate group keep the highest caseid and mark the rest for deletion.
# assign() returns a new frame, which avoids writing a column onto a filtered
# slice of grouped_df (SettingWithCopyWarning).
duplicated_combinations = duplicated_combinations.assign(
    max_caseid=duplicated_combinations['caseid'].apply(max)
)

# Flatten every group's caseids except its maximum into one list.
caseids_to_delete = [
    caseid
    for caseids, max_caseid in zip(
        duplicated_combinations['caseid'], duplicated_combinations['max_caseid']
    )
    for caseid in caseids
    if caseid != max_caseid
]

In [None]:
# Drop the duplicate cases identified above.
is_duplicate_case = demo['caseid'].isin(caseids_to_delete)
demo = demo[~is_duplicate_case].reset_index(drop=True)

In [None]:
# Restrict the other tables to the reports (primaryid) that survived deduplication.
pid_set = set(demo['primaryid'])

drug = drug.loc[drug['primaryid'].isin(pid_set)].reset_index(drop=True)
ther = ther.loc[ther['primaryid'].isin(pid_set)].reset_index(drop=True)
reac = reac.loc[reac['primaryid'].isin(pid_set)].reset_index(drop=True)
outc = outc.loc[outc['primaryid'].isin(pid_set)].reset_index(drop=True)
indi = indi.loc[indi['primaryid'].isin(pid_set)].reset_index(drop=True)
rpsr = rpsr.loc[rpsr['primaryid'].isin(pid_set)].reset_index(drop=True)

In [None]:
# Bookkeeping: number of distinct caseid values left after deduplication.
temp = demo['caseid'].nunique()

### save

In [None]:
# Write each cleaned table to a gzip-compressed CSV (compression is inferred
# from the '.gz' suffix), without the index column.
for frame, path in (
    (demo, './clean_step1_demo.csv.gz'),
    (drug, './clean_step1_drug.csv.gz'),
    (indi, './clean_step1_indi.csv.gz'),
    (outc, './clean_step1_outc.csv.gz'),
    (reac, './clean_step1_reac.csv.gz'),
    (rpsr, './clean_step1_rpsr.csv.gz'),
    (ther, './clean_step1_ther.csv.gz'),
):
    frame.to_csv(path, index=False)

#### key refs: https://github.com/Judenpech/FAERS-data-toolkit/blob/master/faersPreprocess.py