In [1]:
import pandas as pd
import random

In [2]:
# Load the group label for each record, keeping only the join key (RecordID)
# and the label itself. dtype=object with na_filter=False reads every field
# as text and leaves empty fields as '' instead of converting them to NaN.
tm = pd.read_csv(
    'Step3Output_with_group.csv',
    dtype=object,
    na_filter=False,
    index_col=False,
    header=0,
    sep=',',
)[['RecordID', 'group']]

In [3]:
# Load the Step 3 metainventory (pipe-delimited) straight from the project
# repository. All fields are read as text; empty fields stay as ''.
metainventory_url = 'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/code/Step3Output.csv'
df = pd.read_csv(
    metainventory_url,
    dtype=object,
    na_filter=False,
    index_col=False,
    header=0,
    sep='|',
)

In [4]:
# Attach each record's group label through the shared RecordID key
# (default inner join), then show the resulting dimensions.
df = df.merge(tm, on='RecordID')
df.shape

(405543, 19)

In [5]:
# Preview the first five merged records (head() defaults to five rows)
df.head()

Unnamed: 0,GroupID,RecordID,SF,SFUI,NormSF,LF,LFUI,NormLF,Source,SFEUI,LFEUI,Type,PrefSF,Score,Count,Frequency,UMLS.CUI,Modified,group
0,,R000001,AA,S003081,aa,achievement age,L037913,achievement age,UMLS,E0000048,E0006859,acronym,,,,,,,0.0
1,,R000002,AA,S003081,aa,Alcoholics Anonymous,L004250,,UMLS,E0000048,E0000204,acronym,,,,,,,60326.0
2,,R000003,AA,S003081,aa,alcohol abuse,L040702,alcohol abuse,UMLS,E0000048,E0356324,acronym,,,,,,,781.0
3,,R000004,AA,S003081,aa,alcohol-abuse,L040752,alcohol abuse,UMLS,E0000048,E0356324,acronym,,,,,,,781.0
4,,R000005,AA,S003081,aa,aortic aneurysm,L045559,aortic aneurysm,UMLS,E0000048,E0009858,acronym,,,,,,,60327.0


In [6]:
# Sort so that records sharing a group label and short form (SFUI) are adjacent
df = df.sort_values(by=['group', 'SFUI'])
df = df.reset_index(drop=True)

# Assign group ID
# A row opens a new group when either
#   * its group label is '0.0' (every such record gets an ID of its own), or
#   * its (group, SFUI) pair differs from the previous row's pair.
# The first row always opens a group: shift() leaves nothing to compare it
# against, so the equality test is False there. The running count of opened
# groups is the 1-based group number.
#
# This is computed on whole columns and written back with df[...] = ...
# The chained form df['GroupID'].iat[i] = value is not guaranteed to write
# through to df (under pandas Copy-on-Write it never does), and looping over
# every row with iterrows() is needlessly slow for a frame of this size.
unclustered = df['group'].eq('0.0')
same_as_previous = (df['group'].eq(df['group'].shift())
                    & df['SFUI'].eq(df['SFUI'].shift()))
df['GroupID'] = (unclustered | ~same_as_previous).cumsum()

In [7]:
# Render each numeric group number as 'G' followed by six zero-padded digits
group_label = 'G{:06}'.format
df['GroupID'] = df['GroupID'].map(group_label)
df.head(5)

Unnamed: 0,GroupID,RecordID,SF,SFUI,NormSF,LF,LFUI,NormLF,Source,SFEUI,LFEUI,Type,PrefSF,Score,Count,Frequency,UMLS.CUI,Modified,group
0,G000001,R294485,$Can,S000001,_can,Canadian dollars,L008588,,ADAM,,,,$Can,0.8365,18.0,,,,0.0
1,G000002,R389142,%,S000002,_,percent,L125893,percent,Berman,,,,,,,,,,0.0
2,G000003,R294486,(*)NO,S000004,___no,Nitric oxide,L024705,nitric oxide,ADAM,,,,(*)NO,0.8715,23.0,,,,0.0
3,G000004,R294487,(*)OH,S000005,___oh,hydroxyl radical,L091545,hydroxyl radical,ADAM,,,,(*)OH,0.6386,15.0,,,,0.0
4,G000005,R294488,(+)-MCPG,S000006,____mcpg,(+)-alpha-methyl-4-carboxyphenylglycine,L000005,,ADAM,,,,(+)-MCPG,0.9,10.0,,,,0.0


In [None]:
### Create subset for cross-mapping review

# Sort so the rows of each group are contiguous and ordered by source
df = df.sort_values(by=['GroupID', 'Source'])
df = df.reset_index(drop=True)

# Remove groups with one entry (records whose group label is "0.0")
# NOTE(review): a non-"0.0" label that spans several SFUIs may still yield
# single-record GroupIDs; those are kept here, as before - confirm intent.
temp = df[df['group'] != "0.0"].reset_index(drop=True)

# Remove groups with only UMLS
# Evaluate every group with one aggregate. The previous row-by-row scan only
# tested a group when the following group began, so the final group in the
# frame was never checked and could slip through even if it was UMLS-only.
umls_only = temp['Source'].eq('UMLS').groupby(temp['GroupID']).all()
remove = [group_id for group_id, flag in umls_only.items() if flag]
temp = temp[~temp['GroupID'].isin(remove)]

# Get random subsample (5% of the remaining groups)
# random.sample() requires a sequence: passing a set raises TypeError on
# Python 3.11+. Sorting also fixes the population order, which for a set of
# strings varies between interpreter runs, so the seed now reproduces the
# same sample every time.
groups = sorted(temp['GroupID'].unique())
n = int(len(groups) * 0.05)
print(n)
random.seed(42)
groups = random.sample(groups, n)
temp = temp[temp['GroupID'].isin(groups)]

# Drop extra columns
temp = temp.drop(columns=['SFEUI', 'Type', 'PrefSF', 'Score', 'Count', 'Frequency',
                          'UMLS.CUI', 'NormSF', 'NormLF', 'Modified', 'group'])

# Add Column
# assign() returns a new frame, avoiding a write into a filtered slice
# (the source of SettingWithCopyWarning). Every row starts marked "1".
temp = temp.assign(Good="1")

# Export subset
temp.to_csv('GroupReview.csv',
            index=False,
            header=True,
            sep='|')

In [8]:
# Restore RecordID order and renumber the index from zero
df = df.sort_values(by=['RecordID']).reset_index(drop=True)
df.head(5)

Unnamed: 0,GroupID,RecordID,SF,SFUI,NormSF,LF,LFUI,NormLF,Source,SFEUI,LFEUI,Type,PrefSF,Score,Count,Frequency,UMLS.CUI,Modified,group
0,G002754,R000001,AA,S003081,aa,achievement age,L037913,achievement age,UMLS,E0000048,E0006859,acronym,,,,,,,0.0
1,G159189,R000002,AA,S003081,aa,Alcoholics Anonymous,L004250,,UMLS,E0000048,E0000204,acronym,,,,,,,60326.0
2,G178751,R000003,AA,S003081,aa,alcohol abuse,L040702,alcohol abuse,UMLS,E0000048,E0356324,acronym,,,,,,,781.0
3,G178751,R000004,AA,S003081,aa,alcohol-abuse,L040752,alcohol abuse,UMLS,E0000048,E0356324,acronym,,,,,,,781.0
4,G159190,R000005,AA,S003081,aa,aortic aneurysm,L045559,aortic aneurysm,UMLS,E0000048,E0009858,acronym,,,,,,,60327.0


In [9]:
# The raw group label is no longer needed once GroupID exists; remove it
# and show the resulting dimensions.
df = df.drop(columns='group')
df.shape

(405543, 18)

In [10]:
# Write the Step 4 result as a pipe-delimited file with a header row and
# without the positional index.
df.to_csv(
    'Step4Output.csv',
    sep='|',
    header=True,
    index=False,
)

In [11]:
# Write the same frame, with all columns, as the auxiliary metainventory
# release file (pipe-delimited, header row, no index).
df.to_csv(
    'MetainventoryAuxiliary_Version1.0.csv',
    sep='|',
    header=True,
    index=False,
)

In [12]:
# Remove the columns that are only shipped in the auxiliary file, then show
# the resulting dimensions.
auxiliary_only = ['SFEUI', 'LFEUI', 'Type', 'PrefSF', 'Score', 'Count', 'Frequency', 'UMLS.CUI']
df = df.drop(columns=auxiliary_only)
df.shape

(405543, 10)

In [13]:
# Show a single record to confirm the final column set
df.head(n=1)

Unnamed: 0,GroupID,RecordID,SF,SFUI,NormSF,LF,LFUI,NormLF,Source,Modified
0,G002754,R000001,AA,S003081,aa,achievement age,L037913,achievement age,UMLS,


In [14]:
# Write the trimmed frame as the main metainventory release file
# (pipe-delimited, header row, no index).
df.to_csv(
    'Metainventory_Version1.0.csv',
    sep='|',
    header=True,
    index=False,
)