# Annotate the leftover variants from Rafique table using PubMed 

In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import re
import json
import time
import ast

## 1. Select the unannotated variants from the annotated Rafique table

In [2]:
Rafique_annotated = pd.read_csv('Rafique_with_rs.csv', 
                                converters={i: str for i in range(11)}, low_memory=False)
Rafique_annotated

Unnamed: 0,Gene,Nucleotide position,Protein position,Accession number,Gnom AD frequency,Country,Publication Year,Reference,ACMG\n (Intervar)*,transcript_stable_id,ref_protein_seq,NM_acc,DNA_coords,ensembl_id
0,GCK,c.908G>T,p.Arg303Leu,NM_001354800.1\n NP_001341729.1\n (25921421),,Greece,2015,-1,LP,,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_001354800.1,NC_000007.14:g.44146574C>A,rs1312678560
1,GCK,c.748C>T,p.Arg250Cys,NP_000153.1\n (17204055),,Serbia,2006,-2,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
2,GCK,c.182G>A,p.T61I,NP_000153.1\n (8433729),,Spain,2000,-3,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
3,GCK,c.358C>T,p.A120T,,,Spain,,-3,VUS,,,,,
4,GCK,c.238delT,M238fsdelT,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1012,HNF1A,c.1512C>A,p.Ser504Arg,NM_000545.6\n NP_000536.5,7.97E-06,China,2020,-238,VUS,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120999278C>A,rs944413465
1013,HNF1A,c.956-1G>C,,NM_000545.6\n NP_000536.5,,,,-238,,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996261G>C,
1014,HNF1A,c. 347C>T,p.Ala116Val,NM_000545.6\n NP_000536.5\n (11315828),3.98E-06,,,-238,VUS,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120988853C>T,rs752886203
1015,HNF1A,c.1192C>G,p.Gln398Glu,NM_000545.6\n NP_000536.5,,,,-238,VUS,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996625C>G,


In [3]:
ids_without_nan = []
for item in Rafique_annotated['ensembl_id']:
    if str(item).startswith('rs'): 
        ids_without_nan.append(item)
    else: 
        item = ''
        ids_without_nan.append(item)  
Rafique_annotated['ensembl_id'] = ids_without_nan

In [4]:
leftover_var_Rafique = Rafique_annotated[Rafique_annotated['ensembl_id'] == ''].reset_index(drop=True)
leftover_var_Rafique

Unnamed: 0,Gene,Nucleotide position,Protein position,Accession number,Gnom AD frequency,Country,Publication Year,Reference,ACMG\n (Intervar)*,transcript_stable_id,ref_protein_seq,NM_acc,DNA_coords,ensembl_id
0,GCK,c.748C>T,p.Arg250Cys,NP_000153.1\n (17204055),,Serbia,2006,-2,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
1,GCK,c.182G>A,p.T61I,NP_000153.1\n (8433729),,Spain,2000,-3,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
2,GCK,c.358C>T,p.A120T,,,Spain,,-3,VUS,,,,,
3,GCK,c.238delT,M238fsdelT,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
4,GCK,c.226deltinsAA,V226fsdelTinsAA,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,ABCC8,,p.Arg826Trp,NP_001274103.1,,Argentina,2016,-236,,,MPLAFCGSENHSAAYRVDQGVLNNGCFVDALNVVPHVFLLFITFPI...,NM_000352.4,,
808,HNF1A,HNF1α ex2-3del mutation,,,,New Zealand,2013,-237,,,,,,
809,HNF1A,c.956-1G>C,,NM_000545.6\n NP_000536.5,,,,-238,,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996261G>C,
810,HNF1A,c.1192C>G,p.Gln398Glu,NM_000545.6\n NP_000536.5,,,,-238,VUS,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996625C>G,


In [5]:
leftover_var_Rafique.to_csv(
    'Rafique_without_rs.csv', index=False, header=True)

In [6]:
#make a list of references to extract PMIDS

ref_list = []
for ref in leftover_var_Rafique['Reference']:
    if '-' in ref:
        norm_ref = ref[1:]
        ref_list.append(norm_ref)
    if '(' in ref:
        for x in re.findall('[0-9]+', ref):
            ref_list.append(x)

left_refs_int = [eval(i) for i in list(set(ref_list))]
left_refs_int

[216,
 113,
 167,
 239,
 62,
 218,
 18,
 203,
 26,
 51,
 235,
 109,
 154,
 64,
 158,
 170,
 238,
 134,
 9,
 68,
 69,
 126,
 159,
 90,
 36,
 201,
 44,
 206,
 31,
 99,
 124,
 49,
 140,
 202,
 219,
 37,
 135,
 20,
 23,
 101,
 112,
 160,
 186,
 5,
 10,
 48,
 81,
 122,
 83,
 130,
 165,
 87,
 116,
 237,
 196,
 151,
 150,
 75,
 146,
 30,
 22,
 106,
 131,
 60,
 27,
 98,
 148,
 115,
 89,
 111,
 125,
 157,
 15,
 217,
 45,
 226,
 228,
 189,
 80,
 136,
 143,
 102,
 2,
 232,
 163,
 199,
 59,
 50,
 149,
 168,
 21,
 25,
 6,
 152,
 107,
 204,
 207,
 220,
 153,
 198,
 33,
 110,
 175,
 225,
 52,
 114,
 139,
 3,
 53,
 28,
 195,
 12,
 178,
 215,
 227,
 94,
 29,
 93,
 16,
 145,
 103,
 187,
 85,
 67,
 61,
 35,
 74,
 97,
 100,
 104,
 182,
 147,
 4,
 14,
 211,
 120,
 77,
 123,
 129,
 171,
 197,
 58,
 223,
 88,
 128,
 166,
 183,
 76,
 55,
 95,
 92,
 156,
 78,
 121,
 46,
 191,
 8,
 144,
 132,
 38,
 188,
 47,
 57,
 108,
 233,
 221,
 7,
 234,
 40,
 200,
 24,
 19,
 172,
 70,
 142,
 72,
 164,
 32,
 63,
 82,
 236,
 

In [7]:
pd.DataFrame(left_refs_int).to_csv(
    'leftover_refs.csv', index=False, header=True)

## 2. Take the bibliography from Rafique and extract paper titles

Take the bibliography from Supplementary 2, it is not the same as in the paper!
Create a dataframe with tites and their numbers in bibliography to match to the numbers in the supplimentary table

In [8]:
bibliography = open(
    'input/references_Rafique.txt').readlines()
clean_bibliography = []
for line in bibliography:
    clean_bibliography.append(line.replace('[', '').replace(']', '').replace('?', '.').replace('!', '.'))
columns = ['number', 'title']
df_data = []
for line in clean_bibliography:
    number = line.split('.')[0]
    title = ' '.join(line.split('.')[2:-3])
    df_data.append([number, title])
number_titles = pd.DataFrame(data=df_data, columns=columns)
number_titles

Unnamed: 0,number,title
0,1,A novel heterozygous mutation in the glucokin...
1,2,Novel glucokinase mutation in a boy with matu...
2,3,Genetic and clinical characterisation of matu...
3,4,MODY 2: mutation identification and molecular...
4,5,Molecular diagnosis of maturity-onset diabete...
...,...,...
234,235,17q12 Deletion Syndrome as a Rare Cause for D...
235,236,Clinical and genetic features of Argentinian ...
236,237,Primary hepatocellular neoplasms in a MODY3 f...
237,238,High Prevalence of a Monogenic Cause in Han C...


In [9]:
number_titles.to_csv(
    'whole_pipeline_311022/bibliography_df.csv', index=False, header=True)

Query PubMed API to get PMIDs based og the titles

In [10]:
result_dict = {'number':[], 'PMID':[]}
#results_list = []
db = 'pubmed'
domain = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'
nresults = 10
for index,row in number_titles.iterrows():
    query = row['title']
    number = row['number'] #pass the numbers to the results to know where is what
    retmode='json'
    # standard query
    queryLinkSearch = f'{domain}/esearch.fcgi?db={db}&retmax={nresults}&retmode={retmode}&term={query}'
    response = requests.get(queryLinkSearch)
    
    #extract the idlists and add them to the dataframe along with the numbers
    result_json = response.json()
    
    PMID = result_json['esearchresult']['idlist']   
    result_dict['number'].append(number)
    result_dict['PMID'].append(PMID)

    time.sleep(1)
    print(number)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239


In [11]:
PMIDs = pd.DataFrame(result_dict)
PMIDs['title'] = number_titles['title']
PMIDs

Unnamed: 0,number,PMID,title
0,1,[25921421],A novel heterozygous mutation in the glucokin...
1,2,[],Novel glucokinase mutation in a boy with matu...
2,3,[10754480],Genetic and clinical characterisation of matu...
3,4,[],MODY 2: mutation identification and molecular...
4,5,"[33565752, 27256595, 26669242, 26226118]",Molecular diagnosis of maturity-onset diabete...
...,...,...,...
234,235,[30032214],17q12 Deletion Syndrome as a Rare Cause for D...
235,236,[27329029],Clinical and genetic features of Argentinian ...
236,237,[],Primary hepatocellular neoplasms in a MODY3 f...
237,238,[],High Prevalence of a Monogenic Cause in Han C...


In [12]:
PMIDs.to_csv('PMIDs_queried.csv', index=False, header=True)

Unfortunately, a lot of IDs have not been sucsessfully fetched, so one needs to look them up and add them manually.

In [13]:
PMIDs_curated = pd.read_csv('input/PMIDs_curated.csv')
justIDs = []
for i in PMIDs_curated['PMID']:
    justIDs.append(i[2:-2])
PMIDs_curated['PMID'] = justIDs
PMIDs_curated

Unnamed: 0,number,PMID,title
0,1,25921421,A novel heterozygous mutation in the glucokina...
1,2,19069349,Novel glucokinase mutation in a boy with matur...
2,3,10754480,Genetic and clinical characterisation of matur...
3,4,23085272,MODY 2: mutation identification and molecular ...
4,5,26226118,Molecular diagnosis of maturity-onset diabetes...
...,...,...,...
234,235,30032214,17q12 Deletion Syndrome as a Rare Cause for Di...
235,236,27329029,Clinical and genetic features of Argentinian c...
236,237,23707370,Primary hepatocellular neoplasms in a MODY3 fa...
237,238,31658956,High Prevalence of a Monogenic Cause in Han Ch...


In [14]:
# filter PMIDs to the ones left over from the mapping
leftover_IDs = PMIDs_curated.query('number in @left_refs_int').reset_index(drop=True)
leftover_IDs

Unnamed: 0,number,PMID,title
0,2,19069349,Novel glucokinase mutation in a boy with matur...
1,3,10754480,Genetic and clinical characterisation of matur...
2,4,23085272,MODY 2: mutation identification and molecular ...
3,5,26226118,Molecular diagnosis of maturity-onset diabetes...
4,6,28726111,Glucokinase mutations in pediatric patients wi...
...,...,...,...
203,235,30032214,17q12 Deletion Syndrome as a Rare Cause for Di...
204,236,27329029,Clinical and genetic features of Argentinian c...
205,237,23707370,Primary hepatocellular neoplasms in a MODY3 fa...
206,238,31658956,High Prevalence of a Monogenic Cause in Han Ch...


Fetch variants with Ensembl API: takes PMID and returns rs identifiers of the variants reported in those papers

In [15]:
bad_IDs = []
PMID_mapping = []
passed_IDs = []

In [16]:
server = "https://rest.ensembl.org"

for ID in leftover_IDs['PMID']:
    if ID not in passed_IDs:
        ext = "/variation/human/pmid/" + str(ID) + "?"
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        if not r.ok:
            print("bad" + str(ID))
            bad_IDs.append(ID)
            continue
        var_decoded = r.json()
        rs_list = []
        for variant in var_decoded:
            rs_list.append(variant['name'])
        PMID_mapping.append({'PMID': ID, 'rs': rs_list})
        time.sleep(1)
        print(ID)

19069349
10754480
23085272
26226118
28726111
27634015
bad27289208
29056535
20132997
24735133
22335469
22332836
17937063
31604004
17573900
23433541
28012402
21978167
10694920
23624530
15305805
bad14633836
25555642
19410318
16632067
15928245
8168652
24804978
bad15216446
10447526
25174781
30257192
bad20015564
28575730
1303265
12955723
28323911
18811724
20337973
27185633
12050210
bad29408271
18399931
bad2817007
26669242
bad24405491
30977832
30447144
23295287
15841481
9049484
bad30086370
31216263
29207974
22611063
27256595
22761713
26059258
31595705
bad31028668
bad19411616
30259503
19564454
18271687
bad30534894
bad9713013
bad23724189
23771925
25082184
31576961
16731834
16059790
bad20587714
27271189
18382660
22493702
30592380
bad24355479
29510678
30455330
21168233
19929997
25041077
bad26050565
9754819
18513305
bad10980542
28395978
28862987
15031772
24905847
bad23679181
bad10872540
27913849
23616187
12574234
bad23009393
bad18433912
bad27398945
21683639
9075818
bad15001636
9313763
27810688
bad

In [17]:
PMID_mapping_df = pd.DataFrame(PMID_mapping)
PMID_mapping_df

Unnamed: 0,PMID,rs
0,19069349,[rs1057524904]
1,10754480,"[rs587776825, rs587780345, rs1172328722]"
2,23085272,"[rs142952813, rs193922329]"
3,26226118,"[rs144425830, rs1372204515]"
4,28726111,"[rs193922300, rs267601516, rs1064793998, rs106..."
...,...,...
124,22060211,"[rs104894009, rs193922480, rs193922287, rs1939..."
125,29927023,"[rs121918675, rs137853236, rs193922311, rs1255..."
126,8433729,"[rs104894006, rs142829768, rs193922272, rs2020..."
127,31658956,"[rs567563179, rs587776825]"


In [18]:
PMID_mapping_df.to_csv(
    'extracted_rs_with_PMIDs_Rafique.csv',
    header=True, index=False)

In [19]:
#list of extracted rs
rs_list = []
for item in PMID_mapping_df['rs']:
    for rs in item:
        rs_list.append(rs)
len(rs_list)

683

Map the extracted variants to the reference Ensembl table

In [20]:
ref_Ens = pd.read_csv(
    'Ens_filtered_all_alleles_location_coord_no_duplicates.csv'
    , converters={'alleles': ast.literal_eval}, low_memory=False)

In [21]:
# Filtering the Ensembl table to only those variants
mapped_variants = ref_Ens.drop_duplicates().query('id in @rs_list').reset_index(drop=True)
mapped_variants

Unnamed: 0,id,seq_region_name,start,end,strand,vf_allele,Location,coordinate,Gene,Transcript,Exon
0,rs113651985,4,6289120.0,6289120.0,1.0,T,4:6289120,4:6289120:C>T,ENSG00000109501,ENST00000503569,ENSE00000701011
1,rs567563179,4,6291208.0,6291208.0,1.0,A,4:6291208,4:6291208:G>A,ENSG00000109501,ENST00000503569,ENSE00003648875
2,rs148953711,4,6291241.0,6291241.0,1.0,A,4:6291241,4:6291241:G>A,ENSG00000109501,ENST00000503569,ENSE00003648875
3,rs774525063,4,6291971.0,6291971.0,1.0,C,4:6291971,4:6291971:T>C,ENSG00000109501,ENST00000503569,ENSE00003689853
4,rs142671083,4,6300919.0,6300919.0,1.0,A,4:6300919,4:6300919:G>A,ENSG00000109501,ENST00000503569,ENSE00002061224
...,...,...,...,...,...,...,...,...,...,...,...
1427,rs779736828,11,17413396.0,17413396.0,1.0,T,11:17413396,11:17413396:G>T,ENSG00000006071,ENST00000526921,ENSE00002195815
1428,rs72559730,11,17461663.0,17461663.0,1.0,T,11:17461663,11:17461663:G>T,ENSG00000006071,ENST00000683253,ENSE00003917065
1429,rs72559730,11,17461663.0,17461663.0,1.0,T,11:17461663,11:17461663:G>T,ENSG00000006071,ENST00000646737,ENSE00003816790
1430,rs72559730,11,17461663.0,17461663.0,1.0,T,11:17461663,11:17461663:G>T,ENSG00000006071,ENST00000684221,ENSE00003919812


In [22]:
# Writing to file specifying that it is the 2nd stage of annotation
mapped_variants.to_csv(
    'Rafique_mapped_to_Ens_2nd.csv',
    header=True, index=False)

## 3. Dealing with bad IDs

Which PMIDs did not return any variants and which references are this

In [23]:
bad_IDs

['27289208',
 '14633836',
 '15216446',
 '20015564',
 '29408271',
 '2817007',
 '24405491',
 '30086370',
 '31028668',
 '19411616',
 '30534894',
 '9713013',
 '23724189',
 '20587714',
 '24355479',
 '26050565',
 '10980542',
 '23679181',
 '10872540',
 '23009393',
 '18433912',
 '27398945',
 '15001636',
 '26822262',
 '29222740',
 '23271932',
 '27142837',
 '293717',
 '21437455',
 '9920109',
 '9243109',
 '30730840',
 '28597946',
 '24299156',
 '31523701',
 '16731861',
 '20003313',
 '26676964',
 '23480312',
 '25367728',
 '28502589',
 '28593362',
 '28680642',
 '29491316',
 '27114981',
 '29633446',
 '25721872',
 '30182532',
 '30734462',
 '30656436',
 '29726111',
 '28436541',
 '28664602',
 '26773576',
 '31578821',
 '25951767',
 '21263211',
 '28993341',
 '31124255',
 '31066763',
 '29412391',
 '29264522',
 '18268044',
 '27538677',
 '894547',
 '31276222',
 '24447076',
 '29163993',
 '31500578',
 '31197960',
 '31391355',
 '29576871',
 '11078452',
 '22587559',
 '17828387',
 '19417042',
 '30032214',
 '27329

In [24]:
bad_ref = []
for index,row in leftover_IDs.iterrows():
    if row['PMID'] in bad_IDs:
        bad_ref.append(row['number'])
bad_ref

[8,
 24,
 31,
 36,
 48,
 50,
 52,
 60,
 69,
 70,
 74,
 75,
 76,
 82,
 88,
 94,
 97,
 102,
 103,
 108,
 109,
 110,
 113,
 116,
 126,
 128,
 132,
 133,
 134,
 145,
 147,
 150,
 152,
 153,
 158,
 162,
 163,
 166,
 167,
 168,
 169,
 170,
 172,
 175,
 178,
 181,
 182,
 183,
 186,
 187,
 188,
 191,
 195,
 196,
 197,
 198,
 200,
 201,
 204,
 207,
 209,
 211,
 213,
 215,
 217,
 220,
 221,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237]

In [25]:
leftover_var_Rafique

Unnamed: 0,Gene,Nucleotide position,Protein position,Accession number,Gnom AD frequency,Country,Publication Year,Reference,ACMG\n (Intervar)*,transcript_stable_id,ref_protein_seq,NM_acc,DNA_coords,ensembl_id
0,GCK,c.748C>T,p.Arg250Cys,NP_000153.1\n (17204055),,Serbia,2006,-2,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
1,GCK,c.182G>A,p.T61I,NP_000153.1\n (8433729),,Spain,2000,-3,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
2,GCK,c.358C>T,p.A120T,,,Spain,,-3,VUS,,,,,
3,GCK,c.238delT,M238fsdelT,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
4,GCK,c.226deltinsAA,V226fsdelTinsAA,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,ABCC8,,p.Arg826Trp,NP_001274103.1,,Argentina,2016,-236,,,MPLAFCGSENHSAAYRVDQGVLNNGCFVDALNVVPHVFLLFITFPI...,NM_000352.4,,
808,HNF1A,HNF1α ex2-3del mutation,,,,New Zealand,2013,-237,,,,,,
809,HNF1A,c.956-1G>C,,NM_000545.6\n NP_000536.5,,,,-238,,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996261G>C,
810,HNF1A,c.1192C>G,p.Gln398Glu,NM_000545.6\n NP_000536.5,,,,-238,VUS,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996625C>G,


In [26]:
#Add a new column with references as list items
ref_list_new_column = []
for ref in leftover_var_Rafique['Reference']:
    
    if '-' in ref:
        norm_ref = [int(ref[1:])]
        ref_list_new_column.append(norm_ref)
        
    elif '(' in ref:
        ref_list_new_column.append(ref.replace('(', '').replace(')', '').split(', '))
        
    else: ref_list_new_column.append([])
ref_list_new_column

[[2],
 [3],
 [3],
 [3],
 [3],
 [3],
 [4],
 [5],
 [5],
 [5],
 [6],
 [6],
 [6],
 [6],
 [6],
 [6],
 [6],
 [6],
 [6],
 [7],
 [8],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [9],
 [10],
 [12],
 [13],
 [14],
 [14],
 [14],
 [14],
 [15],
 [15],
 [16],
 [17],
 [18],
 ['18', '19'],
 [20],
 [20],
 [20],
 [20],
 [20],
 [21],
 [21],
 [21],
 [22],
 [22],
 [22],
 [22],
 [22],
 [22],
 ['23', '24'],
 ['23', '24'],
 ['23', '24'],
 [25],
 [25],
 [25],
 [25],
 [25],
 [26],
 [26],
 [26],
 [26],
 [26],
 [26],
 [26],
 [27],
 [28],
 [28],
 [28],
 [28],
 [28],
 [29],
 [29],
 [29],
 [29],
 [30],
 [30],
 [30],
 [30],
 [30],
 [30],
 [30],
 [31],
 [31],
 [32],
 [32],
 [32],
 [32],
 [33],
 [33],
 [35],
 [35],
 [35],
 [35],
 [35],
 [36],
 [37],
 [38],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [39],
 [45],
 [45],
 [45],
 [45],
 [45],
 [45],
 [45],


In [27]:
leftover_var_Rafique['Reference_lists'] = ref_list_new_column
leftover_var_Rafique

Unnamed: 0,Gene,Nucleotide position,Protein position,Accession number,Gnom AD frequency,Country,Publication Year,Reference,ACMG\n (Intervar)*,transcript_stable_id,ref_protein_seq,NM_acc,DNA_coords,ensembl_id,Reference_lists
0,GCK,c.748C>T,p.Arg250Cys,NP_000153.1\n (17204055),,Serbia,2006,-2,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,,[2]
1,GCK,c.182G>A,p.T61I,NP_000153.1\n (8433729),,Spain,2000,-3,VUS,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,,[3]
2,GCK,c.358C>T,p.A120T,,,Spain,,-3,VUS,,,,,,[3]
3,GCK,c.238delT,M238fsdelT,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,,[3]
4,GCK,c.226deltinsAA,V226fsdelTinsAA,NP_000153.1,,Spain,,-3,,ENST00000403799.8,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_000162,,,[3]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,ABCC8,,p.Arg826Trp,NP_001274103.1,,Argentina,2016,-236,,,MPLAFCGSENHSAAYRVDQGVLNNGCFVDALNVVPHVFLLFITFPI...,NM_000352.4,,,[236]
808,HNF1A,HNF1α ex2-3del mutation,,,,New Zealand,2013,-237,,,,,,,[237]
809,HNF1A,c.956-1G>C,,NM_000545.6\n NP_000536.5,,,,-238,,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996261G>C,,[238]
810,HNF1A,c.1192C>G,p.Gln398Glu,NM_000545.6\n NP_000536.5,,,,-238,VUS,,MVSKLSQLQTELLAALLESGLSKEALIQALGEPGPYLLAGEGPLDK...,NM_000545.6,NC_000012.12:g.120996625C>G,,[238]


In [28]:
index_list = []
for index, row in leftover_var_Rafique.iterrows():
    for bad_reference in bad_ref:
        if bad_reference in row['Reference_lists']:
            index_list.append(index)
Rafique_var_for_manual_rescue = leftover_var_Rafique[leftover_var_Rafique.index.isin(index_list)].reset_index(drop=True)
Rafique_var_for_manual_rescue

Unnamed: 0,Gene,Nucleotide position,Protein position,Accession number,Gnom AD frequency,Country,Publication Year,Reference,ACMG\n (Intervar)*,transcript_stable_id,ref_protein_seq,NM_acc,DNA_coords,ensembl_id,Reference_lists
0,GCK,c.880_891delinsCATGGCGAGCTGGTGT,p.Gly294HisfsTer179),NM_001354800.1\n NP_001341729.1,,Argentina,2016,-8,,,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_001354800.1,NC_000007.14:g.44146591_44146602delinsACACCAGC...,,[8]
1,GCK,32insC(33)intron3,,,,Germany,2004,-31,,,,,,,[31]
2,GCK,39insC(40)intron3,,,,Germany,,-31,,,,,,,[31]
3,GCK,c.46-15_46-11del nsGGGAGGG,,,,USA,2010,-36,,,,,,,[36]
4,GCK,c.-457C > T,,NM_001354800.1\n NP_001341729.1,3.18E-05,Tunisia,2018,-48,,,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,NM_001354800.1,NC_000007.14:g.44189410G>A,,[48]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,HNF1A,Exon 1-10,,,,UK,,-233,,,,,,,[233]
107,HNF1B,HNF1b deletion,,,,Germany,2009,-234,,,,,,,[234]
108,HNF1B,HNF1b deletion,,,,Germany,2018,-235,,,,,,,[235]
109,ABCC8,,p.Arg826Trp,NP_001274103.1,,Argentina,2016,-236,,,MPLAFCGSENHSAAYRVDQGVLNNGCFVDALNVVPHVFLLFITFPI...,NM_000352.4,,,[236]


In [29]:
Rafique_var_for_manual_rescue.to_csv(
    'Rafique_var_for_manual_rescue.csv',
    header=True, index=False)