# Statistical Analysis for errors in 100 Stable Cochrane Reviews

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("100_Stable_Reviews_Yogeshwar.xlsx", encoding = "latin1")

In [3]:
df.head()

Unnamed: 0,Study ID,Study Name,Title,Alternative Title,Authors,Type,Reference ID,ID Matched,Reference ID Type,Reference ID Other Type,Published Journal,Year,Volume,Page,Cochrane Review
0,STD-Acuna-1981,Acuna 1981,Ketoconazole prophylaxis of fungal infections ...,,"Acuna G, Winston DJ, Young LS",CONFERENCE_PROC,3405716,NOT_FOUND;INVALID_JOURNAL,OTHER,CRSREF,Program and Abstracts of 22nd Interscience Con...,1981,,abstract 852,CD000026 v. 16.0 Routine versus selective anti...
1,STD-Benhamou-1991a,Benhamou 1991a,Does ketoconazole prevent fungal infection in ...,,"Benhamou E, Hartmann O, Nogues C, Maraninchi D...",JOURNAL_ARTICLE,3405718,2049556,OTHER,CRSREF,Bone Marrow Transplant,1991,7,127-31,CD000026 v. 16.0 Routine versus selective anti...
2,STD-Benhamou-1991a,Benhamou 1991a,Results of a randomized double-blind placebo c...,,"Hartmann O, Benhamou E, Maraninchi D, Valteau ...",JOURNAL_ARTICLE,3405719,NOT_FOUND,OTHER,CRSREF,Bone Marrow Transplant,1990,5 Suppl 2,19,CD000026 v. 16.0 Routine versus selective anti...
3,STD-Brincker-1978,Brincker 1978,Prophylactic treatment with miconazole in pati...,,Brincker H,JOURNAL_ARTICLE,3405721,356523,OTHER,CRSREF,Acta Medica Scandinavica,1978,204,123-8,CD000026 v. 16.0 Routine versus selective anti...
4,STD-Brincker-1983,Brincker 1983,Prevention of mycosis in granulocytopenic pati...,,Brincker H,JOURNAL_ARTICLE,3405723,6308441,OTHER,CRSREF,Mykosen,1983,26,242-7,CD000026 v. 16.0 Routine versus selective anti...


### Number of articles in 100 reviews
There are 1695 articles and each article has 15 attributes as meta data. Out of which , 1099 articles are matched.

In [4]:
df.shape

(1695, 15)

In [6]:
# Separting the matched articles
temp = []
for i in df['ID Matched ']:
    if type(i) == int:
        temp.append(i)

In [8]:
len(temp)

1099

In [9]:
rct_matched = pd.read_csv("rct_scores.csv")

In [43]:
rct_matched.columns = ['ID Matched ', 'year', 'citation-only', 'citation-plus-mesh']

In [57]:
rct_matched.head()

Unnamed: 0,ID Matched,year,citation-only,citation-plus-mesh
0,2049556,1991,0.99736,0.9976
1,356523,1978,0.9383,0.73096
2,1965007,1990,0.95086,0.98848
3,2658574,1989,0.99937,0.99836
4,6087753,1984,0.39503,0.44073


In [58]:
rct_matched[rct_matched['citation-plus-mesh'] > 0.01]

Unnamed: 0,ID Matched,year,citation-only,citation-plus-mesh
0,2049556,1991,0.99736,0.99760
1,356523,1978,0.93830,0.73096
2,1965007,1990,0.95086,0.98848
3,2658574,1989,0.99937,0.99836
4,6087753,1984,0.39503,0.44073
5,7933535,1994,0.56619,0.58298
6,7703926,1994,0.56669,0.52547
7,8131635,1994,0.98894,0.99359
8,8182012,1994,0.98684,0.99082
9,1542320,1992,0.99966,0.99979


In [44]:
rct_matched.shape

(1002, 4)

In [46]:
rct_matched['ID Matched ']

0        2049556
1         356523
2        1965007
3        2658574
4        6087753
5        7933535
6        7703926
7        8131635
8        8182012
9        1542320
10       3548626
11       6304203
12      12673406
13      10197802
14       9669812
15      10064240
16      10671332
17       1588260
18      16766594
19       1569339
20       7985709
21      11102422
22      10064252
23       7561177
24      10979947
25       7769290
26      17580253
27       7411865
28       8136741
29       8124207
          ...   
972      6293073
973      1503915
974      8318411
975      1098764
976       324570
977       367510
978       444406
979      3513684
980      2920133
981      3986070
982      8471588
983      7672875
984      3287907
985      3287907
986     17692717
987     15333415
988     15200753
989     12036167
990     12638399
991     15502051
992      9220215
993     15220020
994     15922815
995     15881486
996     16855076
997     10868557
998     15763605
999      82224

In [47]:
matched_df = df[df['ID Matched '].isin(rct_matched['ID Matched '])]

In [71]:
not_matched = list(set(temp) - set(rct_matched['ID Matched ']))

In [74]:
len(not_matched)

76

In [77]:
unmatched_df = df[df['ID Matched '].isin(not_matched)]

In [54]:
matched_df.head()

Unnamed: 0,Study ID,Study Name,Title,Alternative Title,Authors,Type,Reference ID,ID Matched,Reference ID Type,Reference ID Other Type,Published Journal,Year,Volume,Page,Cochrane Review
1,STD-Benhamou-1991a,Benhamou 1991a,Does ketoconazole prevent fungal infection in ...,,"Benhamou E, Hartmann O, Nogues C, Maraninchi D...",JOURNAL_ARTICLE,3405718,2049556,OTHER,CRSREF,Bone Marrow Transplant,1991,7,127-31,CD000026 v. 16.0 Routine versus selective anti...
3,STD-Brincker-1978,Brincker 1978,Prophylactic treatment with miconazole in pati...,,Brincker H,JOURNAL_ARTICLE,3405721,356523,OTHER,CRSREF,Acta Medica Scandinavica,1978,204,123-8,CD000026 v. 16.0 Routine versus selective anti...
5,STD-Caselli-1990,Caselli 1990,Antifungal chemoprophylaxis in cancer children...,,"Caselli D, Arico M, Michelone G, Cavanna C, Ne...",JOURNAL_ARTICLE,3405725,1965007,OTHER,CRSREF,Microbiologica,1990,13,347-51,CD000026 v. 16.0 Routine versus selective anti...
6,STD-EORTC-1989,EORTC 1989,Empiric antifungal therapy in febrile granuloc...,,EORTC International Antimicrobial Therapy Coop...,JOURNAL_ARTICLE,3405727,2658574,OTHER,CRSREF,The American Journal of Medicine,1989,86,668-72,CD000026 v. 16.0 Routine versus selective anti...
7,STD-Estey-1984,Estey 1984,Infection prophylaxis in acute leukemia. Compa...,,"Estey E, Maksymiuk A, Smith T, Fainstein V, Ke...",JOURNAL_ARTICLE,3405729,6087753,OTHER,CRSREF,Archives of Internal Medicine,1984,144,1562-8,CD000026 v. 16.0 Routine versus selective anti...


In [78]:
unmatched_df.head()

Unnamed: 0,Study ID,Study Name,Title,Alternative Title,Authors,Type,Reference ID,ID Matched,Reference ID Type,Reference ID Other Type,Published Journal,Year,Volume,Page,Cochrane Review
4,STD-Brincker-1983,Brincker 1983,Prevention of mycosis in granulocytopenic pati...,,Brincker H,JOURNAL_ARTICLE,3405723.0,6308441,OTHER,CRSREF,Mykosen,1983,26,242-7,CD000026 v. 16.0 Routine versus selective anti...
32,STD-Pizzo-1982,Pizzo 1982,Empiric antibiotic and antifungal therapy for ...,,"Pizzo PA, Robichaud KJ, Gill FA, Witebsky FG",JOURNAL_ARTICLE,3405768.0,7058815,OTHER,CRSREF,The American Journal of Medicine,1982,72,101-11,CD000026 v. 16.0 Routine versus selective anti...
45,STD-Tollemar-1993,Tollemar 1993,Fungal prophylaxis with AmBisome in liver and ...,,"Tollemar J, Hockerstedt K, Ericzon BG, Sundber...",JOURNAL_ARTICLE,3405788.0,8030161,OTHER,CRSREF,Transplantation Proceedings,1994,26,1833,CD000026 v. 16.0 Routine versus selective anti...
47,STD-Tollemar-1993,Tollemar 1993,Prophylactic use of liposomal amphotericin B (...,,"Tollemar J, Ringden O, Andersson S, Sundberg B...",JOURNAL_ARTICLE,3405790.0,8442163,OTHER,CRSREF,Transplantation Proceedings,1993,25,1495-7,CD000026 v. 16.0 Routine versus selective anti...
67,STD-Dabis-AN_x002c_-IP_x002c_-PNm,"Dabis AN, IP, PNm",Prevention of mother-to-child transmission of ...,,"Msellati P, Ramon R, Viho I et al",JOURNAL_ARTICLE,,9677183,,,AIDS,1998,12,1257-8,CD000102 v. 12.0 Interventions for reducing th...


In [48]:
matched_df.shape

(1002, 15)

In [79]:
unmatched_df.shape

(97, 15)

In [82]:
unmatched_df['ID Matched '].value_counts()

4867932     5
4873222     5
13969123    5
5643387     5
9708404     4
8977514     2
9525400     2
2936373     1
15138641    1
4616883     1
13884081    1
1191165     1
10224440    1
7454128     1
17656293    1
2220844     1
4892340     1
13907771    1
14575938    1
8067290     1
14045220    1
9677183     1
4891424     1
727035      1
5477261     1
2490035     1
14575944    1
10235147    1
7988613     1
8009608     1
           ..
21090384    1
5338474     1
7084265     1
8442163     1
4158171     1
10402419    1
9729145     1
812666      1
4861208     1
14476668    1
4681896     1
3254707     1
4189785     1
4100347     1
4809808     1
14226395    1
8656837     1
16682694    1
4314823     1
376136      1
14575941    1
8030161     1
13892318    1
4177491     1
3904340     1
7058815     1
6308441     1
2197628     1
801115      1
5845249     1
Name: ID Matched , Length: 76, dtype: int64

In [53]:
rct_matched['ID Matched '].tolist() == matched_df['ID Matched '].tolist()

True

In [50]:
matched_df1 = pd.merge(matched_df, rct_matched)

In [51]:
matched_df1.shape

(1226, 18)

In [55]:
matched_df.to_csv("Matched_metadata.csv")

In [56]:
matched_df1.to_csv("Matched_metadata1.csv")

In [60]:
matched_group = pd.read_csv("Matched_metadata.csv")

In [61]:
matched_group.head()

Unnamed: 0,Study ID,Study Name,Title,Alternative Title,Authors,Type,Reference ID,ID Matched,Reference ID Type,Reference ID Other Type,Published Journal,Year,Volume,Page,Cochrane Review,year,citation-only,citation-plus-mesh
0,STD-Litkowski-2005,Litkowski 2005,Analgesic efficacy and tolerability of oxycodo...,,"Litkowski LJ, Christensen SE, Adamson DN, Van ...",JOURNAL_ARTICLE,2736006.0,15922815,OTHER,CRSREF,Clinical Therapeutics,2005,27,418-29,CD002763 v. 11.0 Single dose oral oxycodone an...,2005,0.99998,0.99997
1,STD-Chapman-2002,Chapman 2002,The addition of salmeterol 50 mcg bid to antic...,,"Chapman K, Arvidsson P, Chuchalin A, Dhillon D...",JOURNAL_ARTICLE,,12068339,,,Canadian Respiratory Journal,2002,9,178-85,CD001104 v. 7.0 Long-acting beta2-agonists for...,2002,0.99988,0.99995
2,STD-Lazarus-2001,Lazarus 2001,Long-acting ß2-agonist monotherapy versus cont...,,"Lazarus SC, Boushey H, Fahy JV, Chinchilli VM,...",JOURNAL_ARTICLE,,11368732,,,Journal of the American Medical Association,2001,285,2583-93,CD001385 v. 9.0 Long-acting beta2-agonists for...,2001,0.99992,0.99992
3,STD-EACG-017,EACG 017,Zidovudine twice daily in asymptomatic subject...,,"Mulder JW, Cooper DA, Mathiesen L, Sandstrom E...",JOURNAL_ARTICLE,,7913326,,,AIDS,1994,8,313-321,CD002039 v. 5.0 Immediate versus deferred zido...,1994,0.9996,0.99992
4,STD-Winston-1993,Winston 1993,Fluconazole prophylaxis of fungal infections i...,,"Winston DJ, Chandrasekar PH, Lazarus HM, Goodm...",JOURNAL_ARTICLE,3405798.0,8442620,OTHER,CRSREF,Annals of Internal Medicine,1993,118,495-503,CD000026 v. 16.0 Routine versus selective anti...,1993,0.99991,0.99991


In [83]:
unmatched_df.to_csv("unmatched_group.csv")

In [62]:
matched_group2 = matched_group[matched_group['citation-plus-mesh'] > 0.01]

In [84]:
matched_group2.to_csv("matched_group2.csv")

In [63]:
matched_group3 = matched_group[matched_group['citation-plus-mesh'] <= 0.01]

In [85]:
matched_group3.to_csv("matched_group3.csv")

In [6]:
df1 = df[['Cochrane Review', 'Study ID', 'ID Matched ']]

In [7]:
df1.head()

Unnamed: 0,Cochrane Review,Study ID,ID Matched
0,CD000026 v. 16.0 Routine versus selective anti...,STD-Acuna-1981,NOT_FOUND;INVALID_JOURNAL
1,CD000026 v. 16.0 Routine versus selective anti...,STD-Benhamou-1991a,2049556
2,CD000026 v. 16.0 Routine versus selective anti...,STD-Benhamou-1991a,NOT_FOUND
3,CD000026 v. 16.0 Routine versus selective anti...,STD-Brincker-1978,356523
4,CD000026 v. 16.0 Routine versus selective anti...,STD-Brincker-1983,6308441


In [8]:
df1.shape

(1695, 3)

In [9]:
df1.shape[0]

1695

In [11]:
df1.loc[1, 'ID Matched ']

2049556

In [12]:
for i in range(df1.shape[0]):
    if type(df1.loc[i, "ID Matched "]) == int:
        df1.loc[i, "ID Matched "] = 'MATCHED'
    if "AMBIGUOUS" in df1.loc[i, "ID Matched "]:
        df1.loc[i, "ID Matched "] = 'AMBIGUOUS'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [13]:
df1.head()

Unnamed: 0,Cochrane Review,Study ID,ID Matched
0,CD000026 v. 16.0 Routine versus selective anti...,STD-Acuna-1981,NOT_FOUND;INVALID_JOURNAL
1,CD000026 v. 16.0 Routine versus selective anti...,STD-Benhamou-1991a,MATCHED
2,CD000026 v. 16.0 Routine versus selective anti...,STD-Benhamou-1991a,NOT_FOUND
3,CD000026 v. 16.0 Routine versus selective anti...,STD-Brincker-1978,MATCHED
4,CD000026 v. 16.0 Routine versus selective anti...,STD-Brincker-1983,MATCHED


In [14]:
df1.shape

(1695, 3)

In [15]:
df1['ID Matched '].value_counts()

MATCHED                      1099
NOT_FOUND;INVALID_JOURNAL     333
NOT_FOUND                     259
AMBIGUOUS                       4
Name: ID Matched , dtype: int64

In [41]:
df2 = df1.groupby(['Cochrane Review', 'Study ID', 'ID Matched ']).size().reset_index()

In [42]:
df2.shape

(1287, 4)

In [44]:
df2.columns = ['Cochrane Review', 'Study ID', 'ID Matched','Count']

In [45]:
df3 = df2.pivot_table(index = ['Cochrane Review', 'Study ID'], columns = 'ID Matched', values = 'Count')

In [50]:
df3 = df3.fillna(0)

In [51]:
df3.head()

Unnamed: 0_level_0,ID Matched,AMBIGUOUS,MATCHED,NOT_FOUND,NOT_FOUND;INVALID_JOURNAL
Cochrane Review,Study ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Acuna-1981,0.0,0.0,0.0,1.0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Benhamou-1991a,0.0,1.0,1.0,0.0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Brincker-1978,0.0,1.0,0.0,0.0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Brincker-1983,0.0,1.0,0.0,0.0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Caselli-1990,0.0,1.0,0.0,0.0


In [62]:
print("Total number of Ambiguous errros are:" ,df3['AMBIGUOUS'].sum())
print("Total number of Not_Found errros are:" ,df3['NOT_FOUND'].sum())
print("Total number of Invalid Journal errros are:" ,df3['NOT_FOUND;INVALID_JOURNAL'].sum())
print("Total number of Matched articles are:" ,df3['MATCHED'].sum())

Total number of Ambiguous errros are: 4
Total number of Not_Found errros are: 259
Total number of Invalid Journal errros are: 333
Total number of Matched articles are: 1099


In [60]:
cols = ['AMBIGUOUS','NOT_FOUND','NOT_FOUND;INVALID_JOURNAL','MATCHED']
df3[cols] = df3[cols].applymap(np.int64)

In [61]:
df3.head()

Unnamed: 0_level_0,ID Matched,AMBIGUOUS,MATCHED,NOT_FOUND,NOT_FOUND;INVALID_JOURNAL
Cochrane Review,Study ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Acuna-1981,0,0,0,1
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Benhamou-1991a,0,1,1,0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Brincker-1978,0,1,0,0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Brincker-1983,0,1,0,0
CD000026 v. 16.0 Routine versus selective antifungal administration for control of fungal infections in patients with.rm5,STD-Caselli-1990,0,1,0,0


In [63]:
df3.to_csv("Error Mapping.csv")