Parsing Medline file

In [None]:
from Bio import Medline
import pandas as pd

In [None]:
import csv

from Bio import Medline

# Export selected MEDLINE fields to a tab-separated table ("table1.csv").
#
# Rows are written with csv.writer rather than hand-built f-strings: a title or
# abstract that contains a tab, a line break or a double quote would otherwise
# shift or merge columns when the file is parsed again. csv.writer quotes such
# fields, and pandas.read_csv understands that quoting by default.
# newline="" is what the csv module requires of the file object, and
# lineterminator="\n" keeps the same line endings the file had before.
# The output is written as UTF-8 because that is what pandas.read_csv assumes.
with open("table1.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter="\t", lineterminator="\n")

    # Header row
    writer.writerow(["Journal", "Title", "Authors", "Year", "Abstract", "Paper type"])

    # NOTE(review): the MEDLINE export is assumed to be UTF-8 encoded - confirm.
    with open("/content/pmc_result.medline", encoding="utf-8") as handle:
        # One row per record yielded by the parser
        for record in Medline.parse(handle):
            # .get() supplies a placeholder when a field is absent
            title = record.get("TI", "No title")
            journal = record.get("JT", "No journal")
            authors = "; ".join(record.get("FAU", ["No authors"]))
            # Keep only the first space-separated token of DP (the year);
            # "N/A" is written when DP is missing
            year = record.get("DP", "N/A").split(" ")[0]
            abstract = record.get("AB", "No abstract")
            # PT is stored via str(), so a list appears as its Python repr,
            # e.g. "['Journal Article', 'Review']"
            paper_type = str(record.get("PT", "No paper type"))

            writer.writerow([journal, title, authors, year, abstract, paper_type])

In [None]:
# Load the tab-separated export and relabel its six columns so that every
# name is a plain identifier ("Paper_type" instead of "Paper type").
df = pd.read_csv("/content/table1.csv", sep="\t").set_axis(
    ["Journal", "Title", "Authors", "Year", "Abstract", "Paper_type"],
    axis="columns",
)
df

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type
0,Clinical Microbiology Reviews,Natural-Product-Based Solutions for Tropical I...,"Adegboye, Oyelola; Field, Matt A.; Kupz, Andre...",,About half of the world’s population and 80% o...,"['Journal Article', 'Review']"
1,BMC Complementary and Alternative Medicine,"Antibacterial activities, proposed mode of act...","Mombeshora, Molly; Mukanganyama, Stanley",2019.0,Background: Pseudomonas aeruginosa has become ...,['Journal Article']
2,Molecules : A Journal of Synthetic Chemistry a...,Anti-Trypanosomatid Elemanolide Sesquiterpene ...,"Kimani, Njogu M.; Matasyoh, Josphat C.; Kaiser...",2017.0,Sleeping sickness or human African trypanosomi...,['Journal Article']
3,BMC Complementary Medicine and Therapies,Phytochemical analysis and versatile in vitro ...,"Jan, Hasnain; Usman, Hazrat; Shah, Muzamil; Za...",2021.0,Background: Himalayan Columbine (Aquilegia pub...,['Journal Article']
4,Viruses,Antiviral and Cytotoxic Activity of Different ...,"Panda, Sujogya Kumar; Castro, Ana Hortência Fo...",2020.0,Chikungunya and yellow fever virus cause vecto...,['Journal Article']
...,...,...,...,...,...,...
8246,ACS Chemical Neuroscience,Tanshinones Inhibit Amyloid Aggregation by Amy...,"Wang, Qiuming; Yu, Xiang; Patal, Kunal; Hu, Ru...",2013.0,The misfolding and aggregation of amyloid-β (A...,['Journal Article']
8247,Acta Pharmacologica Sinica,Salvianolic acid B ameliorates depressive-like...,"Zhang, Jin-qiang; Wu, Xiao-hui; Feng, Yi; Xie,...",2016.0,Aim:: Major depressive disorder (MDD) is a deb...,['Journal Article']
8248,PLoS ONE,Cardiac Microvascular Barrier Function Mediate...,"Qi, Kang; Li, Lujin; Li, Xiangdong; Zhao, Jing...",2015.0,Objective: Tongxinluo (TXL) has been shown to ...,['Journal Article']
8249,Molecules,Production of Salvianolic Acid B in Roots of S...,"Li, Xiao-Bing; Wang, Wei; Zhou, Guo-Jun; Li, Y...",2012.0,Drying is the most common and fundamental proc...,['Journal Article']


Screening

In [None]:
# Remove records whose paper type is missing.
# Missing cells are real NaN values, not the text "NaN", so an equality test
# against the string "NaN" never matches anything; dropna() tests for
# missing values properly.
table_1_with_PT = df.dropna(subset=["Paper_type"])
table_1_with_PT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8251 entries, 0 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     8251 non-null   object 
 1   Title       8251 non-null   object 
 2   Authors     8251 non-null   object 
 3   Year        8022 non-null   float64
 4   Abstract    8198 non-null   object 
 5   Paper_type  8251 non-null   object 
dtypes: float64(1), object(5)
memory usage: 386.9+ KB


In [None]:
# Remove the records that carry the "No paper type" placeholder
has_no_paper_type = df["Paper_type"] == "No paper type"
table_1_with_PT = df.drop(index=df.loc[has_no_paper_type].index)
table_1_with_PT

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type
0,Clinical Microbiology Reviews,Natural-Product-Based Solutions for Tropical I...,"Adegboye, Oyelola; Field, Matt A.; Kupz, Andre...",,About half of the world’s population and 80% o...,"['Journal Article', 'Review']"
1,BMC Complementary and Alternative Medicine,"Antibacterial activities, proposed mode of act...","Mombeshora, Molly; Mukanganyama, Stanley",2019.0,Background: Pseudomonas aeruginosa has become ...,['Journal Article']
2,Molecules : A Journal of Synthetic Chemistry a...,Anti-Trypanosomatid Elemanolide Sesquiterpene ...,"Kimani, Njogu M.; Matasyoh, Josphat C.; Kaiser...",2017.0,Sleeping sickness or human African trypanosomi...,['Journal Article']
3,BMC Complementary Medicine and Therapies,Phytochemical analysis and versatile in vitro ...,"Jan, Hasnain; Usman, Hazrat; Shah, Muzamil; Za...",2021.0,Background: Himalayan Columbine (Aquilegia pub...,['Journal Article']
4,Viruses,Antiviral and Cytotoxic Activity of Different ...,"Panda, Sujogya Kumar; Castro, Ana Hortência Fo...",2020.0,Chikungunya and yellow fever virus cause vecto...,['Journal Article']
...,...,...,...,...,...,...
8246,ACS Chemical Neuroscience,Tanshinones Inhibit Amyloid Aggregation by Amy...,"Wang, Qiuming; Yu, Xiang; Patal, Kunal; Hu, Ru...",2013.0,The misfolding and aggregation of amyloid-β (A...,['Journal Article']
8247,Acta Pharmacologica Sinica,Salvianolic acid B ameliorates depressive-like...,"Zhang, Jin-qiang; Wu, Xiao-hui; Feng, Yi; Xie,...",2016.0,Aim:: Major depressive disorder (MDD) is a deb...,['Journal Article']
8248,PLoS ONE,Cardiac Microvascular Barrier Function Mediate...,"Qi, Kang; Li, Lujin; Li, Xiangdong; Zhao, Jing...",2015.0,Objective: Tongxinluo (TXL) has been shown to ...,['Journal Article']
8249,Molecules,Production of Salvianolic Acid B in Roots of S...,"Li, Xiao-Bing; Wang, Wei; Zhou, Guo-Jun; Li, Y...",2012.0,Drying is the most common and fundamental proc...,['Journal Article']


In [None]:
# Print a summary of table_1_with_PT: number of entries, column dtypes and non-null counts
table_1_with_PT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8251 entries, 0 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     8251 non-null   object 
 1   Title       8251 non-null   object 
 2   Authors     8251 non-null   object 
 3   Year        8022 non-null   float64
 4   Abstract    8198 non-null   object 
 5   Paper_type  8251 non-null   object 
dtypes: float64(1), object(5)
memory usage: 386.9+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'Review'];
# this starts the filtered table from table_1_with_PT
is_review = table_1_with_PT["Paper_type"].eq("['Journal Article', 'Review']")
table_1_filter = table_1_with_PT.drop(index=is_review[is_review].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5793 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5793 non-null   object 
 1   Title       5793 non-null   object 
 2   Authors     5793 non-null   object 
 3   Year        5630 non-null   float64
 4   Abstract    5761 non-null   object 
 5   Paper_type  5793 non-null   object 
dtypes: float64(1), object(5)
memory usage: 316.8+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'News']
is_news = table_1_filter["Paper_type"].eq("['Journal Article', 'News']")
table_1_filter = table_1_filter.drop(index=is_news[is_news].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5760 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5760 non-null   object 
 1   Title       5760 non-null   object 
 2   Authors     5760 non-null   object 
 3   Year        5598 non-null   float64
 4   Abstract    5728 non-null   object 
 5   Paper_type  5760 non-null   object 
dtypes: float64(1), object(5)
memory usage: 315.0+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'Editorial']
is_editorial = table_1_filter["Paper_type"].eq("['Journal Article', 'Editorial']")
table_1_filter = table_1_filter.drop(index=is_editorial[is_editorial].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5706 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5706 non-null   object 
 1   Title       5706 non-null   object 
 2   Authors     5706 non-null   object 
 3   Year        5545 non-null   float64
 4   Abstract    5674 non-null   object 
 5   Paper_type  5706 non-null   object 
dtypes: float64(1), object(5)
memory usage: 312.0+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'Published Erratum']
is_erratum = table_1_filter["Paper_type"].eq("['Journal Article', 'Published Erratum']")
table_1_filter = table_1_filter.drop(index=is_erratum[is_erratum].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5706 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5706 non-null   object 
 1   Title       5706 non-null   object 
 2   Authors     5706 non-null   object 
 3   Year        5545 non-null   float64
 4   Abstract    5674 non-null   object 
 5   Paper_type  5706 non-null   object 
dtypes: float64(1), object(5)
memory usage: 312.0+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'Case Report']
is_case_report = table_1_filter["Paper_type"].eq("['Journal Article', 'Case Report']")
table_1_filter = table_1_filter.drop(index=is_case_report[is_case_report].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5692 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5692 non-null   object 
 1   Title       5692 non-null   object 
 2   Authors     5692 non-null   object 
 3   Year        5531 non-null   float64
 4   Abstract    5660 non-null   object 
 5   Paper_type  5692 non-null   object 
dtypes: float64(1), object(5)
memory usage: 311.3+ KB


In [None]:
# Exclude records whose paper type is exactly
# ['Journal Article', 'Journal Article', 'Review'] (duplicated first entry)
is_dup_review = table_1_filter["Paper_type"].eq("['Journal Article', 'Journal Article', 'Review']")
table_1_filter = table_1_filter.drop(index=is_dup_review[is_dup_review].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5692 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5692 non-null   object 
 1   Title       5692 non-null   object 
 2   Authors     5692 non-null   object 
 3   Year        5531 non-null   float64
 4   Abstract    5660 non-null   object 
 5   Paper_type  5692 non-null   object 
dtypes: float64(1), object(5)
memory usage: 311.3+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'Letter']
is_letter = table_1_filter["Paper_type"].eq("['Journal Article', 'Letter']")
table_1_filter = table_1_filter.drop(index=is_letter[is_letter].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5678 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5678 non-null   object 
 1   Title       5678 non-null   object 
 2   Authors     5678 non-null   object 
 3   Year        5517 non-null   float64
 4   Abstract    5646 non-null   object 
 5   Paper_type  5678 non-null   object 
dtypes: float64(1), object(5)
memory usage: 310.5+ KB


In [None]:
# Exclude records whose paper type is exactly ['Journal Article', 'Comment']
is_comment = table_1_filter["Paper_type"].eq("['Journal Article', 'Comment']")
table_1_filter = table_1_filter.drop(index=is_comment[is_comment].index)
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5662 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5662 non-null   object 
 1   Title       5662 non-null   object 
 2   Authors     5662 non-null   object 
 3   Year        5501 non-null   float64
 4   Abstract    5630 non-null   object 
 5   Paper_type  5662 non-null   object 
dtypes: float64(1), object(5)
memory usage: 309.6+ KB


In [None]:
# Display table_1_filter as it stands after the paper-type exclusions
table_1_filter

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type
1,BMC Complementary and Alternative Medicine,"Antibacterial activities, proposed mode of act...","Mombeshora, Molly; Mukanganyama, Stanley",2019.0,Background: Pseudomonas aeruginosa has become ...,['Journal Article']
2,Molecules : A Journal of Synthetic Chemistry a...,Anti-Trypanosomatid Elemanolide Sesquiterpene ...,"Kimani, Njogu M.; Matasyoh, Josphat C.; Kaiser...",2017.0,Sleeping sickness or human African trypanosomi...,['Journal Article']
3,BMC Complementary Medicine and Therapies,Phytochemical analysis and versatile in vitro ...,"Jan, Hasnain; Usman, Hazrat; Shah, Muzamil; Za...",2021.0,Background: Himalayan Columbine (Aquilegia pub...,['Journal Article']
4,Viruses,Antiviral and Cytotoxic Activity of Different ...,"Panda, Sujogya Kumar; Castro, Ana Hortência Fo...",2020.0,Chikungunya and yellow fever virus cause vecto...,['Journal Article']
5,Molecules,Antiprotozoal Activity of Plants Used in the M...,"Vahekeni, Nina; Brillatz, Théo; Rahmaty, Marja...",2024.0,"Folk medicine is widely used in Angola, even f...",['Journal Article']
...,...,...,...,...,...,...
8246,ACS Chemical Neuroscience,Tanshinones Inhibit Amyloid Aggregation by Amy...,"Wang, Qiuming; Yu, Xiang; Patal, Kunal; Hu, Ru...",2013.0,The misfolding and aggregation of amyloid-β (A...,['Journal Article']
8247,Acta Pharmacologica Sinica,Salvianolic acid B ameliorates depressive-like...,"Zhang, Jin-qiang; Wu, Xiao-hui; Feng, Yi; Xie,...",2016.0,Aim:: Major depressive disorder (MDD) is a deb...,['Journal Article']
8248,PLoS ONE,Cardiac Microvascular Barrier Function Mediate...,"Qi, Kang; Li, Lujin; Li, Xiangdong; Zhao, Jing...",2015.0,Objective: Tongxinluo (TXL) has been shown to ...,['Journal Article']
8249,Molecules,Production of Salvianolic Acid B in Roots of S...,"Li, Xiao-Bing; Wang, Wei; Zhou, Guo-Jun; Li, Y...",2012.0,Drying is the most common and fundamental proc...,['Journal Article']


In [None]:
# Show the records whose abstract is missing.
# Two fixes: missing cells are real NaN values, so they are selected with
# .isna() (an equality test against the string "NaN" never matches), and the
# selection is now the last expression of the cell so that it is actually
# displayed instead of being discarded.
table_1_filter[table_1_filter["Abstract"].isna()]

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type
1,BMC Complementary and Alternative Medicine,"Antibacterial activities, proposed mode of act...","Mombeshora, Molly; Mukanganyama, Stanley",2019.0,Background: Pseudomonas aeruginosa has become ...,['Journal Article']
2,Molecules : A Journal of Synthetic Chemistry a...,Anti-Trypanosomatid Elemanolide Sesquiterpene ...,"Kimani, Njogu M.; Matasyoh, Josphat C.; Kaiser...",2017.0,Sleeping sickness or human African trypanosomi...,['Journal Article']
3,BMC Complementary Medicine and Therapies,Phytochemical analysis and versatile in vitro ...,"Jan, Hasnain; Usman, Hazrat; Shah, Muzamil; Za...",2021.0,Background: Himalayan Columbine (Aquilegia pub...,['Journal Article']
4,Viruses,Antiviral and Cytotoxic Activity of Different ...,"Panda, Sujogya Kumar; Castro, Ana Hortência Fo...",2020.0,Chikungunya and yellow fever virus cause vecto...,['Journal Article']
5,Molecules,Antiprotozoal Activity of Plants Used in the M...,"Vahekeni, Nina; Brillatz, Théo; Rahmaty, Marja...",2024.0,"Folk medicine is widely used in Angola, even f...",['Journal Article']
...,...,...,...,...,...,...
8246,ACS Chemical Neuroscience,Tanshinones Inhibit Amyloid Aggregation by Amy...,"Wang, Qiuming; Yu, Xiang; Patal, Kunal; Hu, Ru...",2013.0,The misfolding and aggregation of amyloid-β (A...,['Journal Article']
8247,Acta Pharmacologica Sinica,Salvianolic acid B ameliorates depressive-like...,"Zhang, Jin-qiang; Wu, Xiao-hui; Feng, Yi; Xie,...",2016.0,Aim:: Major depressive disorder (MDD) is a deb...,['Journal Article']
8248,PLoS ONE,Cardiac Microvascular Barrier Function Mediate...,"Qi, Kang; Li, Lujin; Li, Xiangdong; Zhao, Jing...",2015.0,Objective: Tongxinluo (TXL) has been shown to ...,['Journal Article']
8249,Molecules,Production of Salvianolic Acid B in Roots of S...,"Li, Xiao-Bing; Wang, Wei; Zhou, Guo-Jun; Li, Y...",2012.0,Drying is the most common and fundamental proc...,['Journal Article']


In [None]:
# Show the records that carry the "No abstract" placeholder
table_1_filter.loc[table_1_filter["Abstract"].eq("No abstract")]

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type
67,FEBS Open Bio,Posters.,No authors,2024.0,No abstract,['Journal Article']
188,Journal of Fungi,9th Trends in Medical Mycology Held on 11–14 O...,"Gangneux, Jean-Pierre; Lortholary, Olivier; Co...",2019.0,No abstract,['Journal Article']
419,FEBS Open Bio,Posters.,No authors,2023.0,No abstract,['Journal Article']
740,Journal of Extracellular Vesicles,ISEV2024 Abstract Book.,No authors,2024.0,No abstract,['Journal Article']
857,VirusDisease,Abstracts of the papers presented in the inter...,No authors,2023.0,No abstract,['Journal Article']
...,...,...,...,...,...,...
8159,London journal of medicine,Digest of the Journals.,No authors,1852.0,No abstract,['Journal Article']
8163,British Medical Journal,The British Medical Journal.,No authors,1887.0,No abstract,['Journal Article']
8165,Medical History. Supplement,"Animism, vitalism, and the Medical University ...",No authors,1984.0,No abstract,['Journal Article']
8178,Journal of the National Medical Association,Social service aspects of drug addiction in th...,"Beine, E. C.",1974.0,No abstract,['Journal Article']


In [None]:
# Remove the records that carry the "No abstract" placeholder, save the
# result to CSV (without the index column) and print a summary
is_placeholder = table_1_filter["Abstract"].eq("No abstract")
table_1_with_abstracts = table_1_filter.drop(index=is_placeholder[is_placeholder].index)
table_1_with_abstracts.to_csv("table_1_with_abstracts.csv", index=False)
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5095 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5095 non-null   object 
 1   Title       5095 non-null   object 
 2   Authors     5095 non-null   object 
 3   Year        4937 non-null   float64
 4   Abstract    5063 non-null   object 
 5   Paper_type  5095 non-null   object 
dtypes: float64(1), object(5)
memory usage: 278.6+ KB


In [None]:
# Drop the records whose abstract is missing (NaN).
# Missing cells are real NaN values, not the text "NaN", so the previous
# equality test against the string "NaN" removed nothing; dropna() removes
# the rows with a missing Abstract before the table is saved.
table_1_with_abstracts = table_1_with_abstracts.dropna(subset=["Abstract"])
table_1_with_abstracts.to_csv("table_1_with_abstracts.csv", index=False)
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5095 entries, 1 to 8250
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Journal     5095 non-null   object 
 1   Title       5095 non-null   object 
 2   Authors     5095 non-null   object 
 3   Year        4937 non-null   float64
 4   Abstract    5063 non-null   object 
 5   Paper_type  5095 non-null   object 
dtypes: float64(1), object(5)
memory usage: 278.6+ KB


In [None]:
# Display the final screened table
table_1_with_abstracts

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type
1,BMC Complementary and Alternative Medicine,"Antibacterial activities, proposed mode of act...","Mombeshora, Molly; Mukanganyama, Stanley",2019.0,Background: Pseudomonas aeruginosa has become ...,['Journal Article']
2,Molecules : A Journal of Synthetic Chemistry a...,Anti-Trypanosomatid Elemanolide Sesquiterpene ...,"Kimani, Njogu M.; Matasyoh, Josphat C.; Kaiser...",2017.0,Sleeping sickness or human African trypanosomi...,['Journal Article']
3,BMC Complementary Medicine and Therapies,Phytochemical analysis and versatile in vitro ...,"Jan, Hasnain; Usman, Hazrat; Shah, Muzamil; Za...",2021.0,Background: Himalayan Columbine (Aquilegia pub...,['Journal Article']
4,Viruses,Antiviral and Cytotoxic Activity of Different ...,"Panda, Sujogya Kumar; Castro, Ana Hortência Fo...",2020.0,Chikungunya and yellow fever virus cause vecto...,['Journal Article']
5,Molecules,Antiprotozoal Activity of Plants Used in the M...,"Vahekeni, Nina; Brillatz, Théo; Rahmaty, Marja...",2024.0,"Folk medicine is widely used in Angola, even f...",['Journal Article']
...,...,...,...,...,...,...
8246,ACS Chemical Neuroscience,Tanshinones Inhibit Amyloid Aggregation by Amy...,"Wang, Qiuming; Yu, Xiang; Patal, Kunal; Hu, Ru...",2013.0,The misfolding and aggregation of amyloid-β (A...,['Journal Article']
8247,Acta Pharmacologica Sinica,Salvianolic acid B ameliorates depressive-like...,"Zhang, Jin-qiang; Wu, Xiao-hui; Feng, Yi; Xie,...",2016.0,Aim:: Major depressive disorder (MDD) is a deb...,['Journal Article']
8248,PLoS ONE,Cardiac Microvascular Barrier Function Mediate...,"Qi, Kang; Li, Lujin; Li, Xiangdong; Zhao, Jing...",2015.0,Objective: Tongxinluo (TXL) has been shown to ...,['Journal Article']
8249,Molecules,Production of Salvianolic Acid B in Roots of S...,"Li, Xiao-Bing; Wang, Wei; Zhou, Guo-Jun; Li, Y...",2012.0,Drying is the most common and fundamental proc...,['Journal Article']
