Parsing Medline file

In [None]:
from Bio import Medline
import pandas as pd

In [None]:
import csv

from Bio import Medline

# Column order of the tab-separated output file.
header = ["Journal", "Title", "Authors", "Year", "Abstract", "Paper type", "Affiliation"]

# newline="" hands line-ending control to the csv module; lineterminator="\n"
# keeps the one-record-per-line layout the file had before.
with open("table1.csv", "w", newline="", encoding="utf-8") as file, \
        open("/content/pmc_result.MEDLINE", encoding="utf-8") as handle:
    # csv.writer quotes any field that contains a tab, quote or newline, so a
    # stray delimiter inside a title or abstract can no longer shift columns.
    writer = csv.writer(file, delimiter="\t", lineterminator="\n")
    writer.writerow(header)

    # Medline.parse yields one record per iteration. Iterating the handle
    # directly and calling Medline.read inside the loop swallowed one line of
    # every record and ended with a suppressed StopIteration at end of file.
    for record in Medline.parse(handle):
        try:
            # .get() supplies a placeholder when a field is absent.
            title = record.get("TI", "No title")
            journal = record.get("JT", "No journal")
            authors = "; ".join(record.get("FAU", ["No authors"]))
            # DP starts with the year; "N/A" marks a missing date.
            year = record.get("DP", "N/A").split(" ")[0]
            abstract = record.get("AB", "No abstract")
            # PT and AD are written as their Python repr (e.g.
            # "['Journal Article', 'Review']") because the screening cells
            # compare Paper_type against exactly that text.
            paper_type = str(record.get("PT", "No paper type"))
            affiliation = str(record.get("AD", "No affiliation"))
        except (AttributeError, TypeError) as exc:
            # A field with an unexpected type: report it and move on.
            print(f"Error processing record: {exc}")
            continue

        writer.writerow([journal, title, authors, year, abstract, paper_type, affiliation])

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the tab-separated export and give the columns identifier-friendly names
# (the file header spells the sixth column "Paper type").
table_columns = ["Journal", "Title", "Authors", "Year", "Abstract", "Paper_type", "Affiliation"]
df = pd.read_csv("/content/table1.csv", delimiter="\t")
df.columns = table_columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44394 entries, 0 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      44394 non-null  object 
 1   Title        44394 non-null  object 
 2   Authors      44394 non-null  object 
 3   Year         42748 non-null  float64
 4   Abstract     44191 non-null  object 
 5   Paper_type   44394 non-null  object 
 6   Affiliation  44394 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.4+ MB


In [None]:
# Show the parsed table; for a frame this size the notebook renders only the first and last rows.
df

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type,Affiliation
0,Clinical Microbiology Reviews,Natural-Product-Based Solutions for Tropical I...,"Adegboye, Oyelola; Field, Matt A.; Kupz, Andre...",,About half of the world’s population and 80% o...,"['Journal Article', 'Review']","['Public Health and Tropical Medicine, College..."
1,Transactions of the Royal Society of Tropical ...,"Climate change, malaria and neglected tropical...","Klepac, Petra; Hsieh, Jennifer L; Ducker, Cami...",2024.0,To explore the effects of climate change on ma...,"['Journal Article', 'Review']","['Big Data Institute, Oxford University, Oxfor..."
2,Clinical Infectious Diseases: An Official Publ...,Accelerating Progress Towards the 2030 Neglect...,"Vasconcelos, Andreia; King, Jonathan D; Nunes-...",2024.0,"Over the past decade, considerable progress ha...",['Journal Article'],"['Big Data Institute, Li Ka Shing Centre for H..."
3,Molecules : A Journal of Synthetic Chemistry a...,The Role of Natural Products in Drug Discovery...,"Cheuka, Peter Mubanga; Mayoka, Godfrey; Mutai,...",2017.0,Endemic in 149 tropical and subtropical countr...,"['Journal Article', 'Review']","['Department of Chemistry, University of Cape ..."
4,Infectious Diseases of Poverty,Potentials of marine natural products against ...,"Nweze, Justus Amuche; Mbaoji, Florence N.; Li,...",2021.0,Background: Malaria and neglected communicable...,"['Journal Article', 'Review']",['grid.418329.50000 0004 1774 8517Guangxi Key ...
...,...,...,...,...,...,...,...
44389,Proceedings of the Royal Society of Medicine,Immunosuppression and its complications.,"Woodruff, M.",1969.0,No abstract,"['Journal Article', 'Review']",No affiliation
44390,BMC Veterinary Research,"Dog ecology and demography in Antananarivo, 2007.","Ratsitorahina, Maherisoa; Rasambainarivo, Jhon...",2009.0,Background: Rabies is a widespread disease in ...,['Journal Article'],"[""Unité d'Epidémiologie, Institut Pasteur de M..."
44391,BMC Veterinary Research,Effect of adjuvants on the humoral immune resp...,"Kateregga, John; Lubega, George W; Lindblad, E...",2012.0,Background: We investigated several adjuvants ...,['Journal Article'],"['College of Veterinary Medicine, Animal Resou..."
44392,PLoS ONE,Morphological Characterization and Quantificat...,"Du, Huan; Lv, Pin; Ayouz, Mehdi; Besserer, Arn...",2016.0,Continuous observation was performed using con...,['Journal Article'],"['LGPM, CentraleSupelec, Université Paris-Sacl..."


Screening

In [None]:
# Keep rows whose Paper_type is present: not NaN, not "N/A", not an empty string.
paper_type = df["Paper_type"]
has_paper_type = paper_type.notna() & ~paper_type.isin(["N/A", ""])
table_1_with_PT = df[has_paper_type]
table_1_with_PT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44394 entries, 0 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      44394 non-null  object 
 1   Title        44394 non-null  object 
 2   Authors      44394 non-null  object 
 3   Year         42748 non-null  float64
 4   Abstract     44191 non-null  object 
 5   Paper_type   44394 non-null  object 
 6   Affiliation  44394 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.4+ MB


### Drop records without a paper type

In [None]:
# Continue from the frame filtered in the previous cell. Starting again from
# `df` would throw that filtering away. Here we remove rows that carry the
# parser's "No paper type" placeholder.
table_1_with_PT = table_1_with_PT[table_1_with_PT["Paper_type"] != "No paper type"]
table_1_with_PT

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type,Affiliation
0,Clinical Microbiology Reviews,Natural-Product-Based Solutions for Tropical I...,"Adegboye, Oyelola; Field, Matt A.; Kupz, Andre...",,About half of the world’s population and 80% o...,"['Journal Article', 'Review']","['Public Health and Tropical Medicine, College..."
1,Transactions of the Royal Society of Tropical ...,"Climate change, malaria and neglected tropical...","Klepac, Petra; Hsieh, Jennifer L; Ducker, Cami...",2024.0,To explore the effects of climate change on ma...,"['Journal Article', 'Review']","['Big Data Institute, Oxford University, Oxfor..."
2,Clinical Infectious Diseases: An Official Publ...,Accelerating Progress Towards the 2030 Neglect...,"Vasconcelos, Andreia; King, Jonathan D; Nunes-...",2024.0,"Over the past decade, considerable progress ha...",['Journal Article'],"['Big Data Institute, Li Ka Shing Centre for H..."
3,Molecules : A Journal of Synthetic Chemistry a...,The Role of Natural Products in Drug Discovery...,"Cheuka, Peter Mubanga; Mayoka, Godfrey; Mutai,...",2017.0,Endemic in 149 tropical and subtropical countr...,"['Journal Article', 'Review']","['Department of Chemistry, University of Cape ..."
4,Infectious Diseases of Poverty,Potentials of marine natural products against ...,"Nweze, Justus Amuche; Mbaoji, Florence N.; Li,...",2021.0,Background: Malaria and neglected communicable...,"['Journal Article', 'Review']",['grid.418329.50000 0004 1774 8517Guangxi Key ...
...,...,...,...,...,...,...,...
44389,Proceedings of the Royal Society of Medicine,Immunosuppression and its complications.,"Woodruff, M.",1969.0,No abstract,"['Journal Article', 'Review']",No affiliation
44390,BMC Veterinary Research,"Dog ecology and demography in Antananarivo, 2007.","Ratsitorahina, Maherisoa; Rasambainarivo, Jhon...",2009.0,Background: Rabies is a widespread disease in ...,['Journal Article'],"[""Unité d'Epidémiologie, Institut Pasteur de M..."
44391,BMC Veterinary Research,Effect of adjuvants on the humoral immune resp...,"Kateregga, John; Lubega, George W; Lindblad, E...",2012.0,Background: We investigated several adjuvants ...,['Journal Article'],"['College of Veterinary Medicine, Animal Resou..."
44392,PLoS ONE,Morphological Characterization and Quantificat...,"Du, Huan; Lv, Pin; Ayouz, Mehdi; Besserer, Arn...",2016.0,Continuous observation was performed using con...,['Journal Article'],"['LGPM, CentraleSupelec, Université Paris-Sacl..."


In [None]:
# Summarise the frame after the paper-type clean-up (row count and non-null counts per column).
table_1_with_PT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44394 entries, 0 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      44394 non-null  object 
 1   Title        44394 non-null  object 
 2   Authors      44394 non-null  object 
 3   Year         42748 non-null  float64
 4   Abstract     44191 non-null  object 
 5   Paper_type   44394 non-null  object 
 6   Affiliation  44394 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.4+ MB


### Drop records whose paper type is Review or another non-research category (News, Editorial, Published Erratum, Case Report, Letter, Comment)

In [None]:
# Exclude rows typed exactly as Journal Article + Review.
review_rows = table_1_with_PT["Paper_type"] == "['Journal Article', 'Review']"
table_1_filter = table_1_with_PT.loc[~review_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33260 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      33260 non-null  object 
 1   Title        33260 non-null  object 
 2   Authors      33260 non-null  object 
 3   Year         32006 non-null  float64
 4   Abstract     33143 non-null  object 
 5   Paper_type   33260 non-null  object 
 6   Affiliation  33260 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows typed exactly as Journal Article + News.
news_rows = table_1_filter["Paper_type"] == "['Journal Article', 'News']"
table_1_filter = table_1_filter.loc[~news_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33105 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      33105 non-null  object 
 1   Title        33105 non-null  object 
 2   Authors      33105 non-null  object 
 3   Year         31856 non-null  float64
 4   Abstract     32988 non-null  object 
 5   Paper_type   33105 non-null  object 
 6   Affiliation  33105 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows typed exactly as Journal Article + Editorial.
editorial_rows = table_1_filter["Paper_type"] == "['Journal Article', 'Editorial']"
table_1_filter = table_1_filter.loc[~editorial_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32733 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      32733 non-null  object 
 1   Title        32733 non-null  object 
 2   Authors      32733 non-null  object 
 3   Year         31497 non-null  float64
 4   Abstract     32617 non-null  object 
 5   Paper_type   32733 non-null  object 
 6   Affiliation  32733 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows typed exactly as Journal Article + Published Erratum.
erratum_rows = table_1_filter["Paper_type"] == "['Journal Article', 'Published Erratum']"
table_1_filter = table_1_filter.loc[~erratum_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32730 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      32730 non-null  object 
 1   Title        32730 non-null  object 
 2   Authors      32730 non-null  object 
 3   Year         31494 non-null  float64
 4   Abstract     32614 non-null  object 
 5   Paper_type   32730 non-null  object 
 6   Affiliation  32730 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows typed exactly as Journal Article + Case Report.
case_report_rows = table_1_filter["Paper_type"] == "['Journal Article', 'Case Report']"
table_1_filter = table_1_filter.loc[~case_report_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32530 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      32530 non-null  object 
 1   Title        32530 non-null  object 
 2   Authors      32530 non-null  object 
 3   Year         31301 non-null  float64
 4   Abstract     32414 non-null  object 
 5   Paper_type   32530 non-null  object 
 6   Affiliation  32530 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows whose type lists Journal Article twice followed by Review.
doubled_review_rows = table_1_filter["Paper_type"] == "['Journal Article', 'Journal Article', 'Review']"
table_1_filter = table_1_filter.loc[~doubled_review_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32530 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      32530 non-null  object 
 1   Title        32530 non-null  object 
 2   Authors      32530 non-null  object 
 3   Year         31301 non-null  float64
 4   Abstract     32414 non-null  object 
 5   Paper_type   32530 non-null  object 
 6   Affiliation  32530 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows typed exactly as Journal Article + Letter.
letter_rows = table_1_filter["Paper_type"] == "['Journal Article', 'Letter']"
table_1_filter = table_1_filter.loc[~letter_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32328 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      32328 non-null  object 
 1   Title        32328 non-null  object 
 2   Authors      32328 non-null  object 
 3   Year         31108 non-null  float64
 4   Abstract     32212 non-null  object 
 5   Paper_type   32328 non-null  object 
 6   Affiliation  32328 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Exclude rows typed exactly as Journal Article + Comment.
comment_rows = table_1_filter["Paper_type"] == "['Journal Article', 'Comment']"
table_1_filter = table_1_filter.loc[~comment_rows]
table_1_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32208 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      32208 non-null  object 
 1   Title        32208 non-null  object 
 2   Authors      32208 non-null  object 
 3   Year         30999 non-null  float64
 4   Abstract     32092 non-null  object 
 5   Paper_type   32208 non-null  object 
 6   Affiliation  32208 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB


In [None]:
# Preview the records that remain after the paper-type exclusions.
table_1_filter

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type,Affiliation
2,Clinical Infectious Diseases: An Official Publ...,Accelerating Progress Towards the 2030 Neglect...,"Vasconcelos, Andreia; King, Jonathan D; Nunes-...",2024.0,"Over the past decade, considerable progress ha...",['Journal Article'],"['Big Data Institute, Li Ka Shing Centre for H..."
6,The American Journal of Tropical Medicine and ...,Integrating Neglected Tropical Disease and Imm...,"Mwingira, Upendo John; Means, Arianna Rubin; C...",2016.0,Global health practitioners are increasingly a...,['Journal Article'],['Neglected Tropical Disease Control Programme...
7,Mycoses,Epidemiological cut‐off values for itraconazol...,"Nyuykonge, Bertrand; Siddig, Emmanuel E.; Mhmo...",2022.0,Background: Eumycetoma is a neglected tropical...,['Journal Article'],['Department of Medical Microbiology and Infec...
8,PLoS Neglected Tropical Diseases,Individual adherence to mass drug administrati...,"Hardwick, Robert J.; Truscott, James E.; Oswal...",2021.0,We present a general framework which describes...,['Journal Article'],['London Centre for Neglected Tropical Disease...
11,Philosophical Transactions of the Royal Societ...,Introduction to the special issue: challenges ...,"Forbes, Kathryn; Basáñez, Maria-Gloria; Hollin...",,Twenty neglected tropical diseases (NTDs) are ...,['Journal Article'],['London Centre for Neglected Tropical Disease...
...,...,...,...,...,...,...,...
44387,Parasitology,Molecular evidence of three closely related sp...,"Uhrovič, Dalibor; Oros, Mikuláš; Kudlai, Olena...",,,['Journal Article'],"['Institute of Parasitology, Slovak Academy of..."
44390,BMC Veterinary Research,"Dog ecology and demography in Antananarivo, 2007.","Ratsitorahina, Maherisoa; Rasambainarivo, Jhon...",2009.0,Background: Rabies is a widespread disease in ...,['Journal Article'],"[""Unité d'Epidémiologie, Institut Pasteur de M..."
44391,BMC Veterinary Research,Effect of adjuvants on the humoral immune resp...,"Kateregga, John; Lubega, George W; Lindblad, E...",2012.0,Background: We investigated several adjuvants ...,['Journal Article'],"['College of Veterinary Medicine, Animal Resou..."
44392,PLoS ONE,Morphological Characterization and Quantificat...,"Du, Huan; Lv, Pin; Ayouz, Mehdi; Besserer, Arn...",2016.0,Continuous observation was performed using con...,['Journal Article'],"['LGPM, CentraleSupelec, Université Paris-Sacl..."


In [None]:
#table_1_filter[table_1_filter["Abstract"] == "NaN"]
#table_1_filter

In [None]:
# Show the rows that carry the parser's "No abstract" placeholder.
lacks_abstract = table_1_filter["Abstract"].eq("No abstract")
table_1_filter.loc[lacks_abstract]

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type,Affiliation
38,PLoS Neglected Tropical Diseases,The global burden of disease study 2013: What ...,"Herricks, Jennifer R.; Hotez, Peter J.; Wanga,...",2017.0,No abstract,['Journal Article'],['Department of Pediatrics and Molecular Virol...
59,Marine Drugs,XVI International Symposium on Marine Natural ...,"Pedrosa, Rui; P. Gaudêncio, Susana; Vasconcelo...",2020.0,No abstract,['Journal Article'],['MARE—Marine and Environmental Science Centre...
71,PLoS Neglected Tropical Diseases,Research Priorities for Neglected Infectious D...,"Dujardin, Jean-Claude; Herrera, Socrates; do R...",2010.0,No abstract,['Journal Article'],"['Institute of Tropical Medicine, Antwerp, Bel..."
83,PLoS Neglected Tropical Diseases,Strategy for a globally coordinated response t...,"Williams, David J.; Faiz, Mohd Abul; Abela-Rid...",2019.0,No abstract,['Journal Article'],"['Australian Venom Research Unit, Department o..."
89,PLoS Neglected Tropical Diseases,Hepatitis B virus infection as a neglected tro...,"O’Hara, Geraldine A.; McNaughton, Anna L.; Map...",2017.0,No abstract,['Journal Article'],"['Faculty of Infectious and Tropical Diseases,..."
...,...,...,...,...,...,...,...
44312,Thorax,MANAGEMENT OF AN ACUTE EXACERBATION OF COPD: A...,"Johnson, M; Stevenson, R",2002.0,No abstract,['Journal Article'],No affiliation
44330,Journal of the Royal Society of Medicine,Antimicrobial responses in amphibia.,"Turner, R J",1979.0,No abstract,['Journal Article'],No affiliation
44331,The British Journal of Radiology,Molecular radiotherapy — the radionuclide raffle?,"Gaze, M N; Flux, G D",2010.0,No abstract,['Journal Article'],"['Department of Oncology, University College L..."
44336,Gut,"Colorectal cancer: a tale of two sides, or a c...","Yamauchi, Mai; Lochhead, Paul; Morikawa, Teppe...",2012.0,No abstract,['Journal Article'],"['Department of Medical Oncology, Dana-Farber ..."


In [None]:
# Remove rows that carry the parser's "No abstract" placeholder.
placeholder_abstract = table_1_filter["Abstract"].eq("No abstract")
table_1_with_abstracts = table_1_filter.loc[~placeholder_abstract]
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30033 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      30033 non-null  object 
 1   Title        30033 non-null  object 
 2   Authors      30033 non-null  object 
 3   Year         28838 non-null  float64
 4   Abstract     29917 non-null  object 
 5   Paper_type   30033 non-null  object 
 6   Affiliation  30033 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.8+ MB


In [None]:
# Second pass: also remove rows whose abstract is "N/A", an empty string, or NaN,
# then save the result.
abstract_col = table_1_with_abstracts["Abstract"]
usable_abstract = abstract_col.notna() & ~abstract_col.isin(["N/A", ""])
table_1_with_abstracts = table_1_with_abstracts[usable_abstract]
table_1_with_abstracts.to_csv("table_1_with_abstracts.csv", index=False)
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29917 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      29917 non-null  object 
 1   Title        29917 non-null  object 
 2   Authors      29917 non-null  object 
 3   Year         28763 non-null  float64
 4   Abstract     29917 non-null  object 
 5   Paper_type   29917 non-null  object 
 6   Affiliation  29917 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.8+ MB


In [None]:
# Remove rows whose Year is "N/A", an empty string, or NaN.
year_col = table_1_with_abstracts["Year"]
usable_year = year_col.notna() & ~year_col.isin(["N/A", ""])
table_1_with_abstracts = table_1_with_abstracts[usable_year]
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28763 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      28763 non-null  object 
 1   Title        28763 non-null  object 
 2   Authors      28763 non-null  object 
 3   Year         28763 non-null  float64
 4   Abstract     28763 non-null  object 
 5   Paper_type   28763 non-null  object 
 6   Affiliation  28763 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.8+ MB


In [None]:
# Remove rows whose Affiliation is "N/A", an empty string, or NaN.
affiliation_col = table_1_with_abstracts["Affiliation"]
usable_affiliation = affiliation_col.notna() & ~affiliation_col.isin(["N/A", ""])
table_1_with_abstracts = table_1_with_abstracts[usable_affiliation]
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28763 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      28763 non-null  object 
 1   Title        28763 non-null  object 
 2   Authors      28763 non-null  object 
 3   Year         28763 non-null  float64
 4   Abstract     28763 non-null  object 
 5   Paper_type   28763 non-null  object 
 6   Affiliation  28763 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.8+ MB


In [None]:
# Remove rows that carry the parser's "No affiliation" placeholder, then save the result.
placeholder_affiliation = table_1_with_abstracts["Affiliation"] == "No affiliation"
rows_to_remove = table_1_with_abstracts.index[placeholder_affiliation]
table_1_with_affiliation = table_1_with_abstracts.drop(index=rows_to_remove)
table_1_with_affiliation.to_csv("table_1_with_affiliation.csv", index=False)
table_1_with_affiliation.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26040 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      26040 non-null  object 
 1   Title        26040 non-null  object 
 2   Authors      26040 non-null  object 
 3   Year         26040 non-null  float64
 4   Abstract     26040 non-null  object 
 5   Paper_type   26040 non-null  object 
 6   Affiliation  26040 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.6+ MB


In [None]:
# Preview the records that have an abstract, a year and an affiliation.
table_1_with_affiliation

Unnamed: 0,Journal,Title,Authors,Year,Abstract,Paper_type,Affiliation
2,Clinical Infectious Diseases: An Official Publ...,Accelerating Progress Towards the 2030 Neglect...,"Vasconcelos, Andreia; King, Jonathan D; Nunes-...",2024.0,"Over the past decade, considerable progress ha...",['Journal Article'],"['Big Data Institute, Li Ka Shing Centre for H..."
6,The American Journal of Tropical Medicine and ...,Integrating Neglected Tropical Disease and Imm...,"Mwingira, Upendo John; Means, Arianna Rubin; C...",2016.0,Global health practitioners are increasingly a...,['Journal Article'],['Neglected Tropical Disease Control Programme...
7,Mycoses,Epidemiological cut‐off values for itraconazol...,"Nyuykonge, Bertrand; Siddig, Emmanuel E.; Mhmo...",2022.0,Background: Eumycetoma is a neglected tropical...,['Journal Article'],['Department of Medical Microbiology and Infec...
8,PLoS Neglected Tropical Diseases,Individual adherence to mass drug administrati...,"Hardwick, Robert J.; Truscott, James E.; Oswal...",2021.0,We present a general framework which describes...,['Journal Article'],['London Centre for Neglected Tropical Disease...
12,PLoS Neglected Tropical Diseases,Co-authorship Network Analysis: A Powerful Too...,"Morel, Carlos Medicis; Serruya, Suzanne Jacob;...",2009.0,Background: New approaches and tools were need...,['Journal Article'],['National Institute for Science and Technolog...
...,...,...,...,...,...,...,...
44385,Medical Principles and Practice,Occupational Injuries Prone to Infectious Risk...,"Omar, Abeer A.; Abdo, Naglaa M.; Salama, Mona ...",2015.0,Objective: The study aimed at determining the ...,['Journal Article'],"['Infection Control Directorate, Ministry of H..."
44390,BMC Veterinary Research,"Dog ecology and demography in Antananarivo, 2007.","Ratsitorahina, Maherisoa; Rasambainarivo, Jhon...",2009.0,Background: Rabies is a widespread disease in ...,['Journal Article'],"[""Unité d'Epidémiologie, Institut Pasteur de M..."
44391,BMC Veterinary Research,Effect of adjuvants on the humoral immune resp...,"Kateregga, John; Lubega, George W; Lindblad, E...",2012.0,Background: We investigated several adjuvants ...,['Journal Article'],"['College of Veterinary Medicine, Animal Resou..."
44392,PLoS ONE,Morphological Characterization and Quantificat...,"Du, Huan; Lv, Pin; Ayouz, Mehdi; Besserer, Arn...",2016.0,Continuous observation was performed using con...,['Journal Article'],"['LGPM, CentraleSupelec, Université Paris-Sacl..."


In [None]:
# Count how many records fall in each publication year.
records_per_year = table_1_with_affiliation.groupby("Year").size()
publications_by_year = records_per_year.rename("Publication Count").reset_index()

# Show the per-year counts.
publications_by_year

Unnamed: 0,Year,Publication Count
0,1969.0,1
1,1985.0,1
2,1987.0,2
3,1988.0,1
4,1990.0,2
5,1994.0,1
6,1997.0,2
7,1998.0,5
8,1999.0,8
9,2000.0,16


CONFINE TO PAPERS FROM 2000 TO 2024 ONLY

In [None]:
# Make sure Year is numeric; anything unparseable becomes NaN.
year_as_number = pd.to_numeric(table_1_with_affiliation["Year"], errors="coerce")
table_1_with_affiliation["Year"] = year_as_number

# Keep papers published from 2000 through 2024, both ends included.
in_window = table_1_with_affiliation["Year"].between(2000, 2024)
filtered_df = table_1_with_affiliation[in_window]

# Persist the subset, then summarise it.
filtered_df.to_csv("filtered_table.csv", index=False)
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26004 entries, 2 to 44393
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Journal      26004 non-null  object 
 1   Title        26004 non-null  object 
 2   Authors      26004 non-null  object 
 3   Year         26004 non-null  float64
 4   Abstract     26004 non-null  object 
 5   Paper_type   26004 non-null  object 
 6   Affiliation  26004 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.6+ MB


PYTHON SCRIPT TO SCREEN FOR ELIGIBILITY

In [None]:
import pandas as pd
import re

# Reload the 2000-2024 subset from the CSV written by the filtering step.
# That file was saved with index=False, so rows are renumbered from 0 here.
filtered_df = pd.read_csv("filtered_table.csv")

# Keyword patterns for natural products, NTDs, their vectors, and treatment wording.
# Every pattern is applied case-insensitively and matches whole words only.
natural_products = r"\b(plant|marine|sea|microbe|essential oil|phytochemical|alkaloid|flavonoid|terpenoid|steroid|ocean|microorganism|bacteria|fungi|animal|algae|extract|alga)\b"
# "gambiense" replaces the misspelt "gambisense", which could never match the subspecies name.
ntds_diseases = r"\b(NTDs?|neglected tropical diseases?|Chagas|leishmaniasis|schistosomiasis|onchocerciasis|lymphatic filariasis|Buruli ulcer|dengue|dracunculiasis|trypanosomiasis|sleeping sickness|river blindness|elephantiasis|vector|sandfly|blackfly|mosquito|snail|chikungunya|echinococcosis|foodborne trematodiases|leprosy|mycetoma|mycoses|chromoblastomycosis|rabies|scabies|snakebite envenoming|snakebite|snake|venom|taeniasis|cysticercosis|trachoma|yaws|zika virus|zika|ascariasis|soil-transmitted helminthiases|helminthiases|helminth|brucellosis|brucella|ulcerans|toxoplasmosis|toxoplasma|tunga|penetrans|tungiasis|ascaris|fasciola|echinococcus|granulosus|hepatica|gigantica|ochengi|volvulus|malayi|pahangi|trichura|mansoni|haematobium|japonicum|solium|ceylanicum|toxoplasma|gondii|trypanosoma|leishmania|brucei|cruzi|congolense|rhodesiense|donovani|chagasi|mexicana|amazonensis|infantum|braziliensis|tropica|tarentolae|pifanoi|enriettii|major|gambiense)\b"
# The genus abbreviations "Ae." and "An." need a literal, escaped dot. Written as
# `Ae.|An.` inside \b(...)\b, the dot matched any character, so everyday words
# such as "and" or "any" counted as a vector mention. They sit in their own
# alternative because \b cannot match between "." and a following space.
ntds_vectors = r"\b(aedes|Culex|quinquefasciatus|Anopheles|funestus|gambiae|pharoensis|antennatus|aegypti|stephensi|vector|Biomphalaria|glabrata|pfeifferi|albopictus|Stegomyia|subpictus|vishnui|Vittatus|Oncomelania|hupensis|Lymnaea|acuminata)\b|\b(?:Ae|An)\."
treatment_terms = r"\b(treat|treated|treatment|therapy|therapeutic|efficacy|effective|inhibit|inhibited|inhibition|control|controlled|eliminate|eliminated|reduce|reduced|reduction|ameliorate|ameliorated|prevent|prevention)\b"

# Function to check for eligibility
def check_eligibility(abstract):
    """Return True when an abstract passes the keyword screen.

    An abstract is eligible when it mentions a natural-product term AND a
    treatment term AND at least one NTD disease term or vector term.

    Parameters
    ----------
    abstract : str
        Abstract text. Any non-string value (for example the float NaN that
        pandas uses for an empty CSV cell) is treated as not eligible.

    Returns
    -------
    bool
    """
    # re.search raises TypeError on non-strings; a missing abstract cannot match.
    if not isinstance(abstract, str):
        return False

    np_match = re.search(natural_products, abstract, re.IGNORECASE)
    disease_match = re.search(ntds_diseases, abstract, re.IGNORECASE)
    vector_match = re.search(ntds_vectors, abstract, re.IGNORECASE)
    treatment_match = re.search(treatment_terms, abstract, re.IGNORECASE)

    # Natural product + treatment wording + (disease or vector).
    return bool(np_match) and (bool(disease_match) or bool(vector_match)) and bool(treatment_match)

# Flag every abstract with the keyword screen and store the result in an 'Eligible' column.
eligibility_flags = filtered_df["Abstract"].apply(check_eligibility)
filtered_df = filtered_df.assign(Eligible=eligibility_flags)

# Keep only the rows that passed the screen.
eligible_publications = filtered_df.loc[filtered_df["Eligible"]]

# Print the eligible rows and write them to a new CSV file.
print(eligible_publications)
eligible_publications.to_csv("eligible_publications.csv", index=False)


                                        Journal  \
16       Current opinion in infectious diseases   
17                                    Molecules   
18     BMC Complementary Medicine and Therapies   
20                                    Molecules   
24                                    Molecules   
...                                         ...   
25978                                  PLoS ONE   
25988                          BMC Microbiology   
25992           Indian Journal of Ophthalmology   
25995           The Pan African Medical Journal   
26001                   BMC Veterinary Research   

                                                   Title  \
16     Advances in Chagas disease drug development: 2...   
17     Secondary Metabolites from Vietnamese Marine I...   
18     Therapeutic potential of Indonesian plant extr...   
20     Chemical Composition, Antimicrobial and Antipa...   
24     Bixa orellana L. (Bixaceae) and Dysphania ambr...   
...                        