In [6]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from urllib.parse import urlparse, parse_qs

In [7]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt_tab')

def preprocess_text(text, term):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text
    words = word_tokenize(text, "english")
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.update({term.lower(), "htmlhtml", "pdfpdf", "bookb", "citationc"})
    words = [word for word in words if word not in stop_words]
    documents = ' '.join(words)
    return documents

def text_to_vector(df, term):
    # Preprocess the text data
    documents = df['title'].values

    # Apply the preprocessing function to each document
    documents = [preprocess_text(doc, term) for doc in documents]

    # If preprocess_text returns a list of tokens, you might want to join them back into a single string
    documents = [' '.join(doc) if isinstance(doc, list) else doc for doc in documents]
    
    # Convert the text data to TF-IDF vectors
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    # Apply K-Means clustering
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters, random_state=42)
    km.fit(tfidf_matrix)

    # Assign each document to a cluster
    clusters = km.labels_.tolist()
    return clusters

def remove_first_two_brackets(text):
    if pd.isna(text):
        return None
    
    # Use regex to substitute the first two square brackets and their content
    result = re.sub(r'^\[.*?\]\s*(?:\[.*?\]\s*)?', '', str(text))
    return result.strip()

PUBLISHER_MAP = {
    # Academic Publishers & Journals
    "books.google.com": "Google Books",
    "taylorfrancis.com": "Taylor & Francis",
    "api.taylorfrancis.com": "Taylor & Francis",
    "tandfonline.com": "Taylor & Francis Online",
    "link.springer.com": "Springer",
    "nature.com": "Nature Publishing Group",
    "apps.who.int": "World Health Organization IRIS",
    "psycnet.apa.org": "APA PsycNet",
    "thelancet.com": "The Lancet",
    "nejm.org": "New England Journal of Medicine",
    "mdpi.com": "MDPI (Multidisciplinary Digital Publishing Institute)",
    "emerald.com": "Emerald Publishing",
    "sciencedirect.com": "ScienceDirect (Elsevier)",
    "academia.edu": "Academia.edu",
    "cambridge.org": "Cambridge University Press",
    "annualreviews.org": "Annual Reviews",
    "frontiersin.org": "Frontiers",
    "jamanetwork.com": "JAMA Network",
    "scielo.br": "SciELO Brazil",
    "academic.oup.com": "Oxford University Press",
    "ncbi.nlm.nih.gov": "National Center for Biotechnology Information",
    "pmc.ncbi.nlm.nih.gov": "PubMed Central",
    "degruyter.com": "De Gruyter",
    "core.ac.uk": "CORE (COnnecting REpositories)",
    "acpjournals.org": "American College of Physicians Journals",
    "brill.com": "Brill Publishers",
    "onlinelibrary.wiley.com": "Wiley Online Library",
    "journals.plos.org": "PLOS (Public Library of Science)",
    "oecd-ilibrary.org": "OECD iLibrary",
    "science.org": "Science/AAAS",

    # Medical & Health Sciences
    "bmj.com": "British Medical Journal",
    "bjsm.bmj.com": "British Journal of Sports Medicine",
    "bmjopen.bmj.com": "BMJ Open",
    "gh.bmj.com": "BMJ Global Health",
    "jech.bmj.com": "Journal of Epidemiology & Community Health",
    "ahajournals.org": "American Heart Association Journals",
    "journals.lww.com": "Lippincott Williams & Wilkins",
    "cell.com": "Cell Press",
    "karger.com": "Karger Publishers",
    "cochranelibrary.com": "Cochrane Library",

    # Chemistry & Science
    "pubs.acs.org": "American Chemical Society Publications",
    "pubs.rsc.org": "Royal Society of Chemistry",
    "iopscience.iop.org": "IOP Science",

    # Research & Academic Repositories
    "researchgate.net": "ResearchGate",
    "arxiv.org": "arXiv",
    "jstor.org": "JSTOR",
    "philpapers.org": "PhilPapers",
    "papers.ssrn.com": "SSRN (Social Science Research Network)",

    # Professional Organizations
    "publications.aap.org": "American Academy of Pediatrics",
    "census.gov": "United States Census Bureau",
    "bea.gov": "Bureau of Economic Analysis",

    # University & Educational
    "muse.jhu.edu": "Project MUSE (Johns Hopkins University Press)",
    "stern.nyu.edu": "NYU Stern School of Business",

    # Technology & Computing
    "dl.acm.org": "ACM Digital Library",
    "ieeexplore.ieee.org": "IEEE Xplore",

    # Economics & Business
    "aeaweb.org": "American Economic Association",
    "nber.org": "National Bureau of Economic Research",
    "elgaronline.com": "Edward Elgar Publishing",
    "pubsonline.informs.org": "INFORMS PubsOnLine",

    # Open Access & Repositories
    "gatesopenresearch.org": "Gates Open Research",
    "eric.ed.gov": "ERIC (Education Resources Information Center)",

    # Scientific Societies & Organizations
    "royalsocietypublishing.org": "Royal Society Publishing",
    "embopress.org": "EMBO Press",
    "journals.physiology.org": "American Physiological Society",
    "journals.asm.org": "American Society for Microbiology",
    "ascopubs.org": "American Society of Clinical Oncology",
    "pnas.org": "Proceedings of the National Academy of Sciences",

    # Other Notable Publishers
    "heinonline.org": "HeinOnline",
    "mckinsey.com": "McKinsey & Company",
    "scirp.org": "Scientific Research Publishing",
    "atlantis-press.com": "Atlantis Press",

    # Major Academic Publishers & Platforms
    "journals.sagepub.com": "SAGE Publications",
    "ssphplus.ch": "Swiss School of Public Health",
    "aap.onlinelibrary.wiley.com": "American Academy of Pediatrics via Wiley",
    "rupress.org": "Rockefeller University Press",
    "ingentaconnect.com": "Ingenta Connect",
    "ashpublications.org": "American Society of Hematology",
    "content.iospress.com": "IOS Press",
    "journals.uchicago.edu": "University of Chicago Press",
    "nowpublishers.com": "Now Publishers",
    "bristoluniversitypressdigital.com": "Bristol University Press",
    "direct.mit.edu": "MIT Press",

    # Wiley Family of Journals
    "analyticalsciencejournals.onlinelibrary.wiley.com": "Wiley Analytical Science",
    "nph.onlinelibrary.wiley.com": "New Phytologist (Wiley)",
    "esajournals.onlinelibrary.wiley.com": "Ecological Society of America Journals",
    "aspenjournals.onlinelibrary.wiley.com": "Aspen Publishers via Wiley",
    "afspubs.onlinelibrary.wiley.com": "American Fisheries Society via Wiley",
    "wires.onlinelibrary.wiley.com": "Wiley Interdisciplinary Reviews",

    # Cold Spring Harbor Laboratory Press
    "perspectivesinmedicine.cshlp.org": "Cold Spring Harbor Perspectives in Medicine",
    "cshperspectives.cshlp.org": "Cold Spring Harbor Perspectives",
    "genesdev.cshlp.org": "Genes & Development",
    "lifescied.org": "Life Science Education",

    # Institutional Repositories
    "thuvienso.hoasen.edu.vn": "Hoa Sen University Digital Library",
    "bemidjistate.edu": "Bemidji State University",
    "ir.library.oregonstate.edu": "Oregon State University Library",
    "repository.monashhealth.org": "Monash Health Repository",
    "kiu.ac.ug": "Kampala International University",
    "ueaeprints.uea.ac.uk": "University of East Anglia Repository",
    "openresearch.surrey.ac.uk": "University of Surrey Repository",
    "ocf.berkeley.edu": "Open Computing Facility, UC Berkeley",

    # Research Organizations & Databases
    "cabidigitallibrary.org": "CABI Digital Library",
    "library.oapen.org": "OAPEN Library",
    "openknowledge.fao.org": "Food and Agriculture Organization",
    "stacks.cdc.gov": "Centers for Disease Control and Prevention",
    "sidalc.net": "Agricultural Information and Documentation Service of the Americas",
    "agro.icm.edu.pl": "ICM Agro Repository",
    "jstage.jst.go.jp": "J-STAGE (Japan Science and Technology Agency)",
    "hrcak.srce.hr": "HRČAK Portal of Croatian Scientific Journals",
    "sciendo.com": "Sciendo (De Gruyter)",
    "dialnet.unirioja.es": "Dialnet (University of La Rioja)",
    "cyberleninka.ru": "CyberLeninka",
    "elibrary.ru": "Russian Scientific Electronic Library",

    # Professional Organizations & Societies
    "laacha.org": "Latin American Association of Clinical Hepatology",
    "clinicalnutritionjournal.com": "ESPEN (European Society for Clinical Nutrition)",
    "suerf.org": "SUERF - The European Money and Finance Forum",
    "plato.stanford.edu": "Stanford Encyclopedia of Philosophy",
    "wol.iza.org": "IZA World of Labor",

    # Research Networks & Platforms
    "ideas.repec.org": "RePEc (Research Papers in Economics)",
    "aisel.aisnet.org": "AIS Electronic Library",
    "works.hcommons.org": "Humanities Commons",
    "neliti.com": "Neliti (Indonesian Research Repository)",

    # Conference Proceedings
    "bio-conferences.org": "BIO Conferences",
    "matec-conferences.org": "MATEC Web of Conferences",
    "econferences.ru": "Russian Conferences Portal",

    # Educational Platforms
    "rangercollege.simplesyllabus.com": "Ranger College",
    "alamancecc.simplesyllabus.com": "Alamance Community College",

    # Regional/National Journals
    "journal.madonnauniversity.edu.ng": "Madonna University Nigeria",
    "journal.uii.ac.id": "Universitas Islam Indonesia",
    "journals.pu.edu.pk": "University of the Punjab",
    "journals.aerc.edu.pk": "Applied Economics Research Centre",
    "journal.upy.ac.id": "Universitas PGRI Yogyakarta",
    "journal.trunojoyo.ac.id": "University of Trunojoyo Madura",

    # Other Notable Resources
    "elibrary.sugarresearch.com.au": "Sugar Research Australia",
    "torrossa.com": "Torrossa Digital Library",
    "journals.rtu.lv": "Riga Technical University",
    "indianjournals.com": "Indian Journals",
    "fin-izdat.com": "Publishing House FINANCE and CREDIT",
    "voced.edu.au": "VOCEDplus",

    "ngfrepository.org.ng:8443": "Nigeria Governors' Forum",
    "ejournal.almaata.ac.id": "Ejournal Alma Ata Yogyakarta",
    "publishoa.com": "JOURNAL OF ALGEBRAIC STATISTICS",
    "fifteentwentyusa.com": "Fifteen Twenty",
    "rsm.nl": "Rotterdam School of Management, Erasmus University",
    "applied-financial-mathematics.de": "Humboldt-Universität zu Berlin",
    "herald.kokanduni.uz": "Qo‘qon universiteti tomonidan qo‘llab quvvatlanadi",
    "jurnal.peradabanpublishing.com": "Journal Directory of Peradaban Pustaka Malang",
    "ieeca.org": "Journal of Eastern European and Central Asian Research",
    "bircu-journal.com": "Budapest International Research and Critics University",
    "everycrsreport.com": "EveryCRSReport",
    "uujec.org": "Unitarian Universalists for a Just Economic Community",
    "web-journal.ru": "INTERNATIONAL SCIENTIFIC JOURNAL",
    "scienticreview.com": "Global Scientific Review",
    "conferencea.org": "Zien Journals Publishing",
    "ijournal.uz": "Journal of Academic Research and Trends in Educational Sciences ",
    "media.neliti.com": "Neliti",
    "sjird.journalspark.org": "Spectrum Journal of Innovation, Reforms and Development",
    "acdc2007.free.fr": "Autres chiffres du chômage",
    "sea-connect.com": "Sea Connect",
    "itsr.ir": "طرح آمایش سرزمین صنعتی، معدنی و تجاری",
    "ibn.idsi.md": "Instrument Bibliometric National",
    "viirj.org": "Vidyabharati International Interdisciplinary Research Journal",
}

def extract_doi(path_str):
  """
  Returns the first DOI-like string found in a path (if any).
  A basic pattern might be (10.) followed by 4-9 digits, then a slash, etc.
  Adjust the regex as necessary for your needs.
  """
  # This pattern matches something like 10.xxxx/...
  doi_match = re.search(r"(10.\d{4,9}/[\w\-\+():]+?\d+/?)", path_str, re.IGNORECASE)
  if doi_match:
    return doi_match.group(0)
  return None

def extract_extension(path_str):
  """
  Returns the file extension if the last segment of the path has one
  (e.g., 'pdf'), otherwise None.
  """
  last_segment = path_str.split('/')[-1]
  if '.' in last_segment:
    # For example, "EB89_1992-REC-1_eng.pdf?sequence=1"
    # strip query params first if they exist
    last_segment = last_segment.split('?')[0]
    extension = last_segment.split('.')[-1]
    return extension
  return None

def get_publisher(domain_str):
  """
  Maps the domain to a known publisher/host if possible.
  """
  return PUBLISHER_MAP.get(domain_str, "Unknown")

def parse_url(url):
  """
  Main parser function that returns a dict of parsed fields:
  - domain
  - publisher
  - path
  - query_params
  - doi (if found)
  - extension (if found)
  """
  parsed = urlparse(url)
  domain = parsed.netloc.replace('www.', '', 1) if parsed.netloc.startswith('www.') else parsed.netloc
  path = parsed.path
  query_dict = parse_qs(parsed.query) # dictionary of query parameters

  # Attempt to extract a DOI from the path
  doi = extract_doi(path)

  # Attempt to get a file extension (e.g., 'pdf')
  extension = extract_extension(path)

  # Look up a known publisher
  publisher = get_publisher(domain)

  return {
      'domain': domain,
      'publisher': publisher,
      'path': path,
      'query_params': query_dict,
      'doi': doi,
      'extension': extension
  }

def extract_brackets(text):
    if pd.isna(text):  # Handle NaN values
        return None
    match = re.match(r'^\[(.*?)\]', str(text))
    return match.group(1) if match else None


[nltk_data] Downloading package stopwords to /home/suna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/suna/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
%%time
topics = ["finance", "health", "nutrition", "biology", "economic_growth", "economy"]

df_merge = pd.DataFrame()

for topic in topics:
    # Read the CSV file
    df = pd.read_csv(f"../data/{topic}_google_scholar.csv")
    print(f"Processing {topic}...")
    
    # Preprocess the text
    df['clean_title'] = df['title'].apply(lambda x: preprocess_text(x, topic))
    print(f"Preprocessed titles for {topic}.")

    df['type'] = df['title'].apply(extract_brackets)
    print(f"Extracted types for {topic}.")

    df['title'] = df['title'].apply(remove_first_two_brackets)

    # Add the cluster assignments to the DataFrame
    df['cluster'] = text_to_vector(df, topic)
    print(f"Clustered titles for {topic}.")

    # Count words (splitting by whitespace)
    df['word_count'] = df['clean_title'].str.split().str.len()
    print(f"Counted words for {topic}.")

    # Count characters (excluding spaces)
    df['char_count'] = df['clean_title'].str.replace(' ', '').str.len()
    print(f"Counted characters for {topic}.")

    # Add category
    df['category'] = topic
    print(f"Added category for {topic}.")

    parsed_results = df['url'].apply(parse_url).apply(pd.Series)
    df = pd.concat([df, parsed_results], axis=1)
    print(f"Parsed URLs for {topic}.")

    # Append to the merged DataFrame
    df_merge = pd.concat([df_merge, df], ignore_index=True)
    
    # Save the processed DataFrame to a new CSV file
    df_merge.to_csv(f"../data/filtered_dataset.csv", index=False)

Processing finance...
Preprocessed titles for finance.
Extracted types for finance.
Clustered titles for finance.
Counted words for finance.
Counted characters for finance.
Added category for finance.
Parsed URLs for finance.
Processing health...
Preprocessed titles for health.
Extracted types for health.
Clustered titles for health.
Counted words for health.
Counted characters for health.
Added category for health.
Parsed URLs for health.
Processing nutrition...
Preprocessed titles for nutrition.
Extracted types for nutrition.
Clustered titles for nutrition.
Counted words for nutrition.
Counted characters for nutrition.
Added category for nutrition.
Parsed URLs for nutrition.
Processing biology...
Preprocessed titles for biology.
Extracted types for biology.
Clustered titles for biology.
Counted words for biology.
Counted characters for biology.
Added category for biology.
Parsed URLs for biology.
Processing economic_growth...
Preprocessed titles for economic_growth.
Extracted types f

In [9]:
print(f'Shape of merged DataFrame: {df_merge.shape}')
df_merge.head()

Shape of merged DataFrame: (1200, 18)


Unnamed: 0,title,authors,year,description,url,citations,clean_title,type,cluster,word_count,char_count,category,domain,publisher,path,query_params,doi,extension
0,The finance uncertainty multiplier,"I Alfaro, N Bloom, X Lin",2024.0,We show how real and financial frictions ampli...,https://www.journals.uchicago.edu/doi/full/10....,407,uncertainty multiplier,,0,2,21,finance,journals.uchicago.edu,University of Chicago Press,/doi/full/10.1086/726230,{},10.1086/726230,
1,The finance of local government,NP Hepworth,2024.0,Originally published in 1970 this book quickly...,https://books.google.com/books?hl=en&lr=&id=kF...,176,local government,BOOK,0,2,15,finance,books.google.com,Google Books,/books,"{'hl': ['en'], 'id': ['kFczEQAAQBAJ'], 'oi': [...",,
2,Mathematics for economics and finance: methods...,"M Anthony, N Biggs",2024.0,"Accessible, concise, and interactive, this boo...",https://books.google.com/books?hl=en&lr=&id=xW...,95,mathematics economics methods modelling,BOOK,0,4,36,finance,books.google.com,Google Books,/books,"{'hl': ['en'], 'id': ['xWAJEQAAQBAJ'], 'oi': [...",,
3,The technology of decentralized finance (DeFi),"R Auer, B Haslhofer, S Kitzler, P Saggese, F V...",2024.0,Decentralized Finance (DeFi) is a new financia...,https://link.springer.com/article/10.1007/s425...,91,technology decentralized defi,,3,3,27,finance,link.springer.com,Springer,/article/10.1007/s42521-023-00088-8,{},10.1007/s42521,
4,Personal finance,VL Bajtelsmit,2024.0,"Personal Finance, 3rd Edition offers essential...",https://books.google.com/books?hl=en&lr=&id=SN...,36,personal,BOOK,7,1,8,finance,books.google.com,Google Books,/books,"{'hl': ['en'], 'id': ['SNjzEAAAQBAJ'], 'oi': [...",,
