In [2]:
#!pip install pandas numpy scikit-learn gplearn gensim pymatgen

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import logging
import re
import time
from urllib.parse import quote

# Set up logging for debugging and tracking
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to search Google Scholar for papers
def search_google_scholar(query, max_results=5):
    """
    Search Google Scholar for papers matching the query.
    Returns a list of dictionaries with title, URL, and abstract (if available).
    Caveat: Google Scholar may block frequent requests; includes delay to avoid bans.
    """
    url = f"https://scholar.google.com/scholar?q={quote(query)}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        papers = []
        for result in soup.find_all('div', class_='gs_r gs_or gs_scl')[:max_results]:
            title_elem = result.find('h3', class_='gs_rt')
            title = title_elem.text.strip() if title_elem else "No title"
            link = result.find('a')['href'] if result.find('a') else None
            abstract_elem = result.find('div', class_='gs_rs')
            abstract = abstract_elem.text.strip() if abstract_elem else "No abstract"
            papers.append({'title': title, 'url': link, 'abstract': abstract, 'source': 'Google Scholar'})
        logging.info(f"Found {len(papers)} papers for query: {query}")
        time.sleep(2)  # Delay to avoid rate-limiting
        return papers
    except Exception as e:
        logging.error(f"Error searching Google Scholar for {query}: {e}")
        return []

# Function to search arXiv for papers
def search_arxiv(query, max_results=5):
    """
    Search arXiv for preprints matching the query.
    Returns a list of dictionaries with title, URL, and abstract.
    Caveat: arXiv focuses on preprints, which may lack peer review but are open-access.
    """
    url = f"http://export.arxiv.org/api/query?search_query={quote(query)}&start=0&max_results={max_results}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'xml')
        papers = []
        for entry in soup.find_all('entry'):
            title = entry.find('title').text.strip()
            link = entry.find('id').text.strip()
            abstract = entry.find('summary').text.strip()
            papers.append({'title': title, 'url': link, 'abstract': abstract, 'source': 'arXiv'})
        logging.info(f"Found {len(papers)} papers for query: {query}")
        time.sleep(1)  # Delay to avoid rate-limiting
        return papers
    except Exception as e:
        logging.error(f"Error searching arXiv for {query}: {e}")
        return []

# Function to extract bandgap data (placeholder)
def extract_bandgap_data(url, abstract):
    """
    Placeholder for extracting bandgap data from a paper's URL or abstract.
    In practice, requires full-text access or manual review of tables/supplementary info.
    Returns a dictionary with composition, bandgap, etc., with placeholders for manual entry.
    Caveat: Automated extraction from full texts is complex; manual review often needed.
    """
    # Attempt to find bandgap values in abstract (simplified regex example)
    bandgap_match = re.search(r'band\s?gap.*?(\d+\.\d+)\s*eV', abstract, re.IGNORECASE)
    bandgap = float(bandgap_match.group(1)) if bandgap_match else None
    return {
        'composition': 'Unknown',  # Requires full-text parsing or manual entry
        'bandgap_eV': bandgap,
        'source': 'Unknown',
        'phase': 'Unknown',
        'pseudohalide_type': 'Unknown',
        'method': 'Unknown'
    }

# Main data collection function
def collect_bandgap_data():
    """
    Collect bandgap data for pseudohalide perovskites from Google Scholar and arXiv.
    Queries are tailored to common pseudohalides (SCN⁻, HCOO⁻, OCN⁻, BF4⁻, PF6⁻).
    Saves results to a CSV file for further manual curation.
    """
    queries = [
        'thiocyanate perovskite bandgap',
        'formate perovskite bandgap',
        'cyanate perovskite bandgap',
        'tetrafluoroborate perovskite bandgap',
        'hexafluorophosphate perovskite bandgap',
        'FASnI(SCN)2 bandgap',
        'FAPbI3 formate bandgap'
    ]
    
    data = []
    for query in queries:
        # Search Google Scholar
        scholar_papers = search_google_scholar(query)
        for paper in scholar_papers:
            entry = extract_bandgap_data(paper['url'], paper['abstract'])
            entry.update({
                'paper_title': paper['title'],
                'paper_url': paper['url'],
                'abstract': paper['abstract'],
                'data_source': paper['source']
            })
            data.append(entry)
        
        # Search arXiv
        arxiv_papers = search_arxiv(query)
        for paper in arxiv_papers:
            entry = extract_bandgap_data(paper['url'], paper['abstract'])
            entry.update({
                'paper_title': paper['title'],
                'paper_url': paper['url'],
                'abstract': paper['abstract'],
                'data_source': paper['source']
            })
            data.append(entry)
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(data)
    output_file = 'pseudohalide_perovskite_bandgaps.csv'
    df.to_csv(output_file, index=False)
    logging.info(f"Saved {len(df)} entries to {output_file}")
    return df

# Function to guide manual data extraction
def guide_manual_extraction(df):
    """
    Print guidance for manually extracting bandgap data from papers.
    Most papers require full-text access to find compositions and bandgaps.
    """
    logging.info("Manual data extraction guidance:")
    for idx, row in df.iterrows():
        if row['composition'] == 'Unknown' or row['bandgap_eV'] is None:
            logging.info(f"Paper: {row['paper_title']}")
            logging.info(f"URL: {row['paper_url']}")
            logging.info("Action: Access the paper, check results or supplementary info for composition, bandgap, phase, pseudohalide type, and method (experimental/DFT).")
            logging.info("---")

if __name__ == "__main__":
    # Collect data
    df = collect_bandgap_data()
    
    # Print DataFrame
    print("\nCollected Data:")
    print(df[['paper_title', 'bandgap_eV', 'composition', 'data_source']])
    
    # Provide guidance for manual extraction
    guide_manual_extraction(df)
    
    # Log next steps
    logging.info("Next steps: Manually review papers to populate 'composition', 'bandgap_eV', 'source', 'phase', 'pseudohalide_type', and 'method' columns.")
    logging.info("Consider using Elsevier/PubMed APIs or institutional access for paywalled papers.")
    logging.info("For DFT supplementation, use pymatgen with VASP/Quantum ESPRESSO to compute bandgaps for new compositions.")

2025-11-26 14:17:17,680 - INFO - Found 0 papers for query: thiocyanate perovskite bandgap
2025-11-26 14:17:21,874 - INFO - Found 5 papers for query: thiocyanate perovskite bandgap
2025-11-26 14:17:23,166 - INFO - Found 0 papers for query: formate perovskite bandgap
2025-11-26 14:17:25,911 - INFO - Found 5 papers for query: formate perovskite bandgap
2025-11-26 14:17:27,233 - INFO - Found 0 papers for query: cyanate perovskite bandgap
2025-11-26 14:17:29,862 - INFO - Found 5 papers for query: cyanate perovskite bandgap
2025-11-26 14:17:31,253 - INFO - Found 0 papers for query: tetrafluoroborate perovskite bandgap
2025-11-26 14:17:33,861 - INFO - Found 5 papers for query: tetrafluoroborate perovskite bandgap
2025-11-26 14:17:35,255 - INFO - Found 0 papers for query: hexafluorophosphate perovskite bandgap
2025-11-26 14:17:37,827 - INFO - Found 5 papers for query: hexafluorophosphate perovskite bandgap
2025-11-26 14:17:39,119 - INFO - Found 0 papers for query: FASnI(SCN)2 bandgap
2025-11-2


Collected Data:
                                          paper_title  bandgap_eV composition  \
0   Accurate first-principle bandgap predictions i...         NaN     Unknown   
1   Perovskite-perovskite tandem photovoltaics wit...        1.20     Unknown   
2   Revealing unusual bandgap shifts with temperat...         NaN     Unknown   
3   Aziridinium lead iodide: a stable, low bandgap...         NaN     Unknown   
4   Efficient all-perovskite tandem solar cells by...        1.76     Unknown   
5   Accurate first-principle bandgap predictions i...         NaN     Unknown   
6   Perovskite-perovskite tandem photovoltaics wit...        1.20     Unknown   
7   Revealing unusual bandgap shifts with temperat...         NaN     Unknown   
8   Aziridinium lead iodide: a stable, low bandgap...         NaN     Unknown   
9   Efficient all-perovskite tandem solar cells by...        1.76     Unknown   
10  Accurate first-principle bandgap predictions i...         NaN     Unknown   
11  Perovsk