In [2]:
import scholarly
import requests
import scholarly
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup


In [16]:
import newspaper
import requests
from newspaper import Article


In [32]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from scholarly import scholarly

class AcademicScraper:
    def __init__(self, search_terms=['climate change', 'global warming']):
        self.search_terms = search_terms

    def scrape_google_scholar(self, year_range=(2010, 2024)):
        publications = []
        for term in self.search_terms:
            try:
                search_query = scholarly.search_pubs(term)
                for pub in search_query:
                    pub = scholarly.fill(pub)
                    pub_year = pub.bib.get('pub_year', 0)
                    try:
                        pub_year = int(pub_year)
                    except (ValueError, TypeError):
                        pub_year = 0
                    if year_range[0] <= pub_year <= year_range[1]:
                        publications.append({
                            'title': pub.bib.get('title', ''),
                            'author': pub.bib.get('author', ''),
                            'year': pub_year,
                            'text': pub.bib.get('abstract', ''),
                            'source': 'Google Scholar'
                        })
            except Exception as e:
                print(f"Error in Google Scholar search for term {term}: {e}")
        
        return publications

    def scrape_arxiv(self, year_range=(2010, 2024)):
        base_url = 'http://export.arxiv.org/api/query?'
        publications = []
        
        for term in self.search_terms:
            query = f'search_query=all:{term}&start=0&max_results=100'
            response = requests.get(base_url + query)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'xml')
                for entry in soup.find_all('entry'):
                    published_year = int(entry.published.text[:4])
                    if year_range[0] <= published_year <= year_range[1]:
                        publications.append({
                            'title': entry.title.text,
                            'author': ', '.join([au.text for au in entry.find_all('author')]),
                            'year': published_year,
                            'text': entry.summary.text,
                            'source': 'arXiv'
                        })
        return publications

    def collect_academic_corpus(self, year_range=(2010, 2024)):
        corpus = []
        
        # Directly call scraping methods
        scholar_pubs = self.scrape_google_scholar(year_range)
        arxiv_pubs = self.scrape_arxiv(year_range)
        
        # Combine and convert to DataFrame
        corpus.extend(scholar_pubs)
        corpus.extend(arxiv_pubs)
        
        return pd.DataFrame(corpus)

class MediaScraper:
    def __init__(self, sources=None):
        self.sources = sources or {
            'news_outlets': [
                'https://www.nytimes.com/section/climate',
                'https://www.theguardian.com/environment/climate-change',
                'https://www.reuters.com/sustainability/',
                'https://www.bbc.com/news/science-environment'
            ],
            'science_media': [
                'https://www.nature.com/climate-change',
                'https://www.scientificamerican.com/environment/',
                'https://www.newscientist.com/subject/environment/'
            ]
        }
    
    def scrape_articles(self, year_range):
        # Placeholder implementation as scraping news sites requires proper parsing and rate limiting
        # This function should be implemented using requests, BeautifulSoup, or newspaper3k where applicable
        # Right now, returning an empty list as a placeholder
        return pd.DataFrame(columns=['title', 'author', 'year', 'text', 'source'])

class ClimateArticleCorpus:
    def __init__(self, year_range=(2015, 2024)):
        self.year_range = year_range
        self.academic_scraper = AcademicScraper()
        self.media_scraper = MediaScraper()
    
    def collect_corpus(self):
        # Collect academic and journalistic sources
        academic_df = self.academic_scraper.collect_academic_corpus(self.year_range)
        media_df = self.media_scraper.scrape_articles(self.year_range)
        
        # Combine and clean
        combined_df = pd.concat([academic_df, media_df], ignore_index=True)
        
        # Ensure all text columns are strings and handle NaN values
        combined_df['text'] = combined_df['text'].fillna('').astype(str)
        
        # Additional preprocessing
        combined_df['word_count'] = combined_df['text'].apply(lambda x: len(str(x).split()))
        combined_df['contains_misinformation_indicators'] = combined_df['text'].apply(
            self.detect_potential_misinformation_indicators
        )
        
        return combined_df
    
    def detect_potential_misinformation_indicators(self, text):
        # Placeholder for misinformation detection logic
        # This could use keyword matching, sentiment analysis, etc.
        return False

# Usage
corpus_collector = ClimateArticleCorpus(year_range=(2015, 2024))
climate_corpus = corpus_collector.collect_corpus()

# Save for later analysis
climate_corpus.to_csv('climate_change_corpus.csv', index=False)


Error in Google Scholar search for term climate change: 'dict' object has no attribute 'bib'
Error in Google Scholar search for term global warming: 'dict' object has no attribute 'bib'


In [33]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from scholarly import scholarly

class AcademicScraper:
    def __init__(self, search_terms=['climate change', 'global warming']):
        self.search_terms = search_terms

    def scrape_google_scholar(self, year_range=(2010, 2024)):
        publications = []
        for term in self.search_terms:
            try:
                search_query = scholarly.search_pubs(term)
                for pub in search_query:
                    # Fill the publication details
                    pub = scholarly.fill(pub)
                    if 'bib' in pub:
                        pub_bib = pub['bib']
                        pub_year = pub_bib.get('year', 0)
                        try:
                            pub_year = int(pub_year)
                        except (ValueError, TypeError):
                            pub_year = 0
                        if year_range[0] <= pub_year <= year_range[1]:
                            publications.append({
                                'title': pub_bib.get('title', ''),
                                'author': pub_bib.get('author', ''),
                                'year': pub_year,
                                'text': pub_bib.get('abstract', ''),
                                'source': 'Google Scholar'
                            })
            except Exception as e:
                print(f"Error in Google Scholar search for term {term}: {e}")
        
        return publications

    def scrape_arxiv(self, year_range=(2010, 2024)):
        base_url = 'http://export.arxiv.org/api/query?'
        publications = []
        
        for term in self.search_terms:
            query = f'search_query=all:{term}&start=0&max_results=100'
            response = requests.get(base_url + query)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'xml')
                for entry in soup.find_all('entry'):
                    published_year = int(entry.published.text[:4])
                    if year_range[0] <= published_year <= year_range[1]:
                        publications.append({
                            'title': entry.title.text,
                            'author': ', '.join([au.text for au in entry.find_all('author')]),
                            'year': published_year,
                            'text': entry.summary.text,
                            'source': 'arXiv'
                        })
        return publications

    def collect_academic_corpus(self, year_range=(2010, 2024)):
        corpus = []
        
        # Directly call scraping methods
        scholar_pubs = self.scrape_google_scholar(year_range)
        arxiv_pubs = self.scrape_arxiv(year_range)
        
        # Combine and convert to DataFrame
        corpus.extend(scholar_pubs)
        corpus.extend(arxiv_pubs)
        
        return pd.DataFrame(corpus)

class MediaScraper:
    def __init__(self, sources=None):
        self.sources = sources or {
            'news_outlets': [
                'https://www.nytimes.com/section/climate',
                'https://www.theguardian.com/environment/climate-change',
                'https://www.reuters.com/sustainability/',
                'https://www.bbc.com/news/science-environment'
            ],
            'science_media': [
                'https://www.nature.com/climate-change',
                'https://www.scientificamerican.com/environment/',
                'https://www.newscientist.com/subject/environment/'
            ]
        }
    
    def scrape_articles(self, year_range):
        # Placeholder implementation as scraping news sites requires proper parsing and rate limiting
        # This function should be implemented using requests, BeautifulSoup, or newspaper3k where applicable
        # Right now, returning an empty list as a placeholder
        return pd.DataFrame(columns=['title', 'author', 'year', 'text', 'source'])

class ClimateArticleCorpus:
    def __init__(self, year_range=(2015, 2024)):
        self.year_range = year_range
        self.academic_scraper = AcademicScraper()
        self.media_scraper = MediaScraper()
    
    def collect_corpus(self):
        # Collect academic and journalistic sources
        academic_df = self.academic_scraper.collect_academic_corpus(self.year_range)
        media_df = self.media_scraper.scrape_articles(self.year_range)
        
        # Combine and clean
        combined_df = pd.concat([academic_df, media_df], ignore_index=True)
        
        # Ensure all text columns are strings and handle NaN values
        combined_df['text'] = combined_df['text'].fillna('').astype(str)
        
        # Additional preprocessing
        combined_df['word_count'] = combined_df['text'].apply(lambda x: len(str(x).split()))
        combined_df['contains_misinformation_indicators'] = combined_df['text'].apply(
            self.detect_potential_misinformation_indicators
        )
        
        return combined_df
    
    def detect_potential_misinformation_indicators(self, text):
        # Placeholder for misinformation detection logic
        # This could use keyword matching, sentiment analysis, etc.
        return False

# Usage
corpus_collector = ClimateArticleCorpus(year_range=(2015, 2024))
climate_corpus = corpus_collector.collect_corpus()

# Save for later analysis
climate_corpus.to_csv('climate_change_corpus.csv', index=False)


  m = re.search("cites=[\d+,]*", object["citedby_url"])


KeyboardInterrupt: 

In [34]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from scholarly import scholarly
import time

class AcademicScraper:
    def __init__(self, search_terms=['climate change', 'global warming']):
        self.search_terms = search_terms

    def scrape_google_scholar(self, year_range=(2010, 2024), max_results=20):
        publications = []
        for term in self.search_terms:
            try:
                search_query = scholarly.search_pubs(term)
                count = 0
                for pub in search_query:
                    if count >= max_results:
                        break

                    # Add delay to prevent getting blocked
                    time.sleep(2)
                    
                    # Fill the publication details
                    pub = scholarly.fill(pub)
                    if 'bib' in pub:
                        pub_bib = pub['bib']
                        pub_year = pub_bib.get('year', 0)
                        try:
                            pub_year = int(pub_year)
                        except (ValueError, TypeError):
                            pub_year = 0
                        if year_range[0] <= pub_year <= year_range[1]:
                            publications.append({
                                'title': pub_bib.get('title', ''),
                                'author': pub_bib.get('author', ''),
                                'year': pub_year,
                                'text': pub_bib.get('abstract', ''),
                                'source': 'Google Scholar'
                            })
                    count += 1
            except Exception as e:
                print(f"Error in Google Scholar search for term {term}: {e}")
        
        return publications

    def scrape_arxiv(self, year_range=(2010, 2024)):
        base_url = 'http://export.arxiv.org/api/query?'
        publications = []
        
        for term in self.search_terms:
            query = f'search_query=all:{term}&start=0&max_results=100'
            response = requests.get(base_url + query)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'xml')
                for entry in soup.find_all('entry'):
                    published_year = int(entry.published.text[:4])
                    if year_range[0] <= published_year <= year_range[1]:
                        publications.append({
                            'title': entry.title.text,
                            'author': ', '.join([au.text for au in entry.find_all('author')]),
                            'year': published_year,
                            'text': entry.summary.text,
                            'source': 'arXiv'
                        })
        return publications

    def collect_academic_corpus(self, year_range=(2010, 2024)):
        corpus = []
        
        # Directly call scraping methods
        scholar_pubs = self.scrape_google_scholar(year_range)
        arxiv_pubs = self.scrape_arxiv(year_range)
        
        # Combine and convert to DataFrame
        corpus.extend(scholar_pubs)
        corpus.extend(arxiv_pubs)
        
        return pd.DataFrame(corpus)

# Usage
corpus_collector = ClimateArticleCorpus(year_range=(2015, 2024))
climate_corpus = corpus_collector.collect_corpus()

# Save for later analysis
climate_corpus.to_csv('climate_change_corpus.csv', index=False)


KeyboardInterrupt: 

In [38]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

class MediaScraper:
    def __init__(self, sources=None):
        self.sources = sources or {
            'news_outlets': [
                'https://www.nytimes.com/section/climate',
                'https://www.theguardian.com/environment/climate-change',
                'https://www.reuters.com/sustainability/',
                'https://www.bbc.com/news/science-environment'
            ],
            'science_media': [
                'https://www.nature.com/climate-change',
                'https://www.scientificamerican.com/environment/',
                'https://www.newscientist.com/subject/environment/'
            ]
        }
    
    def scrape_articles(self, year_range):
        articles = []
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        
        for category, urls in self.sources.items():
            for url in urls:
                try:
                    response = requests.get(url, headers=headers)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.content, 'html.parser')
                        articles_list = soup.find_all('article')
                        
                        for article in articles_list:
                            title_tag = article.find(['h1', 'h2', 'h3'])
                            title = title_tag.text.strip() if title_tag else 'No Title'
                            author_tag = article.find('span', class_='byline')
                            author = author_tag.text.strip() if author_tag else 'Unknown'
                            year = 2024  # Placeholder year, this should be extracted based on article metadata if available
                            text_tag = article.find('p')
                            text = text_tag.text.strip() if text_tag else 'No Content'
                            
                            # Append article details if within the year range
                            if year_range[0] <= year <= year_range[1]:
                                articles.append({
                                    'title': title,
                                    'author': author,
                                    'year': year,
                                    'text': text,
                                    'source': url
                                })
                    
                    # Add a delay to prevent overwhelming the server
                    time.sleep(1)
                except Exception as e:
                    print(f"Error scraping {url}: {e}")
        
        return pd.DataFrame(articles)

# Usage
media_scraper = MediaScraper()
media_articles = media_scraper.scrape_articles(year_range=(2015, 2024))

# Save for later analysis
media_articles.to_csv('media_articles.csv', index=False)


In [41]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

class MediaScraper:
    def __init__(self, sources=None):
        self.sources = sources or {
            'news_outlets': [
                'https://www.nytimes.com/section/climate',
                'https://www.theguardian.com/environment/climate-change',
                'https://www.reuters.com/sustainability/',
                'https://www.bbc.com/news/science-environment'
            ],
            'science_media': [
                'https://www.nature.com/climate-change',
                'https://www.scientificamerican.com/environment/',
                'https://www.newscientist.com/subject/environment/'
            ]
        }
        self.scraped_urls = set()  # Track scraped URLs to ensure uniqueness
    
    def scrape_articles(self, year_range, max_articles_per_source=50):
        articles = []
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        
        for category, urls in self.sources.items():
            for url in urls:
                try:
                    response = requests.get(url, headers=headers)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.content, 'html.parser')
                        articles_list = soup.find_all('article')
                        count = 0
                        
                        for article in articles_list:
                            if count >= max_articles_per_source:
                                break

                            link_tag = article.find('a', href=True)
                            if link_tag:
                                article_url = urljoin(url, link_tag['href'])
                                if article_url in self.scraped_urls:
                                    continue

                                self.scraped_urls.add(article_url)
                                article_response = requests.get(article_url, headers=headers)
                                if article_response.status_code == 200:
                                    article_soup = BeautifulSoup(article_response.content, 'html.parser')
                                    
                                    title_tag = article_soup.find(['h1', 'h2', 'h3'])
                                    title = title_tag.text.strip() if title_tag else 'No Title'
                                    
                                    author_tag = article_soup.find('span', class_='byline') or article_soup.find('meta', {'name': 'author'})
                                    author = author_tag['content'].strip() if author_tag and author_tag.has_attr('content') else (author_tag.text.strip() if author_tag else 'Unknown')
                                    
                                    date_tag = article_soup.find('time') or article_soup.find('meta', {'property': 'article:published_time'})
                                    year = 2024  # Default placeholder year
                                    if date_tag:
                                        if date_tag.has_attr('datetime'):
                                            year = int(date_tag['datetime'][:4])
                                        elif date_tag.text.strip():
                                            year = int(date_tag.text.strip()[-4:])
                                    
                                    # Append article details if within the year range
                                    if year_range[0] <= year <= year_range[1]:
                                        articles.append({
                                            'title': title,
                                            'author': author,
                                            'year': year,
                                            'source': article_url,
                                            'category': category
                                        })
                                        count += 1
                    
                    # Add a delay to prevent overwhelming the server
                    time.sleep(1)
                except Exception as e:
                    print(f"Error scraping {url}: {e}")
        
        return pd.DataFrame(articles)

# Usage
media_scraper = MediaScraper()
media_articles = media_scraper.scrape_articles(year_range=(2015, 2024))

# Save for later analysis
media_articles.to_csv('media_articles.csv', index=False)


In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import numpy as np

class AcademicScraper:
    def __init__(self, search_terms=['climate change', 'global warming']):
        self.search_terms = search_terms

    def scrape_arxiv(self, year_range=(1991, 2024), max_results=100):
        base_url = 'http://export.arxiv.org/api/query?'
        publications = []
        seen_titles = set()  # Track seen titles to avoid duplicates
        
        for term in self.search_terms:
            start = 0
            while True:
                query = f'search_query=all:{term}&start={start}&max_results={max_results}'
                try:
                    response = requests.get(base_url + query, timeout=10)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.content, 'xml')
                        entries = soup.find_all('entry')
                        if not entries:
                            break
                        
                        for entry in entries:
                            title = entry.title.text.strip()
                            if title in seen_titles:
                                continue
                            
                            seen_titles.add(title)
                            published_year = int(entry.published.text[:4])
                            if year_range[0] <= published_year <= year_range[1]:
                                publications.append({
                                    'title': title,
                                    'author': ', '.join([au.text for au in entry.find_all('author')]),
                                    'year': published_year,
                                    'text': entry.summary.text,
                                    'source': 'arXiv'
                                })
                        
                        # Move to the next batch of results
                        start += max_results
                    else:
                        break
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching arXiv data for term {term}: {e}")
                    break  # Stop further retries if there's an error
        return publications

    def collect_academic_corpus(self, year_range=(1991, 2024)):
        corpus = []
        
        # Directly call scraping methods
        arxiv_pubs = self.scrape_arxiv(year_range)
        
        # Convert to DataFrame
        corpus.extend(arxiv_pubs)
        
        return pd.DataFrame(corpus)

# Usage
academic_scraper = AcademicScraper()
academic_corpus = academic_scraper.collect_academic_corpus(year_range=(1991, 2024))

# Save for later analysis
academic_corpus.to_csv('academic_corpus.csv', index=False)
