### This is the code for the web scraping functions using pygooglenews and other packages to scrape Google News according to a matrix of factors including author name, key words found in the titles of the peer-reviewed publications, DOI's and peer-reviewed journal names. 

### Import pygooglenews

In [1]:
from pygooglenews import GoogleNews
import pandas as pd
import numpy as np

gn = GoogleNews() # now global mode? can specify country = US

### Import .csv file 

In [2]:
df = pd.read_csv('UseCase1_Data.csv') # must be in same repository that the jupyter notebook is in


In [3]:
df

Unnamed: 0,Author_Name,Article_Title,Article_DOI
0,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203
1,Jonathan Ospina-Betancurt,The End of Compulsory Gender Verification: Is ...,10.1007/s10508-021-02073-x
2,Katrina Karkazis,The misuses of “biological sex”,10.1016/S0140-6736(19)32764-3
3,Vanessa Heggie,"Testing sex and gender in sports; reinventing,...",10.1016/j.endeavour.2010.09.005
4,April Vannini,"Girl, Interrupted: Interpreting Semenya’s Body...",10.1177/1532708611409536
5,Heather Sykes,Transsexual and Transgender Policies in Sport,10.1123/wspaj.15.1.3
6,Stephane Bermon,Are the New Policies on Hyperandrogenism in El...,10.1080/15265161.2013.776129
7,Francisco Sanchez,The New Policy on Hyperandrogenism in Elite Fe...,10.1080/00224499.2012.752429
8,Ruth Wood,Testosterone and sport: current perspectives,10.1016/j.yhbeh.2011.09.010
9,Emma Hilton,Transgender women in the female category of sp...,10.1007/s40279-020-01389-3


### Function for scraping by author name only

In [11]:
# search = gn.search(‘author name’) #search by author name
def scraping_author(df):
    '''This function scrapes google news for anything matching the 
    scholarly author name in df'''
    stories = []
    for i in range(len(df)):
        search = gn.search(df.iloc[i].loc['Author_Name'])
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'Author_Name': df.iloc[i].loc['Author_Name'],
                'title': item.title,
                'link': item.link
            }
            
            stories.append(story)    
    return stories
scraping_author_df = pd.DataFrame(scraping_author(df)) # this puts the output in a long form dataframe

scraping_author_df  

Unnamed: 0,Author_Name,title,link
0,Laurel Westbrook,Thorpe questions FINA's trans swimming ban: Th...,https://news.google.com/__i/rss/rd/articles/CB...
1,Laurel Westbrook,Transgender Legal Battles: A Timeline - JSTOR ...,https://news.google.com/__i/rss/rd/articles/CB...
2,Laurel Westbrook,Real Estate Transactions for Nov. 24 - Zip06.com,https://news.google.com/__i/rss/rd/articles/CB...
3,Laurel Westbrook,"St. Tammany property transfers, Oct.18-24, 202...",https://news.google.com/__i/rss/rd/articles/CB...
4,Laurel Westbrook,Russell Westbrook Bought a $37 Million House -...,https://news.google.com/__i/rss/rd/articles/CB...
...,...,...,...
883,Carole Hooven,Are There Health Benefits to Feeling Emotions?...,https://news.google.com/__i/rss/rd/articles/CB...
884,Carole Hooven,15 best science and environment books 2021 | S...,https://news.google.com/__i/rss/rd/articles/CB...
885,Carole Hooven,Study debunks one of the most damaging myths a...,https://news.google.com/__i/rss/rd/articles/CB...
886,Carole Hooven,Biological differences give men unfair advanta...,https://news.google.com/__i/rss/rd/articles/CB...


### Function for scraping by DOI

In [15]:
# search = gn.search(‘Article_DOI’) #search by reference to article DOI
def scraping_doi(df):
    '''This function scrapes google news for anything matching the 
    scholarly DOI in df'''
    stories = []
    for i in range(len(df)):
        search = gn.search(df.iloc[i].loc['Article_DOI'])
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'Article_DOI': df.iloc[i].loc['Article_DOI'],
                'title': item.title,
                'link': item.link
            }
            
            stories.append(story)    
    return stories
scraping_doi_df = pd.DataFrame(scraping_doi(df)) # this puts the output in a long form dataframe

scraping_doi_df 

TypeError: quote_from_bytes() expected bytes

### Function for scraping by exact article title 

In [9]:
# search = gn.search(‘Article_Title’) #search by reference to peer reviewed article title
def scraping_title(df):
    stories = []
    for i in range(len(df)):
        search = gn.search(df.iloc[i].loc['Article_Title'])
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'Article_Title': df.iloc[i].loc['Article_Title'],
                'title': item.title,
                'link': item.link
            }
            
            stories.append(story)    
    return stories
scraping_title_df = pd.DataFrame(scraping_title(df)) # this puts the output in a long form dataframe

scraping_title_df

TypeError: quote_from_bytes() expected bytes

### Use keywords in peer-reviewed article titles to reduce irrelevant return items from the scraping function

In [None]:
# this will use code from the keywords search above, tbd 

# get rid of punctuation

# import string

# article_titles = df['Article_Title'].to_string()

# no_punc = article_titles.translate(str.maketrans('', '', string.punctuation))

# print(no_punc)
# print(type(no_punc))

article_titles = df['Article_Title'].to_list() # make article titles a list, must have for following chunk of code to work
print('article_titles: ', article_titles)
print(type(article_titles))


from collections import Counter

stops = {'this', 'that', 'a', 'is', "and", "Determining", "Doing", "the", "People", "of"}    
words = article_titles[0].split()
count = Counter(word for word in words if word not in stops).most_common(10) # we may want to figure out a way to remove all punctuation from titles first

print(count)