### This is the code for the web scraping functions using pygooglenews and other packages to scrape Google News according to a matrix of factors including author name, key words found in the titles of the peer-reviewed publications, DOI's and peer-reviewed journal names. 

### Import pygooglenews

In [49]:
import nltk
from nltk.corpus import stopwords
import re
import string
from collections import Counter

from pygooglenews import GoogleNews
import pandas as pd
import numpy as np

gn = GoogleNews() # now global mode? can specify country = US

In [35]:
import warnings
warnings.filterwarnings('ignore')

### Import .csv file 

In [50]:
df = pd.read_csv('UseCase1_Data.csv') # must be in same repository that the jupyter notebook is in


In [51]:
df
# To do:
# Value error checking
# Write unit tests
# Write documentation for new code
# Update documentation for changes
# Linking frontend and backend stuff
# Repeat documention stuff for other use cases
# Update design docs
# Update package specs in environment
# Continuous integration (WTF LEARN THIS)
# Delete one use case from documentation and stick with only two use cases (Delaney doing 12/4/22)

Unnamed: 0,Author_Name,Article_Title,Article_DOI
0,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203
1,Jonathan Ospina-Betancurt,The End of Compulsory Gender Verification: Is ...,10.1007/s10508-021-02073-x
2,Katrina Karkazis,The misuses of “biological sex”,10.1016/S0140-6736(19)32764-3
3,Vanessa Heggie,"Testing sex and gender in sports; reinventing,...",10.1016/j.endeavour.2010.09.005
4,April Vannini,"Girl, Interrupted: Interpreting Semenya’s Body...",10.1177/1532708611409536
5,Heather Sykes,Transsexual and Transgender Policies in Sport,10.1123/wspaj.15.1.3
6,Stephane Bermon,Are the New Policies on Hyperandrogenism in El...,10.1080/15265161.2013.776129
7,Francisco Sanchez,The New Policy on Hyperandrogenism in Elite Fe...,10.1080/00224499.2012.752429
8,Ruth Wood,Testosterone and sport: current perspectives,10.1016/j.yhbeh.2011.09.010
9,Emma Hilton,Transgender women in the female category of sp...,10.1007/s40279-020-01389-3


In [52]:
df.drop(df.tail(1).index,inplace=True) # drop last n rows because google news functions 
#won't work with NaN or NA values, we need to add an ifelse/judgement to each function. We only can search for strings
df

Unnamed: 0,Author_Name,Article_Title,Article_DOI
0,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203
1,Jonathan Ospina-Betancurt,The End of Compulsory Gender Verification: Is ...,10.1007/s10508-021-02073-x
2,Katrina Karkazis,The misuses of “biological sex”,10.1016/S0140-6736(19)32764-3
3,Vanessa Heggie,"Testing sex and gender in sports; reinventing,...",10.1016/j.endeavour.2010.09.005
4,April Vannini,"Girl, Interrupted: Interpreting Semenya’s Body...",10.1177/1532708611409536
5,Heather Sykes,Transsexual and Transgender Policies in Sport,10.1123/wspaj.15.1.3
6,Stephane Bermon,Are the New Policies on Hyperandrogenism in El...,10.1080/15265161.2013.776129
7,Francisco Sanchez,The New Policy on Hyperandrogenism in Elite Fe...,10.1080/00224499.2012.752429
8,Ruth Wood,Testosterone and sport: current perspectives,10.1016/j.yhbeh.2011.09.010
9,Emma Hilton,Transgender women in the female category of sp...,10.1007/s40279-020-01389-3


### Use keywords in peer-reviewed article titles to reduce irrelevant return items from the scraping function

In [53]:
def key_words(df):
    article_titles = df['Article_Title'].to_list()

    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    punct = set(string.punctuation)
    
    for i in range(len(article_titles)):
        # lowercasing the text
        article_titles[i] = article_titles[i].lower()
        # remove stopwords
        article_titles[i] = " ".join([word for word in article_titles[i].split() if word not in stop_words])
        # remove unicode
        text = article_titles[i].encode(encoding='ascii', errors='ignore').decode()
        article_titles[i] = " ".join([word for word in text.split()])
        # remove market ticker and hashtag
        article_titles[i] = re.sub('\$', '', article_titles[i])
        article_titles[i] = re.sub('\#', '', article_titles[i])
        # remove punctuation
        article_titles[i] = "".join([ch for ch in article_titles[i] if ch not in punct])
        
    res = ' '.join(article_titles)
    res = res.split(' ')
    return Counter(res).most_common(10)

keyword_list = keywords(df)
keyword_list = (list(zip(*keyword_list))[0])
print(keyword_list)
#keywords(df)

('gender', 'sport', 'elite', 'sex', 'female', 'transgender', 'verification', 'sports', 'testing', 'policies')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Function for scraping by author name only

In [54]:
# search = gn.search(‘author name’) #search by author name
def scraping_author(df):
    """This function scrapes google news for anything matching the 
    scholarly author name in df"""
    stories = []
    for i in range(len(df)):
        search = gn.search(df.iloc[i].loc['Author_Name'])
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'Author_Name': df.iloc[i].loc['Author_Name'],
                'Article_Title': df.iloc[i].loc['Article_Title'],
                'Article_DOI': df.iloc[i].loc['Article_DOI'],
                'news_title': item.title, #change to 'news_title' for clarity
                'news_link': item.link
            }
            
            stories.append(story)    
    return stories
scraping_author_df = pd.DataFrame(scraping_author(df))# this puts the output in a long form dataframe/
#scraping_author_df 
key_count = [0]*len(scraping_author_df)
for i in range(len(scraping_author_df)):
    for j in range(len(keyword_list)):
        #print(keyword_list[j])
        if keyword_list[j] in scraping_author_df.iloc[i].loc['news_title']:
            key_count[i] = key_count[i]+1

scraping_author_df['key_count'] = key_count
        

In [42]:
print(scraping_author_df)

          Author_Name                                      Article_Title  \
0    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
1    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
2    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
3    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
4    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
..                ...                                                ...   
843       John Devine            Gender, Steroids, and Fairness in Sport   
844       John Devine            Gender, Steroids, and Fairness in Sport   
845       John Devine            Gender, Steroids, and Fairness in Sport   
846       John Devine            Gender, Steroids, and Fairness in Sport   
847       John Devine            Gender, Steroids, and Fairness in Sport   

                       Article_DOI  \
0         10.1177/0891243213503203   
1         1

### Function for scraping by DOI

In [55]:
# search = gn.search(‘Article_DOI’) #search by reference to article DOI

def scraping_doi(df):
    '''This function scrapes google news for anything matching the 
    scholarly DOI in df'''
    stories = []
    for i in range(len(df)):
        search = gn.search(df.iloc[i].loc['Article_DOI'])
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'Author_Name': df.iloc[i].loc['Author_Name'],
                'Article_Title': df.iloc[i].loc['Article_Title'],
                'Article_DOI': df.iloc[i].loc['Article_DOI'],
                'news_title': item.title, #change to 'news_title' for clarity
                'news_link': item.link
            }
            
            stories.append(story)    
    return stories

scraping_doi_df = pd.DataFrame(scraping_doi(df)) # this puts the output in a long form dataframe

scraping_doi_df 

Unnamed: 0,Author_Name,Article_Title,Article_DOI,news_title,news_link
0,Heather Sykes,Transsexual and Transgender Policies in Sport,10.1123/wspaj.15.1.3,“Debate” about Trans Girls and Women in School...,https://news.google.com/__i/rss/rd/articles/CB...
1,Emma Hilton,Transgender women in the female category of sp...,10.1007/s40279-020-01389-3,FINA introduces a new policy for transgender s...,https://news.google.com/__i/rss/rd/articles/CB...
2,Timothy Roberts,Effect of gender affirming hormones on athleti...,10.1136/bjsports-2020-102329,Effect of gender affirming hormones on athleti...,https://news.google.com/__i/rss/rd/articles/CB...
3,Taryon Knox,Transwomen in elite sport: scientific and ethi...,10.1136/medethics-2018-105208,Gender binary in elite sports should be abando...,https://news.google.com/__i/rss/rd/articles/CB...


### Function for scraping by exact article title 

In [56]:
# search = gn.search(‘Article_Title’) #search by reference to peer reviewed article title
def scraping_title(df):
    stories = []
    for i in range(len(df)):
        search = gn.search(df.iloc[i].loc['Article_Title'])
        newsitem = search['entries']
        for item in newsitem:
            story = {
                'Author_Name': df.iloc[i].loc['Author_Name'],
                'Article_Title': df.iloc[i].loc['Article_Title'],
                'Article_DOI': df.iloc[i].loc['Article_DOI'],
                'news_title': item.title, #change to 'news_title' for clarity
                'news_link': item.link
            }
            
            stories.append(story)    
    return stories

scraping_title_df = pd.DataFrame(scraping_title(df)) # this puts the output in a long form dataframe

scraping_title_df

Unnamed: 0,Author_Name,Article_Title,Article_DOI,news_title,news_link
0,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,Transgender Adults Have Higher Rates Of Disabi...,https://news.google.com/__i/rss/rd/articles/CB...
1,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,Transgender Legal Battles: A Timeline - JSTOR ...,https://news.google.com/__i/rss/rd/articles/CB...
2,Jonathan Ospina-Betancurt,The End of Compulsory Gender Verification: Is ...,10.1007/s10508-021-02073-x,“They're Chasing Us Away from Sport”: Human Ri...,https://news.google.com/__i/rss/rd/articles/CB...
3,Jonathan Ospina-Betancurt,The End of Compulsory Gender Verification: Is ...,10.1007/s10508-021-02073-x,End sexist scheduling of major sporting events...,https://news.google.com/__i/rss/rd/articles/CB...
4,Katrina Karkazis,The misuses of “biological sex”,10.1016/S0140-6736(19)32764-3,Biology Lecturer’s Comments on Biological Sex ...,https://news.google.com/__i/rss/rd/articles/CB...
...,...,...,...,...,...
334,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,"Without NCAA Action, Effects of Lia Thomas Sit...",https://news.google.com/__i/rss/rd/articles/CB...
335,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,Young transgender athletes caught in middle of...,https://news.google.com/__i/rss/rd/articles/CB...
336,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,Texas Republicans aim to limit school sports o...,https://news.google.com/__i/rss/rd/articles/CB...
337,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,Lia Thomas Teammate: Situation is 'Unfair' and...,https://news.google.com/__i/rss/rd/articles/CB...


In [60]:
scrape_out = scraping_author_df.append([scraping_doi_df, scraping_title_df]) #append all df's together
scrape_out

Unnamed: 0,Author_Name,Article_Title,Article_DOI,news_title,news_link,key_count
0,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,Thorpe questions FINA's trans swimming ban: Th...,https://news.google.com/__i/rss/rd/articles/CB...,0.0
1,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,Transgender Legal Battles: A Timeline - JSTOR ...,https://news.google.com/__i/rss/rd/articles/CB...,1.0
2,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,Real Estate Transactions for Nov. 24 - Zip06.com,https://news.google.com/__i/rss/rd/articles/CB...,0.0
3,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,"St. Tammany property transfers, Oct.18-24, 202...",https://news.google.com/__i/rss/rd/articles/CB...,0.0
4,Laurel Westbrook,"Doing Gender, Determining Gender: Transgender ...",10.1177/0891243213503203,Russell Westbrook Bought a $37 Million House -...,https://news.google.com/__i/rss/rd/articles/CB...,0.0
...,...,...,...,...,...,...
334,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,"Without NCAA Action, Effects of Lia Thomas Sit...",https://news.google.com/__i/rss/rd/articles/CB...,
335,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,Young transgender athletes caught in middle of...,https://news.google.com/__i/rss/rd/articles/CB...,
336,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,Texas Republicans aim to limit school sports o...,https://news.google.com/__i/rss/rd/articles/CB...,
337,John Devine,"Gender, Steroids, and Fairness in Sport",10.1080/17511321.2017.1404627,Lia Thomas Teammate: Situation is 'Unfair' and...,https://news.google.com/__i/rss/rd/articles/CB...,


In [73]:
scrape_out.size
scrape_out.tail(1)
scrape_out = scrape_out[scrape_out.key_count != 0]
print(scrape_out)

scraping_author_df = scraping_author_df[scraping_author_df.key_count != 0]
print(scraping_author_df)

          Author_Name                                      Article_Title  \
1    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
6    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
7    Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
12   Laurel Westbrook  Doing Gender, Determining Gender: Transgender ...   
15   Katrina Karkazis                    The misuses of “biological sex”   
..                ...                                                ...   
334       John Devine            Gender, Steroids, and Fairness in Sport   
335       John Devine            Gender, Steroids, and Fairness in Sport   
336       John Devine            Gender, Steroids, and Fairness in Sport   
337       John Devine            Gender, Steroids, and Fairness in Sport   
338       John Devine            Gender, Steroids, and Fairness in Sport   

                       Article_DOI  \
1         10.1177/0891243213503203   
6         1