### Web-scraping functions that use pygooglenews and other packages to search Google News according to a matrix of factors: author name, keywords found in the titles of the peer-reviewed publications, DOIs, and peer-reviewed journal names.

### Import pygooglenews

In [48]:
import nltk
from nltk.corpus import stopwords
import re
import string
from collections import Counter
import difflib as difflib

from pygooglenews import GoogleNews
import pandas as pd
import numpy as np
import streamlit as st

# Shared Google News client used by the scraping functions in this file.
# NOTE(review): created with no arguments, so the library defaults apply; a
# country (e.g. US) can be specified here if regional results are wanted - confirm.
gn = GoogleNews() # now global mode? can specify country = US

In [49]:
import warnings
# Suppress every warning (no category or module filter is given).
warnings.filterwarnings('ignore')

## Main Function

In [51]:
def main(df):
    """Find Google News stories related to the scholarly articles in ``df``.

    Runs the three scrapers (by author name, by DOI and by article title),
    scores every author-name hit by the number of article-title keywords that
    appear in its headline, and merges all hits into a single table.

    Args:
        df: pandas DataFrame of articles (for example the result of
            ``pd.read_csv``). It must contain the columns ``Author_Name``,
            ``Article_Title`` and ``Article_DOI``.

    Returns:
        pandas DataFrame with the columns ``Author_Name``, ``Article_Title``,
        ``Article_DOI``, ``news_title``, ``news_link`` and ``key_count``,
        holding one row per distinct ``news_link``. ``key_count`` is only
        computed for author-name hits (DOI and title hits carry NaN there),
        and author-name hits whose headline contains no keyword are removed.
        The frame is empty, but still has these columns, when nothing was
        found.

    Raises:
        ValueError: if one of the required columns is missing from ``df``.
    """
    required_cols = ['Author_Name', 'Article_Title', 'Article_DOI']
    result_cols = required_cols + ['news_title', 'news_link', 'key_count']

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(
            "Input data is missing required column(s): "
            + ", ".join(missing_cols)
        )
    # TODO: fuzzy-match header names (e.g. difflib.get_close_matches) so that
    # near-miss column names can be accepted instead of rejected.

    # Google News can only be searched with strings, so discard rows that are
    # completely empty (e.g. trailing blank lines of the CSV).
    df = df.dropna(how='all')
    # Also discard completely empty columns, but never a required one: the
    # scrapers access those columns by name and would fail without them.
    empty_cols = [
        col for col in df.columns
        if col not in required_cols and df[col].isna().all()
    ]
    df = df.drop(columns=empty_cols)

    # Most common words of the article titles; used to weed out unrelated
    # author-name hits. Empty tokens are skipped because '' is a substring of
    # every headline and would make the filter useless.
    keyword_list = [word for word, _count in key_words(df) if word]

    # Scrape by author name and count the keywords present in each headline.
    # The keywords are lower-case, so the headline is lower-cased as well.
    scraping_author_df = pd.DataFrame(scraping_author(df))
    if scraping_author_df.empty:
        key_count = []
    else:
        key_count = [
            sum(keyword in str(title).lower() for keyword in keyword_list)
            for title in scraping_author_df['news_title']
        ]
    scraping_author_df['key_count'] = key_count

    # Scrape by DOI and by article title; these hits are not keyword-scored.
    scraping_doi_df = pd.DataFrame(scraping_doi(df))
    scraping_title_df = pd.DataFrame(scraping_title(df))

    df_result = pd.concat(
        [scraping_author_df, scraping_doi_df, scraping_title_df],
        ignore_index=True,
    )
    # Guarantee the output schema even when every scraper came back empty.
    df_result = df_result.reindex(columns=result_cols)

    # Remove irrelevant author hits (key_count == 0; NaN rows are kept) and
    # stories that were found more than once.
    df_result = df_result[df_result['key_count'] != 0]
    df_result = df_result.drop_duplicates(subset=['news_link'])

    return df_result.reset_index(drop=True)
    

In [53]:
# To do:
# Value error checking for header names
# Write unit tests -- Ishika think of cases
# Write documentation for new code
# Update documentation for changes
# Linking frontend and backend stuff
# Repeat documentation stuff for other use cases
# Update design docs
# Update package specs in environment
# Continuous integration (still need to learn how to set this up)
# Delete one use case from documentation and stick with only two use cases (Delaney doing 12/4/22)

In [54]:
'''
df = df.dropna(how ='all') # drop last n rows because google news functions 
#won't work with NaN or NA values, we need to add an ifelse/judgement to each function. We only can search for strings
df.dropna(how='all', axis=1, inplace=True)
df

author_cols = [col for col in df.columns if 'author' in col]+[col for col in df.columns if 'Author' in col]
#print(list(df.columns))
print(author_cols)
title_cols = [col for col in df.columns if 'title' in col]+[col for col in df.columns if 'Title' in col]
print(title_cols)
doi_labels = [col for col in df.columns if 'doi' in col]+[col for col in df.columns if 'DOI' in col]+[col for col in df.columns if 'Doi' in col]
print(doi_labels)


# write a function where we get indices of headers which contain the string "Author", "DOI" and "Article Title"
'''

'\ndf = df.dropna(how =\'all\') # drop last n rows because google news functions \n#won\'t work with NaN or NA values, we need to add an ifelse/judgement to each function. We only can search for strings\ndf.dropna(how=\'all\', axis=1, inplace=True)\ndf\n\nauthor_cols = [col for col in df.columns if \'author\' in col]+[col for col in df.columns if \'Author\' in col]\n#print(list(df.columns))\nprint(author_cols)\ntitle_cols = [col for col in df.columns if \'title\' in col]+[col for col in df.columns if \'Title\' in col]\nprint(title_cols)\ndoi_labels = [col for col in df.columns if \'doi\' in col]+[col for col in df.columns if \'DOI\' in col]+[col for col in df.columns if \'Doi\' in col]\nprint(doi_labels)\n\n\n# write a function where we get indices of headers which contain the string "Author", "DOI" and "Article Title"\n'

In [55]:
'''
import difflib as difflib
difflib.get_close_matches('Author_Name', author_cols)
difflib.get_close_matches('Title', title_cols)
'''

"\nimport difflib as difflib\ndifflib.get_close_matches('Author_Name', author_cols)\ndifflib.get_close_matches('Title', title_cols)\n"

### Use keywords in peer-reviewed article titles to reduce irrelevant return items from the scraping function

In [56]:
def key_words(df, top_n=10, stop_words=None):
    """Return the most frequent keywords found in the article titles.

    Each non-missing value of ``df['Article_Title']`` is lower-cased, stripped
    of non-ASCII characters and punctuation, and split on whitespace. Stop
    words and empty tokens are discarded before counting.

    Args:
        df: pandas DataFrame with an ``Article_Title`` column.
        top_n: how many keywords to return (default 10).
        stop_words: optional iterable of lower-case words to ignore. When
            omitted, the NLTK English stop-word list is used and is
            downloaded only if it is not installed yet.

    Returns:
        List of ``(word, count)`` tuples, most frequent first, containing at
        most ``top_n`` entries (empty when there are no usable titles).
    """
    if stop_words is None:
        try:
            stop_words = stopwords.words('english')
        except LookupError:
            # Corpus is not available locally: fetch it once, then retry.
            nltk.download('stopwords')
            stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # Deletes every punctuation character, which includes '$' and '#'.
    strip_punct = str.maketrans('', '', string.punctuation)

    word_counts = Counter()
    # dropna(): a missing title would otherwise be counted as the word 'nan'.
    for title in df['Article_Title'].dropna():
        text = str(title).lower()
        # Drop characters that cannot be represented in ASCII.
        text = text.encode(encoding='ascii', errors='ignore').decode()
        for word in text.split():
            if word in stop_words:
                continue
            word = word.translate(strip_punct)
            # Re-check after stripping so that "the," or a lone "-" is not
            # counted as a keyword.
            if word and word not in stop_words:
                word_counts[word] += 1

    return word_counts.most_common(top_n)

'''
keyword_list = key_words(df)
keyword_list = (list(zip(*keyword_list))[0])
print(keyword_list)
#keywords(df)
'''

'\nkeyword_list = key_words(df)\nkeyword_list = (list(zip(*keyword_list))[0])\nprint(keyword_list)\n#keywords(df)\n'

### Function for scraping by author name only

In [57]:
# search = gn.search('author name')  # search by author name
def scraping_author(df):
    """Search Google News once per author name found in ``df``.

    Rows whose ``Author_Name`` is null are skipped. Every entry returned by
    ``gn.search`` for an author becomes one dict holding the article's
    ``Author_Name``, ``Article_Title`` and ``Article_DOI`` together with the
    entry's ``news_title`` and ``news_link``. Returns the list of all such
    dicts (empty when there is nothing to search for).
    """
    with_author = df[df.Author_Name.notnull()]

    stories = []
    for _, record in with_author.iterrows():
        author = record['Author_Name']
        entries = gn.search(author)['entries']
        # One story per returned news entry, tagged with the source article.
        stories.extend(
            {
                'Author_Name': author,
                'Article_Title': record['Article_Title'],
                'Article_DOI': record['Article_DOI'],
                'news_title': entry.title,
                'news_link': entry.link,
            }
            for entry in entries
        )
    return stories

'''
scraping_author_df = pd.DataFrame(scraping_author(df))# this puts the output in a long form dataframe/
#scraping_author_df 
key_count = [0]*len(scraping_author_df)
for i in range(len(scraping_author_df)):
    for j in range(len(keyword_list)):
        #print(keyword_list[j])
        if keyword_list[j] in scraping_author_df.iloc[i].loc['news_title']:
            key_count[i] = key_count[i]+1

scraping_author_df['key_count'] = key_count
'''  

"\nscraping_author_df = pd.DataFrame(scraping_author(df))# this puts the output in a long form dataframe/\n#scraping_author_df \nkey_count = [0]*len(scraping_author_df)\nfor i in range(len(scraping_author_df)):\n    for j in range(len(keyword_list)):\n        #print(keyword_list[j])\n        if keyword_list[j] in scraping_author_df.iloc[i].loc['news_title']:\n            key_count[i] = key_count[i]+1\n\nscraping_author_df['key_count'] = key_count\n"

In [58]:
#scraping_author_df

### Function for scraping by DOI

In [59]:
# search = gn.search('Article_DOI')  # search by reference to article DOI

def scraping_doi(df):
    '''Search Google News once per article DOI found in ``df``.

    Rows whose ``Article_DOI`` is null are skipped. Every entry returned by
    ``gn.search`` for a DOI becomes one dict holding the article's
    ``Author_Name``, ``Article_Title`` and ``Article_DOI`` together with the
    entry's ``news_title`` and ``news_link``. Returns the list of all such
    dicts (empty when there is nothing to search for).
    '''
    with_doi = df[df.Article_DOI.notnull()]

    stories = []
    for _, record in with_doi.iterrows():
        doi = record['Article_DOI']
        entries = gn.search(doi)['entries']
        # One story per returned news entry, tagged with the source article.
        stories.extend(
            {
                'Author_Name': record['Author_Name'],
                'Article_Title': record['Article_Title'],
                'Article_DOI': doi,
                'news_title': entry.title,
                'news_link': entry.link,
            }
            for entry in entries
        )
    return stories
'''
scraping_doi_df = pd.DataFrame(scraping_doi(df)) # this puts the output in a long form dataframe

scraping_doi_df 
'''

'\nscraping_doi_df = pd.DataFrame(scraping_doi(df)) # this puts the output in a long form dataframe\n\nscraping_doi_df \n'

### Function for scraping by exact article title 

In [60]:
# search = gn.search('Article_Title')  # search by reference to peer-reviewed article title
def scraping_title(df):
    """Search Google News once per article title found in ``df``.

    Rows whose ``Article_Title`` is null are skipped. Every entry returned by
    ``gn.search`` for a title becomes one dict holding the article's
    ``Author_Name``, ``Article_Title`` and ``Article_DOI`` together with the
    entry's ``news_title`` and ``news_link``. Returns the list of all such
    dicts (empty when there is nothing to search for).
    """
    with_title = df[df.Article_Title.notnull()]

    stories = []
    for _, record in with_title.iterrows():
        title = record['Article_Title']
        entries = gn.search(title)['entries']
        # One story per returned news entry, tagged with the source article.
        stories.extend(
            {
                'Author_Name': record['Author_Name'],
                'Article_Title': title,
                'Article_DOI': record['Article_DOI'],
                'news_title': entry.title,
                'news_link': entry.link,
            }
            for entry in entries
        )
    return stories
'''
scraping_title_df = pd.DataFrame(scraping_title(df)) 
'''

'\nscraping_title_df = pd.DataFrame(scraping_title(df)) \n'

In [61]:
'''

lst = [scraping_author_df, scraping_doi_df, scraping_title_df]  # List of your dataframes
df_result= pd.concat(lst)
'''   

'\n\nlst = [scraping_author_df, scraping_doi_df, scraping_title_df]  # List of your dataframes\ndf_result= pd.concat(lst)\n'

In [62]:
# df_result

In [63]:
'''
scrape_out.size
scrape_out.tail(1)
scrape_out = scrape_out[scrape_out.key_count != 0]

scrape_out.head()
scrape_out.tail()
print(scrape_out)

df_result = df_result[df_result['key_count'] != 0]
'''

"\nscrape_out.size\nscrape_out.tail(1)\nscrape_out = scrape_out[scrape_out.key_count != 0]\n\nscrape_out.head()\nscrape_out.tail()\nprint(scrape_out)\n\ndf_result = df_result[df_result['key_count'] != 0]\n"

In [64]:
'''
scrape_out.drop_duplicates(subset=['news_link'])
scrape_out.head()
scrape_out.tail()
print(scrape_out)


df_result.drop_duplicates(subset=['news_link'], inplace = True)
'''

"\nscrape_out.drop_duplicates(subset=['news_link'])\nscrape_out.head()\nscrape_out.tail()\nprint(scrape_out)\n\n\ndf_result.drop_duplicates(subset=['news_link'], inplace = True)\n"

In [65]:
'''
#sample = {'a':[1, 2, 3],
#          'b': ['a', 'b', 'c']}

# Sample output file, need to change to our output file.
output_df = main(df)

# Display a download button widget for user to get the output file.
st.download_button(
    label="Download data as CSV",
    data=output_df.to_csv().encode('utf-8'),
    file_name='news.csv',
    mime='text/csv',
)
'''

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chocoyao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


False

In [50]:
def _max_width_():
    """
    Emit a <style> block through st.markdown that sets the max-width of the
    report's main block container to 1800px.
    """
    width_rule = "max-width: 1800px;"
    # Spelled out line by line so the exact whitespace of the markup is visible.
    css_block = (
        "\n"
        "    <style>\n"
        "    .reportview-container .main .block-container{\n"
        f"        {width_rule}\n"
        "    }\n"
        "    </style>    \n"
        "    "
    )
    # Raw HTML is only rendered when unsafe_allow_html is enabled.
    st.markdown(css_block, unsafe_allow_html=True)

# Configures the default settings of the page.
st.set_page_config(page_icon="🎓", page_title="SCIPOP")

# Display an image with its url.
st.image(
    "https://png.pngitem.com/pimgs/s/202-2021802_graduation-cap-emoji-transparent-hd-png-download.png",
    width=100,
)

# Draw Markdown-formatted text, with input as a string.
st.write(
"""
# SCIPOP APP
Upload your article dataset to see the relevant news.\n
#### Dataset guidance:\n 
Column_Name: Author_Name, Article_Title, Article_DOI. Could see example dataset for reference.
"""
)

# Display a file uploader widget. Users can drag/drop their input file.
uploaded_file = st.file_uploader("Upload CSV", type=".csv")

# Checkbox of example file to demo the app.
use_example_file = st.checkbox(
    "Use example file", False, help="Use in-built example file to demo the app"
)

# Path for example file.
if use_example_file:
    uploaded_file = "UseCase1_Data.csv"
    
# Read and preview input file.    
if uploaded_file:
    df = pd.read_csv(uploaded_file)

    st.markdown("### Data preview")
    st.dataframe(df.head())
    

    
output_df = main(df)

# Display a download button widget for user to get the output file.
st.download_button(
    label="Download data as CSV",
    data=output_df.to_csv().encode('utf-8'),
    file_name='news.csv',
    mime='text/csv',
)


2022-12-06 15:42:41.221 
  command:

    streamlit run /opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]
