# The Headline Collector
---

## Intro
This notebook contains the code for collecting headlines from existing news aggregators.
The purpose is two-fold: to test the reliability of news aggregators in event detection,
and to provide a corpus of headlines on which to perform feature extraction.

In [19]:
from pygooglenews import GoogleNews

def headline_importer() -> dict:

    gn = GoogleNews()

    # retrieves top stories
    top = gn.top_news()

    return top

## Persistence

This prepare to save the data to disk or to database, depending on the desired storage
medium.

### API to DataFrame

This cell converts data from the API call into a pandas DataFrame.

In [20]:
import pandas as pd

def headline_to_dataframe(headline_dict: dict):
    """
    Converts data from the Google News API to a pandas DataFrame for analysis.

    :return: pandas DataFrame
    """
    # key/value pairs of column names from API to new data schema
    column_names = {'published': 'headline_time_created', 'id': 'headline_id', 'title': 'headline_text',
                 'title_detail': 'headline_language', 'source': 'headline_source', 'link': 'headline_url'}

    # create new pandas DataFrame from API dictionary's 'entries' values
    headline_df = pd.DataFrame.from_dict(headline_dict['entries'])

    # keep only necessary columns
    headline_df = headline_df[['published', 'id', 'title', 'title_detail', 'source', 'link']]

    # overrides nested dictionaries in API values with Series object dervices from nest values
    headline_df['title_detail'] = headline_df['title_detail'].apply(pd.Series)['language']
    headline_df['source'] = headline_df['source'].apply(pd.Series)['title']

    # renames columns to match new data schema
    headline_df.rename(columns=column_names, inplace=True)

    return headline_df, column_names

### DataFrame to File

This cell checks to see if data in the DataFrame is already in our data file, and only saves data if it is
not already present.

In [25]:
import os

def dataframe_to_file(api_dataframe, column_names: dict) -> None:
    """
    Checks if data in DataFrame is present in target file, and saves DataFrame data that is not already
    present.
    :param dataframe: a DataFrame containing headline data
    :return: None
    """
    # imports existing data file, if it already exists: if it doesn't exist, skips to creating file
    directory_path = os.path.join('data', 'output')
    file_path = os.path.join(directory_path, 'headlines.csv')
    if os.path.isfile(file_path):
        # creates a DataFrame object out of the last 1000 rows in our data file
        existing_dataframe = pd.read_csv(file_path, sep='\t')
        # sort existing DataFrame by headline_time_created

        # limit to last 1000 rows

        # compares the headline_id column in our data file DataFrame to the headline_id column in our
        # headline DataFrame


        # parses out only the non-matched rows in our headline DataFrame


        # appends the non-matched rows to our data file


    else:
        api_dataframe.to_csv(file_path, sep='\t', header=list(column_names.values()), mode='a')


headlines = headline_importer()
headlines_dataframe, columns = headline_to_dataframe(headlines)
dataframe_to_file(headlines_dataframe, columns)

   Unnamed: 0          headline_time_created     headline_id  \
0           0  Sat, 25 Jul 2020 15:58:00 GMT  52780948641976   
1           1  Sat, 25 Jul 2020 19:08:00 GMT  52780948590446   
2           2  Sat, 25 Jul 2020 15:52:14 GMT  52780942128508   
3           3  Sat, 25 Jul 2020 22:06:00 GMT  52780944259638   
4           4  Sat, 25 Jul 2020 09:00:00 GMT  52780951100143   

                                       headline_text  headline_language  \
0  Tropical Storm Hanna upgrades to a hurricane, ...                NaN   
1  Florida now has more coronavirus cases than Ne...                NaN   
2  US officials raid Chinese consulate in Houston...                NaN   
3  1 person stabbed as thousands protest in Portl...                NaN   
4  Will Trump’s Abrupt Shift on Coronavirus Re-en...                NaN   

      headline_source                                       headline_url  
0               Chron  https://news.google.com/__i/rss/rd/articles/CB...  
1             

In [3]:
import csv
import os

def headline_to_csv(parsed_headline: list) -> None:
    """
    Takes in a list of parsed fields from a headline entry and saves them to CSV file

    :param parsed_headline: a list of parsed fields from a headline entry
    :return: None
    """
    directory_path = os.path.join('data', 'output')
    file_path = os.path.join(directory_path, 'headlines.csv')

    def duplicate_detector(file_path: str) -> None:
        """
        Searches through headline_id in headline CSV file in order to prevent importing duplicate headlines.

        :param: file_path: file path to the CSV file
        :return: None
        """
        df = pd.read_csv(file_path, sep='\t', header=0)
        # print(df.headline_id)
        # todo: finish the compare tool by importing both sets of data as dataframes
        # and comparing them before writing them to the csv file. maybe turn the
        # csv writers into pandas methods too?

    if os.path.isfile(file_path):   # if CSV file already exists
        with open(file_path, 'a') as f:
            duplicate_detector(file_path)
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(parsed_headline)
    else:                           # if CSV file does not exist
        with open(file_path, 'a') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(['headline_time_created', 'headline_id', 'headline_text',
                 'headline_language', 'headline_source', 'headline_url'])
            writer.writerow(parsed_headline)