# The Headline Collector
---

## Intro
This notebook contains the code for collecting headlines from existing news aggregators.
The purpose is two-fold: to test the reliability of news aggregators in event detection,
and to provide a corpus of headlines on which to perform feature extraction.

In [1]:
from pygooglenews import GoogleNews

def headline_importer() -> dict:
    """
    Requests the 'WORLD' topic headlines through a fresh GoogleNews client.

    No proxy and no ScrapingBee key are passed to the request.

    :return: the object returned by GoogleNews.topic_headlines
    """
    client = GoogleNews()
    return client.topic_headlines('WORLD', proxies=None, scraping_bee=None)

## Persistence

This section prepares the data to be saved to disk or to a database, depending on the desired
storage medium.

### API to DataFrame

This cell converts data from the API call into a pandas DataFrame.

In [2]:
import pandas as pd
import datetime

def _strip_source_suffix(title, source):
    """
    Removes the ' - <source>' suffix from the end of a headline title.

    :param title: the raw headline title
    :param source: the name of the headline's source
    :return: the title without the suffix, or the title unchanged when the suffix is absent
    """
    # leave missing values (e.g. NaN) untouched rather than failing on them
    if not isinstance(title, str) or not isinstance(source, str):
        return title

    suffix = ' - ' + source
    if title.endswith(suffix):
        return title[:-len(suffix)]
    return title


def headline_to_dataframe(headline_dict: dict):
    """
    Converts data from the Google News API to a pandas DataFrame for analysis.

    The returned DataFrame is indexed by 'headline_id' and has the columns
    'headline_time_created', 'headline_text', 'headline_language', 'headline_source',
    'headline_url' and 'headline_time_imported' (in that order).

    :param headline_dict: the API response; its 'entries' value is a list of dictionaries with
        at least the keys 'published', 'id', 'title', 'title_detail', 'source' and 'link'
    :return: pandas DataFrame
    :raises KeyError: if 'entries' is missing, or the entries lack one of the required keys
    """
    schema_columns = ['headline_time_created', 'headline_text', 'headline_language',
                      'headline_source', 'headline_url', 'headline_time_imported']

    entries = headline_dict['entries']

    # an empty response has no columns to select from: return an empty frame with the schema
    if not entries:
        empty_df = pd.DataFrame(columns=schema_columns)
        empty_df.index.name = 'headline_id'
        return empty_df

    # sets current datetime (naive UTC, the same value the deprecated utcnow() produced)
    current_datetime = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    # key/value pairs of column names from API to new data schema
    column_names = {'published': 'headline_time_created', 'id': 'headline_id', 'title': 'headline_text',
                    'title_detail': 'headline_language', 'source': 'headline_source', 'link': 'headline_url'}

    # create new pandas DataFrame from API dictionary's 'entries' values
    headline_df = pd.DataFrame(entries)

    # keep only necessary columns; copy so the assignments below never write to a view
    headline_df = headline_df[['published', 'id', 'title', 'title_detail', 'source', 'link']].copy()

    # overrides nested dictionaries in API values with the single nested value that is needed
    headline_df['title_detail'] = headline_df['title_detail'].str.get('language')
    headline_df['source'] = headline_df['source'].str.get('title')
    headline_df['published'] = pd.to_datetime(headline_df['published'])

    # creates new column with current datetime
    headline_df['headline_time_imported'] = current_datetime

    # renames columns to match new data schema
    headline_df = headline_df.rename(columns=column_names).set_index('headline_id')

    # strips the ' - <source>' suffix row by row: each headline is only matched against its own
    # source, only at the end of the text, and headlines without the suffix are left intact
    headline_df['headline_text'] = [
        _strip_source_suffix(title, source)
        for title, source in zip(headline_df['headline_text'], headline_df['headline_source'])
    ]

    return headline_df

### DataFrame to File

This cell checks to see if data in the DataFrame is already in our data file, and only saves data if it is
not already present.

In [3]:
import os

def _write_new_headline_file(api_dataframe, file_path) -> None:
    """
    Writes the whole DataFrame, including the header row, to a new or empty data file.

    :param api_dataframe: a DataFrame containing headline data
    :param file_path: path of the tab-separated data file
    :return: None
    """
    api_dataframe.to_csv(file_path, sep='\t',
                         header=['headline_time_created', 'headline_text',
                                 'headline_language', 'headline_source', 'headline_url',
                                 'headline_time_imported'],
                         index_label='headline_id', index=True, mode='w')


def dataframe_to_file(api_dataframe) -> None:
    """
    Checks if data in DataFrame is present in target file, and saves DataFrame data that is not already
    present.

    The target is the tab-separated file data/output/headlines.csv; the directory is created when
    it does not exist. Incoming rows are compared by headline_id against the 1000 most recently
    created rows of the file, and only unmatched rows are appended.

    :param api_dataframe: a DataFrame containing headline data, indexed by headline_id
    :return: None
    """
    directory_path = os.path.join('data', 'output')
    file_path = os.path.join(directory_path, 'headlines.csv')

    # the first run on a clean checkout has no output directory yet
    os.makedirs(directory_path, exist_ok=True)

    if not os.path.isfile(file_path):
        print('no file found: writing file for first time')
        _write_new_headline_file(api_dataframe, file_path)
        return

    try:
        # loads the whole data file; it is narrowed to the most recent rows below
        existing_dataframe = pd.read_csv(file_path, sep='\t', index_col='headline_id')
    except pd.errors.EmptyDataError:
        print('file found with no data: writing data for first time')
        _write_new_headline_file(api_dataframe, file_path)
        # return instead of exit(0): exiting would terminate the interpreter / notebook kernel
        return

    print('appending to existing file and data')
    # remember the real size of the file before narrowing, so the reported counts stay correct
    existing_row_count = existing_dataframe.shape[0]

    # sort existing DataFrame by headline_time_created and limit to last 1000 rows
    recent_dataframe = existing_dataframe.sort_values(by='headline_time_created', axis=0,
                                                      ascending=False).head(1000)

    # compare the headline_id column in our data file DataFrame to the headline_id column in our
    # incoming headline DataFrame and parse out only non-matched (new) rows
    new_headline_ids = api_dataframe.index.difference(recent_dataframe.index)
    new_headlines_dataframe = api_dataframe.loc[new_headline_ids]

    # test print statements
    print('number of headlines in CSV\n' + str(existing_row_count))
    print('number of total headlines in incoming dataframe\n' +
          str(api_dataframe.shape[0]))
    print('number of new headlines in incoming dataframe\n' +
          str(new_headlines_dataframe.shape[0]))
    print('expected number of headlines in CSV after save\n' +
          str(existing_row_count + new_headlines_dataframe.shape[0]))

    # appends the non-matched rows to our data file; header=False keeps the header row unique
    new_headlines_dataframe.to_csv(file_path, header=False, sep='\t', index=True, mode='a')

In [4]:
# runner: fetches the headlines, converts them to a DataFrame, then saves the new rows to file
# the API response is kept in `headlines` and the converted frame in `headlines_dataframe`
headlines = headline_importer()
headlines_dataframe = headline_to_dataframe(headlines)
dataframe_to_file(headlines_dataframe)


appending to existing file and data
number of headlines in CSV
170
number of total headlines in incoming dataframe
70
number of new headlines in incoming dataframe
7
expected number of headlines in CSV after save
177
