### Extract Source, Title, and Description from the News API

In [1]:
%%time

import os
import requests
import pandas as pd
import numpy as np
import nltk
import datetime
from nltk.tag import StanfordNERTagger


# import sys
# !{sys.executable} -m pip install polyglot
# nltk.download('words')

Wall time: 1.74 s


In [2]:
# NewsAPI source identifiers of the outlets whose top headlines are fetched.
news = [
    'bloomberg',
    'reuters',
    'financial-times',
    'the-economist',
    'the-wall-street-journal',
    'cnbc',
]

In [3]:
%%time

# Fetch the current top headlines for every source in `news` and keep only
# each article's title and description, grouped by source id in `news_d`.

# The API key is read from the environment so it never lands in the notebook
# (or its version history). Set NEWSAPI_KEY before starting the kernel; a
# missing variable raises KeyError here, before any request is made.
news_api_key = os.environ['NEWSAPI_KEY']

news_d = {}

for i in news:
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        params={'sources': i, 'apiKey': news_api_key},
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Surface HTTP errors (bad key, rate limit, ...) as an explicit exception
    # instead of a KeyError on a missing 'articles' field further down.
    response.raise_for_status()

    # Decode the JSON body once; every call to response.json() re-parses it.
    payload = response.json()

    news_d[i] = [
        {'title': article['title'], 'description': article['description']}
        for article in payload.get('articles', [])
    ]

Wall time: 44.5 s


In [4]:
%%time

def create_d(dictionary):
    """
    Flatten a per-source article dictionary into column lists for pandas.

    Parameters
    ----------
    dictionary : dict
        Maps a source id (e.g. 'bloomberg') to a list of article dicts, each
        holding a 'title' and a 'description' key.

    Returns
    -------
    dict
        {'source': [...], 'title': [...], 'description': [...]} with one entry
        per article. Sources with an empty article list contribute no rows.
    """
    source_list = []
    title_list = []
    description_list = []

    # Iterate over the argument itself rather than the notebook-level
    # `news_d`, so the function works on whatever dictionary it is given.
    for source, articles in dictionary.items():
        for article in articles:
            source_list.append(source)
            title_list.append(article['title'])
            description_list.append(article['description'])

    return {
        'source': source_list,
        'title': title_list,
        'description': description_list,
    }

Wall time: 0 ns


In [5]:
%%time

# Flatten the per-source article lists into column lists, then load them into
# a DataFrame with the column order fixed at construction time.
data = create_d(news_d)
df = pd.DataFrame(data=data, columns=['source', 'title', 'description'])

Wall time: 1.98 ms


In [6]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,source,title,description
0,bloomberg,Trump Making Supreme Court Pick to Cement Cons...,Donald Trump is poised to continue his remake ...
1,bloomberg,Russia Should Admit It Uses Mercenaries,The soldiers of fortune are no longer keeping ...
2,bloomberg,Here's One Area Where Russia Beats the U.S.,By several key measures it has the world's bes...
3,bloomberg,World Bank CEO Adds to Voices of Worry Over Gl...,Global debt is becoming a bigger worry as the ...
4,bloomberg,JPMorgan Maps Out Currencies to Buy If a Reces...,"If a U.S. or global recession is looming, it’s..."
5,bloomberg,North Korea Reminds Trump Its Nuclear Weapons ...,U.S. Secretary of State Mike Pompeo went to Py...
6,bloomberg,"Canada’s Labor Force Grows Most in 6 Years, Ra...",Canada’s tight jobs market is beginning to bri...
7,bloomberg,What’s Next for Hotelier After Buying the Trum...,"InnVest Hotels LP, which added the former Toro..."
8,bloomberg,Trudeau Says Nothing ‘Untoward’ Led to Past Gr...,
9,bloomberg,"Canada Strikes Back at Trump, and Condo Buyers...",Condo buyers in Canada’s already pricey market...


## AI Attempt

The clustering approach follows this Chartbeat blog post:
http://blog.chartbeat.com/2015/10/22/identifying-and-clustering-news-events-by-content/

The text-processing steps are adapted from the Natural Language Processing section of Jose Portilla's "Python for Data Science and Machine Learning Bootcamp" course on Udemy:
https://www.udemy.com/python-for-data-science-and-machine-learning-bootcamp/

In [7]:
%%time

import string
from nltk.corpus import stopwords

Wall time: 0 ns


In [8]:
%%time

def text_process(mess):
    """
    Clean a piece of text for vectorization.

    Steps:
    1. Replace hyphens, the possessive "’s" and curly single quotes with spaces.
    2. Delete every character listed in string.punctuation.
    3. Drop words whose lowercase form is an NLTK English stopword.

    Parameters
    ----------
    mess : str or None
        Raw text.

    Returns
    -------
    str or None
        The remaining words joined by single spaces, or None when `mess` is None.
    """
    # Nothing to clean; keep returning None for a missing value.
    if mess is None:
        return None

    # Hyphens and curly apostrophes become spaces (not deletions) so the words
    # they separate stay separate. "’s" must be handled before the bare "’".
    cleaned = mess.replace('-', ' ')
    for fragment in ("’s", "’", "‘"):
        cleaned = cleaned.replace(fragment, ' ')

    # Strip the remaining ASCII punctuation in a single pass.
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # Load the stopword list once per call and use a set for O(1) lookups;
    # calling stopwords.words() inside the filter re-reads it for every word.
    english_stopwords = set(stopwords.words('english'))

    return ' '.join(
        word for word in cleaned.split()
        if word.lower() not in english_stopwords
    )

Wall time: 0 ns


## TF-IDF

In [9]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Wall time: 0 ns


In [10]:
%%time

# Combine title and description into one text field per article, then clean it.
# Missing values are replaced with '' before the string cast: astype(str) alone
# turns None/NaN into the literal words "None"/"nan", which would then be
# treated as real terms by the vectorizer.
df['details'] = (
    df['title'].fillna('').astype(str)
    + ' '
    + df['description'].fillna('').astype(str)
).apply(text_process)

Wall time: 870 ms


In [11]:
%%time

# Weight the terms of the cleaned 'details' text with TF-IDF. stop_words='english'
# additionally applies scikit-learn's built-in English stop-word list on top of
# the NLTK stopwords already removed by text_process.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from df['details'] and build the document-term matrix for
# those same rows, so row k of X corresponds to row k of df.
X = vectorizer.fit_transform(df['details'])

Wall time: 13 ms


In [12]:
%%time

# Pairwise cosine similarity between all article vectors.
sim = cosine_similarity(X)

# Binarize in one pass: 1.0 where two articles are at least 0.18 similar,
# 0.0 otherwise. Rows and columns are both labelled by article title.
SIM_THRESHOLD = 0.18
article_titles = list(df['title'])
sim_matrix = pd.DataFrame(
    (sim >= SIM_THRESHOLD).astype(float),
    columns=article_titles,
    index=article_titles,
)

# Move the row titles into a regular 'index' column with positional row labels.
sim_matrix = sim_matrix.reset_index()

Wall time: 7.99 ms


In [13]:
%%time

# Greedy clustering over the binarized similarity matrix: repeatedly take the
# article with the most above-threshold neighbours, group it with those
# neighbours, remove the whole group from the matrix, and continue until no
# rows are left. NOTE: this consumes `sim_matrix` (it is empty afterwards).
clusters = []

while len(sim_matrix) > 0:
    # numeric_only=True skips the string-valued 'index' column explicitly.
    # Older pandas dropped it silently; pandas 2.x no longer does, and the
    # row-wise sum fails on the mixed str/float row without this.
    summation = list(sim_matrix.sum(axis=1, numeric_only=True))
    ind = summation.index(max(summation))

    # Columns flagged 1 in the seed row are the seed's neighbours.
    titles = list(sim_matrix.columns[(sim_matrix == 1).iloc[ind]])
    # Always include the seed itself: an article whose vector is all zeros
    # has similarity 0 even with itself and would otherwise never be removed.
    index_title = sim_matrix['index'][ind]
    titles.append(index_title)
    titles = list(set(titles))
    clusters.append(titles)

    # Remove the clustered articles from both the columns and the rows.
    sim_matrix = sim_matrix.drop(titles, axis=1)
    sim_matrix = sim_matrix[~sim_matrix['index'].isin(titles)]

    # Restore positional row labels so `ind` can be used with both
    # .iloc and label-based lookup on the next pass.
    sim_matrix = sim_matrix.reset_index(drop=True)

Wall time: 202 ms


In [14]:
clusters

[["Denouncing 'dangerous strategy', Brexit minister quits in blow to May",
  'U.K. Minister in Charge of Brexit Negotiations Resigns',
  "Factbox: How could Britain's Prime Minister Theresa May be removed from office?",
  'UK PM May should not face a leadership challenge: ex-Brexit minister Davis',
  'Britain’s new Brexit plan is savaged—by its own Brexit secretary',
  'UK Brexit minister Braverman has not resigned in protest: official',
  "UK's ex-Brexit minister Davis says not calling for other resignations",
  'UK government in disarray as Davis resigns as Brexit secretary',
  'The UK’s Brexit secretary just walked away from the job. Here’s what that means'],
 ['Trump Making Supreme Court Pick to Cement Conservative Majority',
  "Democrats are tying Trump's Supreme Court pick to the special counsel's Russia probe. Here's why",
  'Trump Takes a Final Look at Supreme Court Choices',
  'A court with a solid conservative majority could reshape American life'],
 ["After Pyongyang put-dow

In [17]:
import csv
import time

# Write the input articles and the resulting clusters to timestamped CSV files.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)

df.to_csv('data/{}_input.csv'.format(current_time), sep=',', encoding='utf-8', index=False)

# One row per cluster, one column per member position. Size the header from
# the widest cluster rather than assuming it is the first one; default=0 keeps
# this working when no clusters were produced at all.
widest_cluster = max((len(cluster) for cluster in clusters), default=0)
clusters_df = pd.DataFrame(clusters, columns = range(1, widest_cluster + 1))
clusters_df.to_csv('data/{}_output.csv'.format(current_time), sep=',', encoding='utf-8', index=False)