### Code to Extract Source, Title, and Description from the News API

In [None]:
import os
import requests
import pandas as pd
import datetime
import string

from nltk.tag import StanfordNERTagger
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# import sys # import 
# !{sys.executable} -m pip install polyglot
# nltk.download('words')

In [None]:
# NewsAPI source identifiers whose top headlines are requested.
news = [
    'bloomberg',
    'reuters',
    'financial-times',
    'the-economist',
    'the-wall-street-journal',
    'cnbc',
    'financial-post',
    'business-insider',
    'google-news',
    'fortune',
    'bbc-news',
]

In [None]:
%%time

# Request the top headlines for every source in `news` and keep only each
# article's title and description, grouped by source id in `news_d`.
#
# NOTE(security): the API key was hard-coded in the request URL. It is now
# read from the NEWSAPI_KEY environment variable; the original literal is kept
# only as a fallback so the notebook still runs unchanged, and should be
# rotated and removed from the source.
api_key = os.environ.get('NEWSAPI_KEY', 'ddcb36b8f0d645b0acd4a54ef804a9fe')

news_d = {}

for source in news:
    # Let requests build and encode the query string; the timeout prevents a
    # stalled connection from hanging the cell indefinitely.
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        params={'sources': source, 'apiKey': api_key},
        timeout=30,
    )
    # Surface HTTP errors directly instead of failing later with a KeyError
    # on the missing 'articles' field.
    response.raise_for_status()

    # Decode the JSON body once instead of once per field per article.
    articles = response.json()['articles']

    news_d[source] = [
        {'title': article['title'], 'description': article['description']}
        for article in articles
    ]

In [None]:
def create_d(dictionary):
    """
    Takes in the cleaned news dictionary to turn it into an acceptable format for pandas.

    `dictionary` maps a source id (e.g. 'bloomberg') to a list of article
    dicts, each with a 'title' and a 'description' key. Sources with no
    articles (empty list or None) contribute no rows.

    Returns a dict with three equally long lists under the keys 'source',
    'title' and 'description', one entry per article.
    """
    source_list = []
    title_list = []
    description_list = []

    # Iterate over the argument itself, not the module-level `news_d`, so the
    # function works for whatever dictionary the caller passes in.
    for source, articles in dictionary.items():
        for article in articles or []:
            source_list.append(source)
            title_list.append(article['title'])
            description_list.append(article['description'])

    d = {'source': source_list,
         'title': title_list,
         'description': description_list,
        }

    return d

In [None]:
# Flatten the per-source articles into column lists, then load them into a
# DataFrame with the columns in a fixed order.
data = create_d(news_d)
column_order = ['source', 'title', 'description']
df = pd.DataFrame(data=data)[column_order]

## AI Attempt

See the following blog post for reference:
http://blog.chartbeat.com/2015/10/22/identifying-and-clustering-news-events-by-content/

Also see the Natural Language Processing section of Jose Portilla's Udemy course, Python for Data Science and Machine Learning Bootcamp:
https://www.udemy.com/python-for-data-science-and-machine-learning-bootcamp/

In [None]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Replaces hyphens and curly quotes (including a trailing ’s) with spaces
    2. Removes all ASCII punctuation characters
    3. Removes all English stopwords (compared case-insensitively)
    4. Returns the remaining words joined by single spaces, as one string

    Returns None when `mess` is None.
    """
    # Nothing to clean for an empty cell.
    if mess is None:
        return None

    # Hyphens become spaces first so hyphenated words split into their parts
    # instead of being glued together when punctuation is stripped below.
    cleaned = mess.replace('-', ' ')
    # Curly quotes are not part of string.punctuation, so handle them here;
    # "’s" goes first so the possessive "s" is dropped along with the quote.
    for mark in ("’s", "’", "‘"):
        cleaned = cleaned.replace(mark, ' ')

    # Strip every ASCII punctuation character in a single pass.
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # Load the stopword list once per call (not once per word) and use a set
    # for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    word_list = [word for word in cleaned.split() if word.lower() not in stop_words]

    return ' '.join(word_list)

## TF-IDF

In [None]:
# Combine title and description into one text field per article. Missing
# values are replaced with '' first: astype(str) on its own would write the
# literal text "None" into `details` for articles lacking a title or
# description.
df['details'] = df['title'].fillna('').astype(str) + ' ' + df['description'].fillna('').astype(str)
# Alternative kept from the original notebook: use the title only.
# df['details'] = df['title'].astype(str)
# Strip punctuation and stopwords from the combined text.
df['details'] = df['details'].apply(text_process)

In [None]:
# Fit a TF-IDF model on the cleaned article text and transform it in the same
# step; X holds one row per article.
corpus = df['details']
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

## Cosine Similarity Matrix

This follows the method described in this blog post: https://ematosevic.wordpress.com/2016/08/21/clustering-data-with-similarity-matrix-in-python-tutorial/

In [None]:
# Similarity threshold: scores at or above this value are set to 1 below,
# scores under it are set to 0.
parameter = 0.2

# Pairwise cosine similarity between the rows of X, wrapped in a DataFrame
# that is labelled with the article titles on both axes.
sim = cosine_similarity(X)
sim_matrix = pd.DataFrame(sim, columns = list(df['title']), index = list(df['title']))
# Binarise in two passes. The order matters: the second pass also sees the 1s
# written by the first pass, which stay untouched only while parameter <= 1.
sim_matrix[sim_matrix >= parameter] = 1
sim_matrix[sim_matrix < parameter] = 0
# Move the title labels out of the index into a regular column; the clustering
# step reads them back as sim_matrix['index'].
sim_matrix = sim_matrix.reset_index()

In [None]:
# Greedy clustering over the binarised similarity matrix: repeatedly take the
# row with the most links, emit it together with every title it links to as
# one cluster, then remove those titles from both rows and columns. Each pass
# removes at least the selected row, so the loop terminates.
clusters = []

while len(sim_matrix) > 0:
    # Sum only the numeric similarity columns. The 'index' column holds title
    # strings, and pandas >= 2.0 no longer skips non-numeric columns silently.
    row_totals = sim_matrix.sum(axis=1, numeric_only=True)
    # argmax returns the first maximum, matching list.index(max(...)).
    ind = int(row_totals.values.argmax())

    titles = list(sim_matrix.columns[(sim_matrix == 1).iloc[ind]])
    index_title = sim_matrix['index'].iloc[ind]
    titles.append(index_title)
    # De-duplicate while keeping order, so cluster members come out in the
    # same order on every run (set() ordering of strings is not stable).
    titles = list(dict.fromkeys(titles))
    clusters.append(titles)

    # Remove the clustered titles from the columns and from the rows.
    sim_matrix = sim_matrix.drop(titles, axis=1)
    sim_matrix = sim_matrix[~sim_matrix['index'].isin(titles)]

    # Keep row labels equal to row positions for the next pass.
    sim_matrix = sim_matrix.reset_index(drop=True)

In [None]:
# Print every cluster that groups more than one headline, one title per line,
# followed by a separator line.
for members in clusters:
    if len(members) <= 1:
        continue
    for headline in members:
        print(headline)
    print('=====================')

In [None]:
# Save the input articles and the resulting clusters as timestamped CSV files
# under data/, with the similarity threshold recorded in the file name.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# to_csv does not create missing directories, so make sure data/ exists.
os.makedirs('data', exist_ok=True)

df.to_csv('data/{}_input-parameter-({}).csv'.format(current_time, parameter), sep=',', encoding='utf-8', index=False)

# One row per cluster, one column per member. Size the columns by the widest
# cluster rather than the first one, and tolerate an empty cluster list
# (clusters[0] would raise IndexError).
widest = max((len(members) for members in clusters), default=0)
clusters_df = pd.DataFrame(clusters, columns=range(1, widest + 1))
clusters_df.to_csv('data/{}_output-parameter-({}).csv'.format(current_time, parameter), sep=',', encoding='utf-8', index=False)