# News Summarizer

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import spacy

from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
from matplotlib import rcParams

sns.set()
%config InlineBackend.figure_format = 'png' 
palette = sns.hls_palette(8, h=.7, s=.9, l=0.5)
sns.palplot(palette)
sns.set_palette(palette)

gray = "444444"

plt.rcParams['figure.facecolor'] = '1' #background color
plt.rcParams['axes.facecolor'] = '1' #plot background color

plt.rcParams['grid.color'] = '0.8'
plt.rcParams['grid.alpha'] = .4

plt.rcParams['axes.edgecolor'] = '1'

plt.rcParams['lines.linewidth'] = 1.5
plt.rcParams['grid.linestyle'] = '-'

plt.rcParams['axes.axisbelow'] = True
plt.rcParams['axes.labelcolor'] = gray

plt.rcParams['text.color'] = gray

plt.rcParams['xtick.color'] = gray
plt.rcParams['ytick.color'] = gray
sns.set_style("whitegrid")

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
binary_classes = ['False', 'True']
rcParams['figure.figsize'] = 10, 6


## Load and clean dataset

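The articles come from the [All-the-news dataset](https://components.one/datasets/all-the-news-articles-dataset/). Download the SQLite database and save it as `dataset/all-the-news.db` before running the cells below.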

In [2]:
# Read the whole `longform` table into a DataFrame. The connection is closed in
# a `finally` block so it is released even when the query raises (for example
# when the table cannot be found).
cnx = sqlite3.connect('dataset/all-the-news.db')
try:
    df = pd.read_sql_query("SELECT * FROM longform", cnx)
finally:
    cnx.close()

print(df.shape)
df.head()

(204135, 12)


Unnamed: 0,id,title,author,date,content,year,month,publication,category,digital,section,url
0,1,Agent Cooper in Twin Peaks is the audience: on...,\nTasha Robinson\n,2017-05-31,And never more so than in Showtime’s new...,2017,5,Verge,Longform,1.0,,
1,2,"AI, the humanity!",\nSam Byford\n,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017,5,Verge,Longform,1.0,,
2,3,The Viral Machine,\nKaitlyn Tiffany\n,2017-05-25,Super Deluxe built a weird internet empi...,2017,5,Verge,Longform,1.0,,
3,4,How Anker is beating Apple and Samsung at thei...,\nNick Statt\n,2017-05-22,Steven Yang quit his job at Google in th...,2017,5,Verge,Longform,1.0,,
4,5,Tour Black Panther’s reimagined homeland with ...,\nKwame Opam\n,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017,5,Verge,Longform,1.0,,


In [3]:
def clean_df(df):
    """Return the New York Times newspaper rows of `df`, trimmed for analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Article table containing at least the columns `category`,
        `publication`, `author`, `id`, `digital`, `section` and `url`.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding only rows where
        `category == "newspaper"` and `publication == "New York Times"`,
        without the `id`, `digital`, `section`, `url`, `category` and
        `publication` columns, and with newline characters removed from
        `author`. The original row labels are kept; callers that want a
        fresh 0..n-1 index must reset it themselves.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from `df`.
    """
    unused_columns = ['id', 'digital', 'section', 'url', 'category', 'publication']

    return (
        df
        # Boolean filtering already produces a new frame, so the full table
        # does not need to be copied up front.
        .query('category == "newspaper" & publication == "New York Times"')
        .drop(columns=unused_columns)
        # The pattern is a plain newline character; regex=False makes the
        # literal match explicit.
        .assign(author=lambda frame: frame['author'].str.replace('\n', '', regex=False))
    )
    
# Keep only the cleaned New York Times rows, then renumber them from 0.
df = clean_df(df)
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(30257, 6)


Unnamed: 0,title,author,date,content,year,month
0,House Republicans Fret About Winning Their Hea...,Carl Hulse,2016-12-31,WASHINGTON — Congressional Republicans have a ...,2016,12
1,Rift Between Officers and Residents as Killing...,Benjamin Mueller and Al Baker,2017-06-19,"After the bullet shells get counted, the blood...",2017,6
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",Margalit Fox,2017-01-06,"When Walt Disney’s “Bambi” opened in 1942, cri...",2017,1
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",William McDonald,2017-04-10,"Death may be the great equalizer, but it isn’t...",2017,4
4,Kim Jong-un Says North Korea Is Preparing to T...,Choe Sang-Hun,2017-01-02,"SEOUL, South Korea — North Korea’s leader, Kim...",2017,1


In [4]:
# Show the full body text of the first article.
df.at[0, 'content']

'WASHINGTON — Congressional Republicans have a new fear when it comes to their two-year-old health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for low- and moderate-income Americans, handing House Republicans a big victory on separation-of-power issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, an

## Preprocessing

In [None]:
# spaCy pipeline shared by the preprocessing helpers in this cell.
nlp = spacy.load("en_core_web_sm")

def keep_token(t):
    """Tell whether token `t` should be kept.

    A token is kept when its `is_alpha` flag is set and none of its
    `is_space`, `is_punct`, `is_stop` or `like_num` flags are.
    """
    return (
        t.is_alpha
        and not t.is_space
        and not t.is_punct
        and not t.is_stop
        and not t.like_num
    )

def lemmatize_doc(doc):
    """Return the lemmas of the tokens in `doc` that `keep_token` accepts, in order."""
    lemmas = []
    for token in doc:
        if keep_token(token):
            lemmas.append(token.lemma_)
    return lemmas

def preprocess(texts, no_below=1e-3, no_above=.2):
    """Build a dense TF-IDF table (documents x lemmas) from raw texts.

    Parameters
    ----------
    texts : iterable of str
        Raw documents. Each one is run through the module-level spaCy
        pipeline `nlp` and reduced to a list of lemmas by `lemmatize_doc`.
    no_below : int or float, default 1e-3
        Minimum document frequency a lemma needs in order to be kept.
        Values >= 1 are handed to gensim unchanged as an absolute document
        count. A value strictly between 0 and 1 is read as a fraction of the
        number of documents and converted to a count (rounded up, at least 1).
        gensim documents this argument as an absolute count, so a fractional
        value passed straight through cannot exclude any lemma that occurs
        at all.
    no_above : float, default 0.2
        Forwarded unchanged to `Dictionary.filter_extremes`.

    Returns
    -------
    pandas.DataFrame
        One row per input document and one column per retained lemma, filled
        with the TF-IDF weights. An empty DataFrame when `texts` is empty.
    """
    # nlp.pipe processes the texts as a stream instead of one call per text.
    docs = [lemmatize_doc(doc) for doc in nlp.pipe(texts)]
    if not docs:
        # np.vstack raises on an empty sequence; there is nothing to vectorise.
        return pd.DataFrame()

    if 0 < no_below < 1:
        # Translate the fractional threshold into the document count gensim expects.
        no_below = max(1, int(np.ceil(no_below * len(docs))))

    docs_dict = Dictionary(docs)
    docs_dict.filter_extremes(no_below=no_below, no_above=no_above)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)

    n_terms = len(docs_dict)
    tfidf_matrix = np.vstack(
        [sparse2full(vec, n_terms) for vec in model_tfidf[docs_corpus]]
    )

    # Column j of the matrix belongs to token id j, so look the labels up by id
    # rather than relying on the iteration order of the token2id mapping.
    vocabulary = [docs_dict[token_id] for token_id in range(n_terms)]

    return pd.DataFrame(tfidf_matrix, columns=vocabulary)

# Vectorise every article body into TF-IDF features.
tfidf_df = preprocess(df.loc[:, 'content'])
print(tfidf_df.shape)
tfidf_df.head()