In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Installing sumy library for implementing the LSA model.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

#Ignore Future Warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Reading in the consolidated data
data = pd.read_pickle("/Users/meghnatiwari/Desktop/255_Group_Project/Myparteamproject/Text-classification-and-summarization/Data_collection/news_data.zip")
print(data.shape)
dataset = pd.DataFrame(data)

(115000, 3)


In [3]:
# Drop lines that contain the Null Values
dataset = dataset.dropna()
dataset.shape
dataset.head()

Unnamed: 0,URL,CATEGORY,content and summary
50890,http://blogs.wsj.com/moneybeat/2014/03/26/fed-...,business,[The Federal Reserve approved Ally Financial I...
50891,http://cumberlink.com/news/national/duke-share...,business,[]
50892,http://www.bizjournals.com/charlotte/blog/ener...,business,[]
50893,http://www.ky3.com/news/local/duke-energy-shar...,business,[]
50894,http://www.chem.info/news/2014/03/regulators-m...,business,[]


In [4]:
# Splitting the content and summary into 2 fields
dataset = dataset.loc[:,['CATEGORY','content and summary']]

In [5]:
dataset = dataset[dataset['content and summary'].map(lambda d: len(d)) > 0]

In [6]:
dataset.shape

(37033, 2)

In [7]:
def get_content(row):
    row = row[0]
    return row
def get_summary(row):
    row = row[1]
    return row

In [8]:
dataset['content'] = dataset['content and summary'].apply(get_content)
dataset['summary'] = dataset['content and summary'].apply(get_summary)

In [9]:
dataset.head()

Unnamed: 0,CATEGORY,content and summary,content,summary
50890,business,[The Federal Reserve approved Ally Financial I...,The Federal Reserve approved Ally Financial In...,The Federal Reserve approved Ally Financial In...
50898,business,[— Major shareholders of Duke Energy Corp. hav...,— Major shareholders of Duke Energy Corp. have...,— Major shareholders of Duke Energy Corp. have...
50900,business,[Photos taken earlier this month show that Nor...,Photos taken earlier this month show that Nort...,Photos taken earlier this month show that Nort...
50903,business,[Thanks to dogged reporting by the Associated ...,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...
50906,business,[The energy giant says it is committed to clea...,The energy giant says it is committed to clean...,The energy giant says it is committed to clean...


In [10]:
dataset.drop(['content and summary','summary'],axis=1, inplace=True)

In [11]:
dataset.head()

Unnamed: 0,CATEGORY,content
50890,business,The Federal Reserve approved Ally Financial In...
50898,business,— Major shareholders of Duke Energy Corp. have...
50900,business,Photos taken earlier this month show that Nort...
50903,business,Thanks to dogged reporting by the Associated P...
50906,business,The energy giant says it is committed to clean...


In [12]:
dataset = dataset.rename(columns= {'CATEGORY':'category'})

In [13]:
dataset.head()

Unnamed: 0,category,content
50890,business,The Federal Reserve approved Ally Financial In...
50898,business,— Major shareholders of Duke Energy Corp. have...
50900,business,Photos taken earlier this month show that Nort...
50903,business,Thanks to dogged reporting by the Associated P...
50906,business,The energy giant says it is committed to clean...


In [15]:
# Pre processing the data like removing stop words and white spaces.
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
#            gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.remove_stopwords ,
           gsp.strip_short,
           gsp.stem_text
          ]

def clean_text(dataset):
    dataset = dataset.lower()
    dataset = utils.to_unicode(dataset)
    for f in filters:
        dataset = f(dataset)
    return dataset

In [16]:
dataset["clean_text"] = dataset["content"].apply(clean_text)
dataset.head()

Unnamed: 0,category,content,clean_text
50890,business,The Federal Reserve approved Ally Financial In...,feder reserv approv alli financi inc.’ capit p...
50898,business,— Major shareholders of Duke Energy Corp. have...,major sharehold duke energi corp. call company...
50900,business,Photos taken earlier this month show that Nort...,photo taken earlier month north carolina regul...
50903,business,Thanks to dogged reporting by the Associated P...,"thank dog report associ press, know activ duke..."
50906,business,The energy giant says it is committed to clean...,energi giant sai commit clean dan river spill ...


In [17]:
# label_encoder = LabelEncoder() 
# dataset['category']= label_encoder.fit_transform(dataset['category'])
# dataset.sample(5)

Summarization using gensim 

In [18]:
dataset["Summary"] = ""
dataset.head()

Unnamed: 0,category,content,clean_text,Summary
50890,business,The Federal Reserve approved Ally Financial In...,feder reserv approv alli financi inc.’ capit p...,
50898,business,— Major shareholders of Duke Energy Corp. have...,major sharehold duke energi corp. call company...,
50900,business,Photos taken earlier this month show that Nort...,photo taken earlier month north carolina regul...,
50903,business,Thanks to dogged reporting by the Associated P...,"thank dog report associ press, know activ duke...",
50906,business,The energy giant says it is committed to clean...,energi giant sai commit clean dan river spill ...,


In [19]:
pd.options.mode.chained_assignment = None
for i,j in dataset.iterrows():
    parser = PlaintextParser.from_string(j.clean_text,Tokenizer("english"))
    summarizer_lsa = LsaSummarizer()
    summary2 = summarizer_lsa(parser.document,5)
    for sentence in summary2:
        dataset.Summary[i] = sentence

In [20]:
# The Field Summary contains the 5 line summary of each content
dataset.head()

Unnamed: 0,category,content,clean_text,Summary
50890,business,The Federal Reserve approved Ally Financial In...,feder reserv approv alli financi inc.’ capit p...,ally’ plan approv feder reserv bank lend sever...
50898,business,— Major shareholders of Duke Energy Corp. have...,major sharehold duke energi corp. call company...,want list recommend ensur compani complianc cu...
50900,business,Photos taken earlier this month show that Nort...,photo taken earlier month north carolina regul...,public simpli longer trust denr compet investi...
50903,business,Thanks to dogged reporting by the Associated P...,"thank dog report associ press, know activ duke...",tar heel ought justifi outrag lobbyist allow p...
50906,business,The energy giant says it is committed to clean...,energi giant sai commit clean dan river spill ...,"""if issu need addressed, we’ll care immediatel..."
