# Text Summary

In [1]:
#
# We demonstrate how to use gensim to summarize text.
# We study the BBC news text dataset from Kaggle, which
# can be found at
# https://www.kaggle.com/yufengdev/bbc-fulltext-and-category
#
# We compute the sentiment score of each text and of its
# summary, and find that the two scores can differ.
#

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the BBC articles from the CSV file in the working directory.
DATA_FILE = "bbc-text.csv"
df = pd.read_csv(DATA_FILE)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
from gensim.summarization import summarize



In [6]:
def summary(text, ratio=0.5):
    """Summarize ``text`` by delegating to gensim's ``summarize``.

    Parameters
    ----------
    text : str
        The document to summarize.
    ratio : float, default 0.5
        Forwarded unchanged to ``summarize`` as its ``ratio`` argument.

    Returns
    -------
    Whatever ``summarize`` returns for the given ``text`` and ``ratio``.
    """
    # NOTE(review): `gensim.summarization` appears to have been removed in
    # gensim 4.0 -- confirm the environment pins gensim < 4.
    condensed = summarize(text, ratio=ratio)
    return condensed

In [7]:
%time df['Summary'] = df['text'].apply(summary)

Wall time: 12.1 s


In [8]:
# Preview the first five rows, now including the Summary column.
df.head(n=5)

Unnamed: 0,category,text,Summary
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...


In [9]:
# Remove Stop words

In [10]:
from nltk.corpus import stopwords

In [11]:
# Stop-word list used by the cleaning cells below.
STOPWORD_LANGUAGE = 'english'
stop_words = stopwords.words(STOPWORD_LANGUAGE)

In [12]:
# Cleaned text for original text
%%time
df['Clean1'] = df['text'].str.replace(r'(\W|\d)',' ')
df['Clean1'] = df['Clean1'].str.replace(r'\b(' + r'|'.join(stop_words) + r')\b\s*',' ')

Wall time: 1.26 s


In [13]:
# Cleaned text for summary text
%%time 
df['Clean2'] = df['Summary'].str.replace(r'(\W|\d)',' ')
df['Clean2'] = df['Clean2'].str.replace(r'\b(' + r'|'.join(stop_words) + r')\b\s*',' ')

Wall time: 726 ms


In [14]:
# Preview the first five rows, now including the Clean1 and Clean2 columns.
df.head(n=5)

Unnamed: 0,category,text,Summary,Clean1,Clean2
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,tv future hands viewers home theatre syste...,tv future hands viewers home theatre syste...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,tigers wary farrell gamble leicester say ...,tigers wary farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership sid...,yeading face newcastle fa cup premiership sid...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,ocean twelve raids box office ocean twelve ...,ocean twelve raids box office ocean twelve ...


# Sentiment Analysis

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [16]:
# Shared analyzer instance, created lazily on the first SentimentScore call.
_vader_analyzer = None


def SentimentScore(text):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    The ``'compound'`` entry of the analyzer's ``polarity_scores`` result.
    """
    global _vader_analyzer
    # Build the analyzer once and reuse it: constructing it for every row
    # repeats the lexicon loading and dominates the runtime of `.apply`.
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    scores = _vader_analyzer.polarity_scores(text)
    return scores['compound']

In [17]:
%time df['Sentiment1'] = df['Clean1'].apply(SentimentScore)

Wall time: 18.7 s


In [18]:
%time df['Sentiment2'] = df['Clean2'].apply(SentimentScore)

Wall time: 16.6 s


In [19]:
# Preview the first five rows, now including both sentiment columns.
df.head(n=5)

Unnamed: 0,category,text,Summary,Clean1,Clean2,Sentiment1,Sentiment2
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,tv future hands viewers home theatre syste...,tv future hands viewers home theatre syste...,0.9914,0.9716
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,-0.9571,-0.9618
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,tigers wary farrell gamble leicester say ...,tigers wary farrell gamble leicester say ...,0.9531,0.9485
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership sid...,yeading face newcastle fa cup premiership sid...,0.9607,0.8885
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,ocean twelve raids box office ocean twelve ...,ocean twelve raids box office ocean twelve ...,0.9486,0.6249


In [20]:
# Per-article gap: score of the original text minus score of its summary.
df['Diff Sentiment'] = df['Sentiment1'].sub(df['Sentiment2'])

In [21]:
# Summary statistics of the original-minus-summary sentiment gap.
diff_sentiment = df['Diff Sentiment']
diff_sentiment.describe()

count    2225.000000
mean        0.060889
std         0.438725
min        -1.820700
25%        -0.010800
50%         0.016300
75%         0.106300
max         1.816700
Name: Diff Sentiment, dtype: float64