In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
#load data
df = pd.read_table('data/final/event_panama_papers_data.txt', sep='\t', encoding='utf-8', header=0, low_memory=False)
df.shape

(2295892, 37)

# Automatic Summarization
## Step 1
Narrow the corpus down to a selection pool of reasonable size for summary generation. The approaches used as:
1. Random Selection
2. TF-IDF Rank
3. Clustering
4. RQI Method (Rel., Qua., Inf.)

Issues in 1-3
* irrelevant documents
* uninformative documents
* low quality documents

## Step 2
Perform state of the art temporal-novelty selection on each selection pool

1. TF-IDF Centroid Distance
2. K-Terms

## Evaluate

## 1. Random Selection
### method:
Select k random documents from corpus.

In [11]:
summ = df[df['is_retweet'] == False].sample(25)
summ = summ.sort_values(by='created_at', ascending=True)

for _,doc in summ.iterrows():
    print(doc['created_at'] + '\t' + doc['text'])
    print()

2016-04-04 00:04:21	Panama Papers: Document leak exposes global corruption, secrets of the rich via /r/worldnews https://t.co/qCnX5BDF1v

2016-04-04 00:19:59	IMF tells Greece leak is 'nonsense': International Monetary Fund chief Christine Lagarde dismisses reports tha... https://t.co/Jy9LRO313K

2016-04-04 07:03:57	A privilege is a hesitant continuance up to spring a leak your devotion: hrcwbJt

2016-04-04 13:30:51	Panama Papers: BJP hiding something, we want time-bound inquiry, says Congress https://t.co/WWNQdIubAd https://t.co/qr3m7yBp5g

2016-04-04 13:51:24	The names in the Panama Papers https://t.co/MDRbdBqK9f #PanamaLeaks

2016-04-05 00:14:32	What You Should Know About The Panama Papers via Digg https://t.co/jbp2Zqfw71 #News

2016-04-05 05:28:07	The names in the Panama Papers https://t.co/Vtavd00CXU https://t.co/rIuXcbQOUF

2016-04-05 10:56:28	Panama Papers: Barcelona stand by star player Lionel Messi https://t.co/7EbDe1mSzw https://t.co/U47jmwQ4VR

2016-04-05 13:34:55	Embarrassin

## 2. TF-IDF Rank
### method:
Computes weights for each document using term frequencies and selects k best documents. The term frquency inverse document frequency equation computes a term importance which is higher if the term is frequent in one document but rare in the total corpus.

In [46]:
df_bow = df[(df['query'] == 'panama papers') & (df['is_retweet'] == False) & (df['text_clean'].notnull())][['created_at', 'text']]

In [47]:
#initialize sklearn vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
#tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=.01, max_features=400, stop_words='english')

#create matrix of tfidf counts
Xtfidf = tfidf_vectorizer.fit_transform(df_bow['text'])

In [48]:
#get mean tfidf for each doc
Xtfidf_means = Xtfidf.mean(axis=1)
df_tfidf_means = pd.DataFrame(Xtfidf_means, columns=['tfidf_mean'])
#get sum tfidf for each doc
Xtfidf_sums = Xtfidf.sum(axis=1)
df_tfidf_sums = pd.DataFrame(Xtfidf_sums, columns=['tfidf_sum'])

df_tfidf_stats = pd.concat([df_tfidf_means, df_tfidf_sums], axis=1)

In [49]:
df_bow = pd.merge(df_bow, df_tfidf_stats, how='inner', left_index=True, right_index=True)

In [60]:
summ = df_bow.sort_values(by='tfidf_mean', ascending=False).head(25)
summ = summ.sort_values(by='created_at', ascending=True)

for _,doc in summ.iterrows():
    print(doc['created_at'] + '\t' + doc['text'])
    print()

2016-04-03 18:12:14	Giant Leak of Offshore Financial Records Exposes Global Array of Crime and Corruption #panamapapers https://t.co/9NkfUBd8a7

2016-04-03 18:26:04	This has got to stop. The Panama Papers: how the world’s rich and famous hide their money offshore https://t.co/Vl2imHqbj4

2016-04-03 21:55:56	The Panama Papers: what you need to know https://t.co/caiKg2pM0X

2016-04-03 22:41:12	Top story:Immigration detainees four tim Revealed: the $2bn offshore trail that… https://t.co/0f5NnGCGzB, see more https://t.co/pa7FfekGFk

2016-04-03 23:06:09	Papers leak exposes how Putin, Xi's friends hide money https://t.co/i3hRgNEqCB Is @TurnbullMalcolm of Cayman Is 'fame' part of this? #auspol

2016-04-03 23:11:18	Guess that's "Gotcha" for criminals Leaked documents reveal offshore accounts of the wealthy and powerful-report say https://t.co/FhWVeDW7Nb

2016-04-04 00:12:52	Here check the Pakistani looters: #NawazSharif and Maryam Nawaz's wealth leaked by #Panamapapers #Panamaleaks https://t.c

## 3. Cluster Selection
### method:
Cluster documents using tf-idf distances, select k docs / k clusters from each cluster, for an evenly distributed information pool.