In [1]:
import re

import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.parsing.preprocessing import *

In [2]:
# Load the economic-news articles; the CSV is decoded as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='data/Full-Economic-News-DFE-839861.csv',
    encoding='ISO-8859-1',
)

In [3]:
def remove_tag(s):
    """Replace every HTML line-break tag in `s` with a single space.

    Besides the literal '</br>', the common spellings '<br>', '<br/>' and
    '<br />' are matched in any letter case, so neighbouring words are not
    left glued to a tag before the text is split into sentences.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        `s` with each line-break tag replaced by one space.
    """
    return re.sub(r'<\s*/?\s*br\s*/?\s*>', ' ', s, flags=re.IGNORECASE)
def preprocess(s):
    """Split `s` into sentences and return one cleaned string per sentence.

    Line-break tags are removed first, the text is split with `sent_tokenize`,
    and every sentence is passed through `preprocess_string`; the resulting
    tokens are joined back together with single spaces.
    """
    sentences = sent_tokenize(remove_tag(s))
    return [' '.join(preprocess_string(sentence)) for sentence in sentences]

# Flatten the per-article sentence lists into one Series with a fresh 0..n-1 index.
corpus = (
    df['text']
    .apply(preprocess)
    .explode()
    .reset_index(drop=True)
)

In [4]:
# Display the flattened sentence corpus (one preprocessed sentence per row).
corpus

0        new york yield certif deposit offer major bank...
1        small denomin consum cd sold directli bank ave...
2        month consum deposit averag yield sank week ac...
3        bank banxquot survei citibank new york corest ...
4        declin somewhat smaller year consum cd eas ban...
                               ...                        
80262    compani said call dai shutdown schedul plant week
80263    new feed onûó strong feel broker investor pri...
80264                                                point
80265    session dow climb level technician consid big ...
80266    despit strong tone ralli earli sessionûó dow ...
Name: text, Length: 80267, dtype: object

In [5]:
# Bag-of-bigrams representation: ngram_range=(2, 2) keeps word pairs only.
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    analyzer='word',
)
X = vectorizer.fit_transform(corpus)
# (number of sentences, number of distinct bigrams)
X.shape

(80267, 474198)

In [6]:
# Fit a 100-component truncated SVD on the bigram counts; the seed is fixed
# so the fit is repeatable.
svd = TruncatedSVD(
    random_state=42,
    n_components=100,
)
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
             random_state=42, tol=0.0)

In [7]:
# Take one article and embed it sentence by sentence.
sample = df['text'][1]
# Raw sentences, kept for display and word counting.
s = sent_tokenize(remove_tag(sample))
# One row per preprocessed sentence, projected with the fitted SVD.
U = svd.transform(
    vectorizer.transform(preprocess(sample))
)
N = U.shape[0]

In [8]:
def _count_words(sentence):
    """Count the tokens of `sentence` that contain at least one letter or digit.

    `word_tokenize` returns punctuation marks as tokens of their own; those
    are skipped so they do not inflate the word count.
    """
    return sum(
        any(ch.isalnum() for ch in token)
        for token in word_tokenize(sentence)
    )

# otypes is given explicitly so the vectorised counter also accepts an empty
# list of sentences (np.vectorize cannot infer the output type from no input).
word_counter = np.vectorize(_count_words, otypes=[int])
wc = word_counter(s)

In [9]:
def mask(N, idx):
    """Return a length-N boolean array that is False at `idx` and True elsewhere."""
    keep = np.full(N, True)
    keep[idx] = False
    return keep

# Greedy sentence selection for `sample`.
#
# Start from the sentence farthest from the centroid of the embeddings (p) and
# the sentence farthest from p (q); then keep adding the sentence whose
# embedding is farthest from the subspace spanned by the basis B, stopping as
# soon as the next pick would push the summary past the word budget L.
#
# Results: `idx` (selected row indices, in pick order) and `S` (their texts).

def dist(u, B):
    """Norm of what is left of `u` after removing its components along the rows of B.

    `u` may be a single vector or a matrix of row vectors (one norm per row is
    returned in that case). The value equals the Euclidean distance to
    span(B) only when the rows of B are orthonormal.
    """
    return np.linalg.norm(u - (u @ B.T) @ B, axis=-1)


def _unit_residual(v, B, eps=1e-12):
    """Unit vector along the part of `v` orthogonal to the rows of B.

    Returns None when that part is numerically zero, so the caller never
    divides by a zero norm and never puts NaNs into the basis.
    """
    residual = v - (v @ B.T) @ B
    norm = np.linalg.norm(residual)
    if norm <= eps:
        return None
    return residual / norm


R = U
S, B = set(), np.empty((0, U.shape[1]))
idx = []        # rows of R / s that were selected, in pick order
excluded = []   # rows that may not be picked again (selected or duplicate text)

L = 100         # word budget for the summary

c = R.sum(axis=0) / N

# First pick: the sentence farthest from the centroid.
p = int(np.argmax(np.linalg.norm(R - c, axis=1)))
S.add(s[p])
idx.append(p)
excluded.append(p)
total_length = wc[p]

# Second pick: the sentence farthest from the first one.
q = int(np.argmax(np.linalg.norm(R - R[p], axis=1)))
if q != p:
    # q == p happens when no row differs from R[p] (e.g. a single sentence);
    # the sentence must then not be added or counted a second time.
    S.add(s[q])
    idx.append(q)
    excluded.append(q)
    total_length += wc[q]

b_0 = _unit_residual(R[q], B)
if b_0 is not None:
    B = np.append(B, b_0.reshape(1, -1), axis=0)

for i in range(N - 2):
    candidates = np.flatnonzero(mask(N, excluded))
    if candidates.size == 0:
        break
    # argmax is a position inside the candidate subset; map it back to a row
    # of R so that s[r], wc[r] and R[r] all refer to the same sentence.
    r = int(candidates[np.argmax(dist(R[candidates], B))])
    if total_length + wc[r] > L:
        break
    # Exclude r whether or not it is kept, so the loop always makes progress.
    excluded.append(r)
    if s[r] not in S:
        S.add(s[r])
        idx.append(r)
        total_length += wc[r]

        # Orthogonalise against the current basis before appending, keeping
        # the rows of B orthonormal as `dist` requires.
        b_r = _unit_residual(R[r], B)
        if b_r is not None:
            B = np.append(B, b_r.reshape(1, -1), axis=0)

idx, S

([0, 3],
 {"The State Children's Health Insurance Program was created in 1997 to help children whose families couldn't afford insurance but didn't qualify for Medicaid, and administration officials tell the New York Times that the changes are aimed at returning the program to its low- income focus and assuring it didn't become a replacement for private insurance.",
  "The Wall Street Journal Online  The Morning Brief, a look at the day's biggest news, is emailed to subscribers by 7 a.m. every business day."})