In [156]:
import random

import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2
from itertools import islice
from collections import Counter

from span_clf import read_json_gz_lines
from headline_parser import parse_headline

In [140]:
# Gather the distinct (span texts, domain) pairs found in the first 100,000
# title records; storing them in a set collapses exact duplicates.
title_records = islice(read_json_gz_lines('data/cleaning-titles.json/'), 100000)

rows = set()
for row in tqdm(title_records):
    doc = parse_headline(row['title'])
    # Keep only spans with a truthy `clf_text`, as an ordered, hashable tuple.
    spans = tuple(s._.clf_text for s in doc._.spans if s._.clf_text)
    rows.add((spans, row['domain']))

100000it [01:07, 1475.95it/s]


In [141]:
# Build the frame in a deterministic order. Iterating a set of strings/tuples
# varies between interpreter sessions (hash randomization), which makes the
# row order -- and anything order-dependent downstream, such as sampling --
# unrepeatable. Sort by domain, then by spans; str() keeps the key comparable
# even if a record has a missing domain.
# NOTE(review): assumes the span texts inside each tuple are mutually
# comparable (presumably all strings) -- confirm against parse_headline.
ordered_rows = sorted(rows, key=lambda pair: (str(pair[1]), pair[0]))
df = pd.DataFrame(ordered_rows, columns=('spans', 'domain'))

In [142]:
# Balance the classes: downsample every domain to the size of the smallest one.
# A fixed seed makes the draw repeatable for a given row order of `df`;
# unseeded, every run selected a different sample and hence different scores.
SAMPLE_SEED = 42
min_count = df.groupby('domain').size().min()
df_sampled = df.groupby('domain').apply(
    lambda group: group.sample(min_count, random_state=SAMPLE_SEED)
)

In [143]:
# Number of distinct (spans, domain) rows collected.
len(df)

99852

In [144]:
# Size of the balanced sample (each domain contributes `min_count` rows).
len(df_sampled)

22425

In [145]:
# Turn each sampled row into a bag-of-spans dict (span text -> count) paired
# with its domain label, then split the pairs into two parallel tuples.
labelled_counts = [
    (Counter(record.spans), record.domain)
    for record in df_sampled.itertuples()
]
X, y = zip(*labelled_counts)

In [146]:
# Vectorizer that maps the per-row span-count dicts to a sparse feature matrix.
dv = DictVectorizer(sparse=True)

In [147]:
# NOTE(review): this rebinds X from the tuple of Counter dicts to the
# vectorized matrix, so re-running this cell on its own would feed the matrix
# back into the vectorizer; re-run the cell that builds X first.
X = dv.fit_transform(X)

In [148]:
# `DictVectorizer.get_feature_names()` was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer `get_feature_names_out()` and fall back to the old
# method so the notebook still runs on older installs.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = np.asarray(dv.get_feature_names_out())
else:
    feature_names = np.array(dv.get_feature_names())

In [149]:
# Chi-squared test of every span feature against the domain labels:
# `scores` holds the per-feature statistics and `p` the matching p-values.
scores, p = chi2(X, y)

In [150]:
# One row per span feature with its chi-squared statistic and p-value.
# NOTE(review): this rebinds `df`, which previously held the (spans, domain)
# rows, so earlier cells that read `df` cannot be re-run after this one.
stat_columns = ['span', 'chi', 'p']
stat_records = list(zip(feature_names, scores, p))
df = pd.DataFrame(stat_records, columns=stat_columns)

In [151]:
# Show the spans significant at p < 0.01, strongest association first.
is_significant = df['p'] < 0.01
df.loc[is_significant].sort_values(by='chi', ascending=False)

Unnamed: 0,span,chi,p
5313,dailycaller,15652.0,0.0
3504,breitbart,11340.0,0.0
4656,cnn video,4802.0,0.0
20303,the daily caller,2380.0,0.0
12704,listen now,1722.0,0.0
1986,analysis,1561.609375,0.0
24325,video,1328.436975,4.111917e-275
15737,opinion,1296.361446,3.2776720000000004e-268
16275,perspective,1064.0,2.873356e-218
20601,the latest,767.767442,8.631918999999999e-155


In [155]:
# Write every span significant at p < 0.01 to the blocklist, one per line,
# strongest chi-squared score first.
# The selection is computed before the file is opened so that a failure here
# does not leave a truncated blocklist behind.
blocklist_spans = df[df.p < 0.01].sort_values('chi', ascending=False).span
# Pin the encoding: spans may contain non-ASCII text, and the platform
# default encoding is not UTF-8 everywhere.
with open('blocklist.txt', 'w', encoding='utf-8') as fh:
    for span in blocklist_spans:
        print(span, file=fh)