In [1]:
import datetime
import json
import pickle
from pathlib import Path

import dateparser
import pandas as pd
from cleantext import clean
from german_lemmatizer import lemmatize
from joblib import Parallel, delayed
from somajo import SentenceSplitter, Tokenizer
# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
from german import preprocess

# combine files

In [2]:
data = []
for f in Path("/mnt/data2/ptf/final_zo").glob("*.json"):
    with open(f) as inp:
        for line in tqdm(inp):
            d = json.loads(line)
            if "comments" in d:
                for c in d["comments"]:
                    data.append({"url": d["url"], **c})

3328199it [00:38, 86966.48it/s]
3096it [00:00, 72439.03it/s]
1344582it [00:37, 35984.74it/s]


In [3]:
len(data)

15384667

In [4]:
df = pd.DataFrame(data)

# parse relative dates

In [5]:
def parse(x):
    # date when crawled 11th June 2019
    d = datetime.datetime(2019, 6, 11, 12, 0)
    idx = x.find('—')
    return dateparser.parse(x[idx:], languages=['de'], settings={'RELATIVE_BASE': d})

In [6]:
parsed = Parallel(n_jobs=4)(delayed(parse)(i) for i in tqdm(df['date'].values))

100%|██████████| 15384667/15384667 [3:10:19<00:00, 1347.22it/s]


In [8]:
df['date'] = parsed

In [10]:
df.to_pickle('parsed_data.pkl')

In [2]:
df = pd.read_pickle('parsed_data.pkl')

# group into chunks

In [3]:
df = df.sort_values(by='date', ascending=False)

In [4]:
df.shape

(15384667, 3)

In [5]:
df = df.drop_duplicates(subset=['text', 'date'])

In [6]:
df

Unnamed: 0,date,text,url
2145505,2019-06-11 11:59:50,"""Does a form of capitalism that generates incr...",https://www.zeit.de/wirtschaft/2019-05/kapital...
5176690,2019-06-11 11:59:49,Seehofer hat wiedermal einen Seehofer-Witz gem...,https://www.zeit.de/politik/deutschland/2019-0...
2368511,2019-06-11 11:59:48,"""In Deutschland wandern rund vier Tonnen Leben...",https://www.zeit.de/wirtschaft/2019-06/muellve...
6368951,2019-06-11 11:59:44,Vom Niveau nur knapp über dem Champions League...,https://www.zeit.de/sport/2019-06/uefa-nations...
2010560,2019-06-11 11:59:32,Mooomennt da kommt mir eine Idee...\nBäume mac...,https://www.zeit.de/wirtschaft/2019-06/klimasc...
1488465,2019-06-11 11:59:28,"Verständlich, dass sich Medien nach fast 20 Ja...",https://www.zeit.de/politik/deutschland/2019-0...
2368365,2019-06-11 11:59:11,So schlimm ist es nicht.\nAber leider orientie...,https://www.zeit.de/politik/deutschland/2019-0...
5765230,2019-06-11 11:59:07,Dieses Täuschungsmanöver..\nwäre bei keinem Ha...,https://www.zeit.de/sport/2019-06/sc-paderborn...
5367797,2019-06-11 11:59:05,"Der Kommentar, auf den Sie Bezug nehmen, wurde...",https://www.zeit.de/sport/2019-06/deutsche-fus...
2368643,2019-06-11 11:59:02,Mein Wattebausch kann sogar etwas aufnehmen un...,https://www.zeit.de/politik/ausland/2019-06/ge...


In [7]:
groups = pd.qcut(df['date'], 10)

In [8]:
groups

2145505           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5176690           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2368511           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
6368951           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2010560           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
1488465           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2368365           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5765230           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5367797           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2368643           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5367798           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2368611           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5709924           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2010174           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5862563           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
2145687           (2019-01-11 12:00:00, 2019-06-11 11:59:50]
5456641           (2019-

In [9]:
df['year'] = df['date'].apply(lambda x: x.year)

In [10]:
df['year'].value_counts()

2018    3257309
2017    2367352
2016    1680063
2019    1333176
2015    1064534
2014     751005
2013     494623
2012     481302
2011     427243
2010     226579
2009      79962
2008      22065
2006       3493
2007       1974
2005        102
Name: year, dtype: int64

In [11]:
df['group'] = df['year'].apply(lambda x: x if x > 2010 else 2010)

In [12]:
df['group'].value_counts()

2018    3257309
2017    2367352
2016    1680063
2019    1333176
2015    1064534
2014     751005
2013     494623
2012     481302
2011     427243
2010     334175
Name: group, dtype: int64

# clean, split into sentences

In [13]:
# def get_sents(texts):
#     tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
#     sentence_splitter = SentenceSplitter(is_tuple=False)
    
#     results = []
#     for text in texts:
#         text = clean(text, lang='de', lower=False)
#         tokens = tokenizer.tokenize_paragraph(text)
#         sentences = sentence_splitter.split(tokens)
#         cleaned = [' '.join(s) for s in sentences]
#         results.append(cleaned)
#     return results

In [14]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [15]:
def combine(li):
    for l in li:
        for x in l:
            yield x

In [16]:
# results = Parallel(n_jobs=4)(delayed(get_sents)(row) for row in tqdm(list(chunks(df['text'], 10000))))

In [17]:
# pickle.dump( results, open( "/mnt/data2/results_sentes.pkl", "wb" ) )

In [18]:
# results = pickle.load( open( "/mnt/data2/results_sentes.pkl", "rb" ) )

In [19]:
# df['sents'] = list(combine(results))

In [20]:
sents_data = []

In [21]:
# for _, row in tqdm(df[['group', 'sents']].iterrows(), total=df.shape[0]):
#     for s in row['sents']:
#         sents_data.append({'text': s, 'group': row['group']})

In [22]:
len(sents_data)

0

In [23]:
# df_sents = pd.DataFrame(sents_data)

In [24]:
# df_sents

In [25]:
df

Unnamed: 0,date,text,url,year,group
2145505,2019-06-11 11:59:50,"""Does a form of capitalism that generates incr...",https://www.zeit.de/wirtschaft/2019-05/kapital...,2019,2019
5176690,2019-06-11 11:59:49,Seehofer hat wiedermal einen Seehofer-Witz gem...,https://www.zeit.de/politik/deutschland/2019-0...,2019,2019
2368511,2019-06-11 11:59:48,"""In Deutschland wandern rund vier Tonnen Leben...",https://www.zeit.de/wirtschaft/2019-06/muellve...,2019,2019
6368951,2019-06-11 11:59:44,Vom Niveau nur knapp über dem Champions League...,https://www.zeit.de/sport/2019-06/uefa-nations...,2019,2019
2010560,2019-06-11 11:59:32,Mooomennt da kommt mir eine Idee...\nBäume mac...,https://www.zeit.de/wirtschaft/2019-06/klimasc...,2019,2019
1488465,2019-06-11 11:59:28,"Verständlich, dass sich Medien nach fast 20 Ja...",https://www.zeit.de/politik/deutschland/2019-0...,2019,2019
2368365,2019-06-11 11:59:11,So schlimm ist es nicht.\nAber leider orientie...,https://www.zeit.de/politik/deutschland/2019-0...,2019,2019
5765230,2019-06-11 11:59:07,Dieses Täuschungsmanöver..\nwäre bei keinem Ha...,https://www.zeit.de/sport/2019-06/sc-paderborn...,2019,2019
5367797,2019-06-11 11:59:05,"Der Kommentar, auf den Sie Bezug nehmen, wurde...",https://www.zeit.de/sport/2019-06/deutsche-fus...,2019,2019
2368643,2019-06-11 11:59:02,Mein Wattebausch kann sogar etwas aufnehmen un...,https://www.zeit.de/politik/ausland/2019-06/ge...,2019,2019


In [45]:
# df_sents.to_pickle('/mnt/data2/results_sents.pkl')

In [2]:
# df_sents = pd.read_pickle('/mnt/data2/results_sents.pkl')

# Lemmatize

In [26]:
df_sents = df
del df

In [27]:
df_sents = df_sents[df_sents['text'].str.len() > 10]

In [28]:
df_sents.shape

(12147166, 5)

In [29]:
final = preprocess(df_sents['text'].values)

100%|██████████| 12147166/12147166 [56:14<00:00, 3599.88it/s]
0it [00:00, ?it/s]
  0%|          | 0/1215 [00:00<?, ?it/s][A
  0%|          | 1/1215 [00:01<21:42,  1.07s/it][A
  1%|          | 8/1215 [00:41<49:34,  2.46s/it][A
  1%|          | 9/1215 [00:41<38:19,  1.91s/it][A
  1%|          | 10/1215 [00:42<29:55,  1.49s/it][A
  1%|          | 12/1215 [01:18<2:09:43,  6.47s/it][A
  1%|          | 13/1215 [01:18<1:31:37,  4.57s/it][A
  1%|          | 14/1215 [01:19<1:08:54,  3.44s/it][A
  1%|          | 15/1215 [01:20<56:13,  2.81s/it]  [A
  1%|▏         | 16/1215 [01:55<4:05:29, 12.28s/it][A
  1%|▏         | 17/1215 [01:55<2:55:38,  8.80s/it][A
  1%|▏         | 18/1215 [01:56<2:04:52,  6.26s/it][A
  2%|▏         | 19/1215 [01:59<1:48:21,  5.44s/it][A
  2%|▏         | 20/1215 [02:32<4:32:09, 13.67s/it][A
  2%|▏         | 21/1215 [02:32<3:12:15,  9.66s/it][A
  2%|▏         | 22/1215 [02:33<2:21:06,  7.10s/it][A
  2%|▏         | 23/1215 [02:37<2:02:20,  6.16s/it][A
  2%|▏

In [30]:
df_sents['text'] = final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
df_sents['text']

2145505    does form of capitalism that generates increas...
5176690      seehofer wiedermal seehoferwitz seehofer lachen
2368511    deutschland wandern tonne lebensmittel laut bu...
6368951    niveau knapp champion league finale strafe abs...
2010560    mooomennt idee baum co0 sauerstoff waldbesitze...
1488465    verständlich medium fast 00 absolut uneitel pr...
2368365    schlimm orientieren arbeitnehmerpolitik spd kl...
5765230       täuschungsmanöver handelsgericht durchgehendfb
5367797                      kommentar bezug nehmen entfernt
2368643    wattebausch sogar aufnehmen behalten potus übe...
5367798                          ha beweisen löw überflüssig
2368611    kühnern debatte nix bewegen idee kommen genau ...
5709924    streng nehmen steuer versuch ressource preis g...
2010174    kurz beitrag clickbaiting locken klimagegner k...
5862563                   johnson scheinen drogen berauschen
2145687    does form of capitalism that generates increas...
5456641    präparieren p

In [34]:
df_sents.to_pickle('/mnt/data2/results_full_comments_lemma.pkl')

In [35]:
len(final)

12147166

In [37]:
df_sents['group'] = df_sents['group'].apply(lambda x: x if x % 2 == 0 else x - 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
! rm /mnt/data2/ptf/groups/zo_bi_*_full.txt

In [39]:
for year, group in df_sents.groupby('group'):
    print(year, group.shape)
    Path(f'/mnt/data2/ptf/groups/zo_bi_{year}_full.txt').write_text('\n'.join(group['text'].values) + '\n')


2010 (761148, 5)
2012 (975706, 5)
2014 (1813791, 5)
2016 (4028289, 5)
2018 (4568232, 5)
