In [8]:
import pandas as pd
import json
import spacy
import concurrent.futures
import multiprocessing
import time

In [6]:
nlp = spacy.load('de_core_news_md')

In [4]:
multiprocessing.cpu_count()

8

In [None]:
def create_sentences_for_vec(df):

    corpus = []

    for i in tqdm(df.index):
        text = df.loc[i, 'full_text']
        doc = nlp(text)

        for sent in doc.sents:
            sent_list = []
            for word in sent:
                if not word.is_stop:
                    sent_list.append(word.text)

        corpus.append(sent_list)

    return corpus


if __name__ == '__main__':
    
    cpu_count = multiprocessing.cpu_count()
    
    all_sentences = []

    print(f'Starting main')
    start = time.perf_counter()

    print('Importing df')
    df = pd.read_parquet('../data/processed/RZ_sample.parquet')
    
    print('Processing')
    with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count) as executor:
        frames = cpu_count*[df.sample(100)]
        results = [executor.submit(create_sentences_for_vec, df) for df in frames]

        for f in concurrent.futures.as_completed(results):
            print(len(f.result()))
            print(type(f.result()))
            all_sentences += f.result()
            
    with open('../temp/sentences_test.json', 'w', encoding='utf8') as f:
        json.dump(all_sentences, f)
            

    stop = time.perf_counter()

    print(f'Finished in {round(stop-start, 2)} seconds')

In [1]:
import json

In [2]:
with open('../temp/sentences_test.json', 'r', encoding='utf8') as f:
    sentences = json.load(f)

In [5]:
phrases = 0

for part in sentences:
    phrases += len(part)
    
print(phrases)

7992


In [6]:
import numpy as np

In [39]:
sentences[np.random.randint(800)]

['ae']

In [8]:
len(sentences)

800

In [10]:
sentences[2]

['—',
 'd—',
 '\n',
 'b.',
 '(',
 'Polizei',
 ')',
 'eingeliefert',
 'herre',
 '»',
 'loser',
 'Hund',
 '.']

In [45]:
def divide_df(df, n):
    
    part_len = round(len(df)/n)
    
    for i in range(n):
        yield df.iloc[i*part_len:(i+1)*part_len]    

In [40]:
import pandas as pd

In [41]:
df = pd.read_parquet('../data/processed/RZ_processed.parquet')

In [47]:
parts = divide_df(df, 8)

In [44]:
round(len(df)/2)

144852

In [58]:
next(parts)

StopIteration: 