### Sample data
This notebook samples 150,000 sentences from a corpus for model prediction.
The sentences are cleaned and tokenized.

### Usage
Set the path to the corpus file in the second cell and load the spaCy model that matches the corpus language.

In [None]:
import csv
import re

import numpy as np
import pandas as pd
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

In [None]:
# --- Run configuration -------------------------------------------------------

# Path of the sentence file that is read and sampled below.
corpora = "../data/corpora/eng_news_2020_1M-sentences.txt"

# spaCy pipeline used for tokenisation and lemmatisation; the commented-out
# line is the alternative model for a Swedish corpus.
# nlp = spacy.load('sv_core_news_sm')
nlp = spacy.load("en_core_web_sm")

# Number of sentences drawn from the corpus.
sample_size = 150000

# The output name embeds the corpus file name (text before its first dot)
# and the sample size.
output_file = (
    f"../data/outputs/annotation_phase_2/"
    f"SAMPLE_{corpora.rsplit('/', 1)[-1].partition('.')[0]}[{sample_size}].csv"
)

In [None]:
# Load the corpus as raw tab-separated text.
# Quoting is disabled because the sentences are free text: with pandas'
# default quote handling, a stray `"` inside a sentence opens a quoted field,
# which can merge the following lines into a single row or abort parsing.
df = pd.read_csv(
    corpora,
    delimiter="\t",
    header=None,
    names=["sentence"],
    quoting=csv.QUOTE_NONE,
)

# A fixed seed keeps the drawn sample reproducible between runs; the index is
# reset so the sample is numbered 0..sample_size-1.
sample = df.sample(sample_size, random_state=1).reset_index(drop=True)

print(sample.shape)
display(sample.head(10))

In [None]:
def get_lang_detector(nlp, name):
    """Build the language-detection component for a spaCy pipeline.

    The function has the ``(nlp, name)`` signature so it can be registered
    as a pipeline factory; neither argument is used.

    Returns:
        A new ``LanguageDetector`` instance.
    """
    detector = LanguageDetector()
    return detector

In [None]:
# Register `get_lang_detector` under the factory name "language_detector" ...
Language.factory("language_detector", func=get_lang_detector)
# ... and append that component as the last step of the loaded pipeline, so
# processed docs expose the detection result (read later as `doc._.language`).
nlp.add_pipe('language_detector', last=True)

In [None]:
# Clean every sampled sentence, drop the ones that fail a filter, and collect
# the remaining sentences together with their content-word lemmas.
ans = []
# Number of sentences dropped per filter. Only "size", "punctuation" and
# "language" are incremented in this cell.
reason = {"size": 0, "punctuation": 0, "language": 0, "pos": 0, "shape": 0}
characters = ['']
max_size = 0

# Patterns are compiled once instead of being looked up on every iteration.
disallowed_chars = re.compile(r"[^a-zA-Z0-9,.!?' ]+")
whitespace_run = re.compile(r"\s+")

# Language code a sentence must be detected as. It is taken from the loaded
# pipeline so the filter follows the model instead of a hard-coded code that
# has to be edited by hand whenever the model is swapped.
# NOTE(review): assumes the detector reports the same code as `nlp.lang`
# (e.g. "en", "sv") - confirm if another language is used.
expected_language = nlp.lang

n_sentences = len(sample["sentence"])

for n_processed, sentence in enumerate(sample["sentence"]):
    # Progress is based on sentences processed (not sentences kept), so it
    # keeps advancing while sentences are being filtered out.
    if n_processed % 100 == 0:
        print(f"{n_processed / n_sentences:.2%}", end='\r')

    # remove characters that are not: a-z, A-Z, 0-9, space, comma, period,
    # question mark, exclamation mark, apostrophe
    sentence = disallowed_chars.sub("", sentence)

    # collapse runs of whitespace into a single space
    sentence = whitespace_run.sub(" ", sentence)

    # skip overly long sentences before running the (expensive) pipeline
    if len(sentence) > 300:
        reason["size"] += 1
        continue

    doc = nlp(sentence, disable=["ner"])

    # skip if punctuation is > 25% of all tokens in the sentence
    # (an empty doc counts as 0% so it avoids a NaN mean and is rejected by
    # the language check below)
    n_tokens = len(doc)
    punct_ratio = sum(token.is_punct for token in doc) / n_tokens if n_tokens else 0.0
    if punct_ratio > 0.25:
        reason["punctuation"] += 1
        continue

    # skip if the sentence is not detected as the pipeline's language
    if doc._.language["language"] != expected_language:
        reason["language"] += 1
        continue

    ans.append({
        "sentence": sentence,
        # lemmas of all tokens that are neither stop words nor punctuation
        "lemmas": [
            token.lemma_
            for token in doc
            if not token.is_stop and not token.is_punct and token.lemma_ != ' '
        ],
    })

In [None]:
# Summary of the filtering step: sample size, sentences kept and dropped, and
# the per-filter drop counters.
print(
    f"Total sentences: {len(sample)}",
    f"Total sentences kept: {len(ans)}",
    f"Total sentences removed: {len(sample) - len(ans)}",
    f"Reasons for removal: {reason}",
    sep="\n",
)

In [None]:
# Turn the kept sentences into a table and write it, tab-separated and
# without the index column, to the configured output path.
df = pd.DataFrame(ans)
df.to_csv(path_or_buf=output_file, sep="\t", index=False)

display(df.head())

# Character length of the longest kept sentence.
longest = df["sentence"].map(len).max()