# Prepare Data for Word Clouds

This notebook loads the Rebetiko Corpus and prepares the data for the generation of beautiful word clouds.

It mainly does two things:
- lemmatizes the texts
- removes stop words

For better results I have added some extra stop words myself.

In [1]:
import spacy
from spacy.language import Language
from spacy.lookups import Lookups

import pathlib
import json
from tqdm import tqdm

# Load the spaCy pipeline once; the cells below use it for tokenisation,
# stop-word / punctuation flags, part-of-speech tags and lemmas.
nlp = spacy.load("el_core_news_lg")

## Define Parameters

In [3]:
# Input: JSON file holding the Rebetiko Corpus.
path_to_corpus = "../data/rebetiko_corpus.json"

# Output: folder that receives the generated text files.
# It is created (including missing parents) if it does not exist yet.
output_path = pathlib.Path("../data/subcorpora/wordcloud/")
output_path.mkdir(parents=True, exist_ok=True)

## Define the Stop Words

In [4]:
# spaCy's built-in stop word set for the loaded language.
# Note: this is a reference to the same set object that is extended below.
default_stop_words = nlp.Defaults.stop_words

# Hand-picked extra stop words (elided forms, pronouns, very common verbs, ...)
additional_stopwords = ["μου","σου","σ\'","κι","απ\'","μ\'", "\'μ\'", "ν\'","του","σ", "γι\'", "δώσ\'",
"θά","μες","σαι","βρεις", "-","σένανε" ,"διάφορα","σύνθημα", "βλέπω","δεις", "\'ρθει", "κάν\'", "λεν\'",
"πάω","τ","μ","εσύ","εσυ","νά","\'χα","την","και","τη","είπες", "ς","μια","συ", "\'συ", "τ\'", "είν\'",
"ένα","τι","της","σένα","λες","\'χεις","πες","κάνω","κάνει","κάνεις","έχεις", "έχω","\'χω","στ\'", "είδ\'",
"π\'","θε","θες","μες","είδα","είδες","πήρες","βρω","ναι","είν","λέω","πεις", "απ","δως","τ", "\'χε", "\'ναι", "μεσ\'",
"ρθω","πάρει","σα","ε","ξέρεις","ξέρω","δω","άλλονε","ό","θ\'","έκανες", "πάει", "έχε", "είμ\'", "τό", "μπορώ",
"θέλει","ποιος","πει","\'γω","ρθει","πάρε","πάρω","μπορώ","πας","λες","λένε", "λενε", "πάν\'", "\'χει",
"βρήκα","καμιά","μένα","κάνε","γίνω","λέγω","έλεγα","είπα","τί","τι","\'πα", "βρω", "\'ν\'"]

# Extend spaCy's stop word set with the extra entries
nlp.Defaults.stop_words.update(additional_stopwords)

# Also switch on the "is_stop" flag of each extra word's vocabulary entry,
# which is what the token.is_stop checks in the processing cell read.
for stop_word in additional_stopwords:
    nlp.vocab[stop_word].is_stop = True

## Load the Rebetiko Corpus and Prepare Data for Processing

In [5]:
# Read the corpus file and keep only the list stored under its "RECORDS" key.
# The encoding is given explicitly: the corpus contains Greek text, and without
# it open() falls back to the platform default, which is not UTF-8 everywhere
# (e.g. on Windows) and would then fail or mis-decode the lyrics.
with open(path_to_corpus, encoding="utf-8") as f:
    corpus_data = json.load(f)["RECORDS"]

## Run Processing and Filter Stop Words

In [6]:
# "all_composers" is a pseudo entry that selects every song of the corpus.
list_of_composers = ["all_composers", "Βασίλης Τσιτσάνης", "Άγνωστος", "Παναγιώτης Τούντας", "Γιάννης Παπαϊωάννου", "Μανώλης Χιώτης", "Σπύρος Περιστέρης", "Κώστας Σκαρβέλης", "Μάρκος Βαμβακάρης", "Γιάννης Δραγάτσης (Ογδοντάκης)", "Απόστολος Καλδάρας", "Κώστας Ρούκουνας", "Μιχάλης Γενίτσαρης", "Απόστολος Χατζηχρήστος", "Αντώνης Διαμαντίδης (Νταλγκάς)", "Στράτος Παγιουμτζής", "Σταύρος Παντελίδης", "Γιώργος Μητσάκης", "Δημήτρης Σέμσης (Σαλονικιός)", "Κώστας Καρίπης", "Μανώλης Χρυσαφάκης", "Σταύρος Τζουανάκος", "Δημήτρης Γκόγκος (Μπαγιαντέρας)", "Ιωάννα Γεωργακοπούλου", "Αγάπιος Τομπούλης", "Στέλιος Χρυσίνης", "Γιώργος Κατσαρός (Θεολογίτης)", "Βαγγέλης Παπάζογλου", "Μπάμπης Μπακάλης", "Γεωργία Μηττάκη", "Ιάκωβος Μοντανάρης", "Γρηγόρης Ασίκης", "Στελλάκης Περπινιάδης", "Μαρίνος Γαβριήλ (Μαρινάκης)", "Παναγιώτης Πετσάς"]

# One entry per filtered output file:
#   file name suffix -> (token.pos_ values to keep, or None to keep every tag;
#                        True to write token.lemma_, False to write token.text)
OUTPUT_VARIANTS = {
    "_filtered.txt": (None, False),
    "_filtered_nouns.txt": ({"NOUN", "PROPN"}, False),
    "_filtered_lemmatized.txt": (None, True),
    "_filtered_nouns_lemmatized.txt": ({"NOUN", "PROPN"}, True),
    "_filtered_verbs_lemmatized.txt": ({"VERB"}, True),
    "_filtered_adjectives_lemmatized.txt": ({"ADJ"}, True),
    "_filtered_verbs.txt": ({"VERB"}, False),
    "_filtered_adjectives.txt": ({"ADJ"}, False),
}


def _select_lyrics(records, composer):
    """Return the lower-cased lyrics of all songs that belong to `composer`.

    Records whose "lyrics" value is None are skipped. The pseudo composer
    "all_composers" selects every remaining record; any other value selects
    the records whose "composer" string contains it (substring test).
    """
    selected = []
    for song in records:
        lyrics = song["lyrics"]
        if lyrics is None:
            continue
        if composer == "all_composers" or (
            song["composer"] is not None and composer in song["composer"]
        ):
            selected.append(lyrics.lower())
    return selected


def _filter_doc(doc):
    """Split one parsed song into the word lists of every output variant.

    Stop words and punctuation are dropped. The document is walked only once
    and every surviving token is appended to each variant whose
    part-of-speech filter it passes.

    Returns a dict mapping each suffix of OUTPUT_VARIANTS to a list of words.
    """
    words_by_suffix = {suffix: [] for suffix in OUTPUT_VARIANTS}
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        for suffix, (pos_tags, lemmatize) in OUTPUT_VARIANTS.items():
            if pos_tags is None or token.pos_ in pos_tags:
                words_by_suffix[suffix].append(token.lemma_ if lemmatize else token.text)
    return words_by_suffix


for composer in list_of_composers:
    print("Processing data for", str(composer))

    list_of_songs = _select_lyrics(corpus_data, composer)

    # "<composer>_all.txt" holds the unfiltered (lower-cased) lyrics;
    # the remaining files are filled from the spaCy analysis below.
    texts_by_suffix = {"_all.txt": list_of_songs}
    texts_by_suffix.update({suffix: [] for suffix in OUTPUT_VARIANTS})

    # nlp.pipe yields lazily, so tqdm is told the length to show real progress.
    for doc in tqdm(nlp.pipe(list_of_songs), total=len(list_of_songs)):
        for suffix, words in _filter_doc(doc).items():
            # A song that contributes no word to a variant is left out of that file.
            if words:
                texts_by_suffix[suffix].append(" ".join(words))

    # Write to Files: one file per variant, songs separated by a blank line.
    # UTF-8 is requested explicitly because the platform default encoding may
    # be unable to represent Greek text.
    for suffix, texts in texts_by_suffix.items():
        with open(output_path / (composer + suffix), "w", encoding="utf-8") as file:
            file.write("\n\n".join(texts))

0it [00:00, ?it/s]

Processing data for all_composers


5097it [03:11, 26.66it/s] 
0it [00:00, ?it/s]

Processing data for Βασίλης Τσιτσάνης


582it [00:27, 20.95it/s]
0it [00:00, ?it/s]

Processing data for Άγνωστος


498it [00:08, 58.70it/s] 
0it [00:00, ?it/s]

Processing data for Παναγιώτης Τούντας


427it [00:20, 20.75it/s]
0it [00:00, ?it/s]

Processing data for Γιάννης Παπαϊωάννου


391it [00:12, 32.45it/s]
0it [00:00, ?it/s]

Processing data for Μανώλης Χιώτης


331it [00:12, 25.74it/s]
0it [00:00, ?it/s]

Processing data for Σπύρος Περιστέρης


255it [00:07, 32.12it/s]
0it [00:00, ?it/s]

Processing data for Κώστας Σκαρβέλης


206it [00:09, 20.62it/s]
0it [00:00, ?it/s]

Processing data for Μάρκος Βαμβακάρης


156it [00:07, 21.43it/s]
0it [00:00, ?it/s]

Processing data for Γιάννης Δραγάτσης (Ογδοντάκης)


123it [00:03, 33.92it/s]
0it [00:00, ?it/s]

Processing data for Απόστολος Καλδάρας


123it [00:04, 27.99it/s]
0it [00:00, ?it/s]

Processing data for Κώστας Ρούκουνας


92it [00:02, 39.02it/s]
0it [00:00, ?it/s]

Processing data for Μιχάλης Γενίτσαρης


91it [00:03, 27.24it/s]
0it [00:00, ?it/s]

Processing data for Απόστολος Χατζηχρήστος


90it [00:03, 26.19it/s]
0it [00:00, ?it/s]

Processing data for Αντώνης Διαμαντίδης (Νταλγκάς)


79it [00:02, 27.36it/s]
0it [00:00, ?it/s]

Processing data for Στράτος Παγιουμτζής


78it [00:01, 45.66it/s]
0it [00:00, ?it/s]

Processing data for Σταύρος Παντελίδης


92it [00:02, 31.57it/s]
0it [00:00, ?it/s]

Processing data for Γιώργος Μητσάκης


77it [00:03, 21.31it/s]
0it [00:00, ?it/s]

Processing data for Δημήτρης Σέμσης (Σαλονικιός)


62it [00:02, 21.53it/s]
0it [00:00, ?it/s]

Processing data for Κώστας Καρίπης


61it [00:03, 18.74it/s]
0it [00:00, ?it/s]

Processing data for Μανώλης Χρυσαφάκης


61it [00:01, 35.28it/s]
0it [00:00, ?it/s]

Processing data for Σταύρος Τζουανάκος


59it [00:02, 23.03it/s]
0it [00:00, ?it/s]

Processing data for Δημήτρης Γκόγκος (Μπαγιαντέρας)


55it [00:02, 26.05it/s]
0it [00:00, ?it/s]

Processing data for Ιωάννα Γεωργακοπούλου


46it [00:02, 22.68it/s]
0it [00:00, ?it/s]

Processing data for Αγάπιος Τομπούλης


42it [00:00, 108.91it/s]
0it [00:00, ?it/s]

Processing data for Στέλιος Χρυσίνης


45it [00:01, 24.49it/s]
0it [00:00, ?it/s]

Processing data for Γιώργος Κατσαρός (Θεολογίτης)


38it [00:00, 43.47it/s]
0it [00:00, ?it/s]

Processing data for Βαγγέλης Παπάζογλου


31it [00:01, 17.56it/s]
0it [00:00, ?it/s]

Processing data for Μπάμπης Μπακάλης


34it [00:01, 21.04it/s]
32it [00:00, 252.11it/s]
0it [00:00, ?it/s]

Processing data for Γεωργία Μηττάκη
Processing data for Ιάκωβος Μοντανάρης


31it [00:01, 26.31it/s]
0it [00:00, ?it/s]

Processing data for Γρηγόρης Ασίκης


26it [00:01, 20.90it/s]
0it [00:00, ?it/s]

Processing data for Στελλάκης Περπινιάδης


30it [00:00, 35.11it/s]
0it [00:00, ?it/s]

Processing data for Μαρίνος Γαβριήλ (Μαρινάκης)


13it [00:00, 56.37it/s]
0it [00:00, ?it/s]

Processing data for Παναγιώτης Πετσάς


20it [00:00, 22.91it/s]
