# Plot Word Occurrences Over Time

In [1]:
import re
import json
import pandas as pd
import pathlib

import plotly.express as px

from collections import defaultdict

## Parameters

In [2]:
# Location of the corpus JSON file that the analysis reads.
path_to_corpus = "../data/rebetiko_corpus.json"

# Directory that receives the exported plots; created (with any missing
# parents) up front so the later write_image calls have somewhere to write.
output_path = pathlib.Path("../outputs/plots/")
output_path.mkdir(parents=True, exist_ok=True)

## Helper Class for the Word Concepts

In [3]:
def reg_word_starts_with(s):
    """Return a regex matching a token that begins with the literal text *s*.

    *s* is escaped, so regex metacharacters in it are matched verbatim. The
    match starts at a word boundary, continues over any non-whitespace
    characters, and ends at a word boundary.
    """
    prefix = re.escape(s)
    return r'\b' + prefix + r'\S*\b'

def reg_word_ends_with(s):
    """Return a regex matching a token that ends with the literal text *s*.

    *s* is escaped, so regex metacharacters in it are matched verbatim. The
    match starts at a word boundary, may span any non-whitespace characters,
    and must finish with *s* directly followed by a word boundary.
    """
    suffix = re.escape(s)
    return rf'\b\S*{suffix}\b'

def reg_word_contains(s):
    """Return a regex matching a token that contains the literal text *s*.

    *s* is escaped, so regex metacharacters in it are matched verbatim. Any
    run of non-whitespace characters may surround *s*; the whole match is
    delimited by word boundaries.
    """
    return ''.join((r'\b\S*', re.escape(s), r'\S*\b'))

def reg_word_equal_to(s):
    """Return a regex matching the literal text *s* as a stand-alone word.

    *s* is escaped, so regex metacharacters in it are matched verbatim, and
    the match must be delimited by word boundaries on both sides.
    """
    literal = re.escape(s)
    return rf'\b{literal}\b'

class Concept:
    """A named set of regex patterns whose matches are tallied per year.

    Patterns are registered through the ``add_pattern*`` methods, texts are
    fed in through :meth:`count_occurences`, and the per-year totals are read
    back with :meth:`get_absolute_occurences` / :meth:`get_relative_occurences`.
    """

    def __init__(self, concept_name):
        """Create an empty concept labelled *concept_name*."""
        self.concept_name = concept_name
        # Regex pattern strings; every one is searched in each counted text.
        self.search_patterns = []

        # year -> number of pattern matches found in texts of that year
        self.concept_occurences_per_year = defaultdict(int)
        # year -> number of ``\w+`` tokens found in texts of that year
        self.total_word_occurences_per_year = defaultdict(int)

    # add a single pattern
    #
    # Texts are lower-cased before matching (see count_occurences), so the
    # search string is lower-cased as well; otherwise a string containing an
    # upper-case letter could never match.
    def add_pattern_word_contains_string(self, s):
        """Register a pattern for words that contain *s*."""
        self.search_patterns.append(reg_word_contains(s.lower()))

    def add_pattern_word_is_string(self, s):
        """Register a pattern for words that are exactly *s*."""
        self.search_patterns.append(reg_word_equal_to(s.lower()))

    def add_pattern_word_starts_with_string(self, s):
        """Register a pattern for words that start with *s*."""
        self.search_patterns.append(reg_word_starts_with(s.lower()))

    def add_pattern_word_ends_with_string(self, s):
        """Register a pattern for words that end with *s*."""
        self.search_patterns.append(reg_word_ends_with(s.lower()))

    # helper functions to add multiple patterns at once
    def add_patterns_word_contains_string(self, list_of_patterns):
        """Register a "contains" pattern for every string in *list_of_patterns*."""
        for pattern in list_of_patterns:
            self.add_pattern_word_contains_string(pattern)

    def add_patterns_word_is_string(self, list_of_patterns):
        """Register an "equals" pattern for every string in *list_of_patterns*."""
        for pattern in list_of_patterns:
            self.add_pattern_word_is_string(pattern)

    def add_patterns_word_starts_with_string(self, list_of_patterns):
        """Register a "starts with" pattern for every string in *list_of_patterns*."""
        for pattern in list_of_patterns:
            self.add_pattern_word_starts_with_string(pattern)

    def add_patterns_word_ends_with_string(self, list_of_patterns):
        """Register an "ends with" pattern for every string in *list_of_patterns*."""
        for pattern in list_of_patterns:
            self.add_pattern_word_ends_with_string(pattern)

    # main processing: count occurrences
    def count_occurences(self, text, year):
        """Add the word and concept counts of *text* to the totals for *year*.

        A *text* of ``None`` is ignored. Otherwise the text is lower-cased,
        its ``\\w+`` tokens are counted as words, and the matches of every
        registered pattern are summed as concept occurrences. A stretch of
        text matched by several patterns is counted once per pattern.
        """
        if text is None:
            return

        text = text.lower()

        word_count = len(re.findall(r'\w+', text))
        concept_count = sum(
            len(re.findall(pattern, text)) for pattern in self.search_patterns
        )

        self.concept_occurences_per_year[year] += concept_count
        self.total_word_occurences_per_year[year] += word_count

    def get_absolute_occurences(self, year):
        """Return the number of concept matches recorded for *year* (0 if none)."""
        # .get() instead of [] so that a read never inserts a key into the
        # defaultdict as a side effect.
        return self.concept_occurences_per_year.get(year, 0)

    def get_relative_occurences(self, year):
        """Return concept matches divided by total words for *year*.

        Returns 0 when no words were recorded for *year*.
        """
        total_words = self.total_word_occurences_per_year.get(year, 0)
        if total_words == 0:
            return 0
        return self.concept_occurences_per_year.get(year, 0) / total_words


## Load Rebetiko Corpus

In [4]:
# JSON is exchanged as UTF-8 and the search terms used in this notebook are
# Greek, so the encoding is stated explicitly instead of relying on the
# platform's locale default (which is not UTF-8 everywhere).
with open(path_to_corpus, encoding="utf-8") as f:
    corpus_data = json.load(f)

# The song entries are stored under the top-level "RECORDS" key.
corpus_data = corpus_data["RECORDS"]


## Define Word Concepts

In [5]:
# Maps concept name -> Concept. Each concept lists the exact word forms that
# are counted as an occurrence of it.
concepts = dict()

concept = Concept("φουμάρω")
concept.add_patterns_word_is_string([
    "φουμάρω", "φουμάρεις", "φουμάρει", "φουμάρουμε", "φουμάρετε", "φουμάρουν",
    "φουμέρνω", "φουμέρνεις", "φουμέρνει", "φουμέρνουμε", "φουμέρνετε", "φουμέρνουν",
    "φούμαρα", "φούμαρες", "φούμαρε", "φουμάραμε", "φουμάρατε", "φούμαραν",
    "εφούμαρα", "εφούμαρες", "εφούμαρε", "εφουμάραμε", "εφουμάρατε", "εφούμαραν",
    "εφούμερνα", "εφούμερνες", "εφούμερνε", "εφουμέρναμε", "εφουμέρνατε", "εφούμερναν",
    "φούμερνα", "φούμερνες", "φούμερνε", "φουμέρναμε", "φουμέρνατε", "φουμέρνανε",
    "φουμάρανε",
])
concepts[concept.concept_name] = concept

concept = Concept("καπνίζω")
concept.add_patterns_word_is_string([
    "καπνίζω", "καπνίζεις", "καπνίζει", "καπνίζουμε", "καπνίζετε", "καπνίζουν",
    "κάπνιζα", "κάπνιζες", "κάπνιζε", "καπνίζαμε", "καπνίζατε", "κάπνιζαν", "καπνίζανε",
    "κάπνισα", "κάπνισες", "κάπνισε", "καπνίσαμε", "καπνίσατε", "κάπνισαν",
    # "εκαπνίζαμε" was previously spelled "εκάπνιζαμε": the stress sat on the
    # fourth syllable from the end, which Greek spelling does not allow, and
    # it disagreed with the neighbouring "εκαπνίζατε" / "καπνίζαμε".
    "εκάπνιζα", "εκάπνιζες", "εκάπνιζε", "εκαπνίζαμε", "εκαπνίζατε", "εκάπνιζαν",
    # NOTE(review): this row has no "εκάπνισες" / "εκάπνισε", although the
    # matching un-prefixed forms are listed above - confirm this is intended.
    "εκάπνισα", "εκαπνίσαμε", "εκαπνίσατε", "εκάπνισαν",
    "καπνίσω", "καπνίσεις", "καπνίσει", "καπνίσουμε", "καπνίσετε", "καπνίσουν",
])
concepts[concept.concept_name] = concept

# Further concepts, currently disabled:

# concept = Concept("λουλάς")
# concept.add_patterns_word_is_string(["λουλάς", "λουλά", "λουλάδες", "λουλαδιές", "λουλάδων", "λουλα", "λουλαδάκι"])
# concepts[concept.concept_name] = concept

# concept = Concept("χασίσι")
# concept.add_patterns_word_is_string(["χασίσι", "χασίς", "χασισιού", "χασίσια", "χασισιές"])
# concepts[concept.concept_name] = concept

# concept = Concept("μαύρο")
# concept.add_patterns_word_is_string(["μαύρο", "μαύρη", "μαυράκι", "μαυράκια"])
# concepts[concept.concept_name] = concept

# concept = Concept("αργιλές")
# concept.add_patterns_word_is_string(["αργιλές", "αργιλέ", "αργιλε", "αργιλέδες", "αργιλεδάκι", "ναργιλέ"])
# concepts[concept.concept_name] = concept


## Run Analysis

In [6]:
# Feed every dated song's lyrics to each concept; songs whose year is None
# are left out because their counts could not be assigned to a year.
for song in corpus_data:
    year = song["year"]
    if year is not None:
        lyrics = song["lyrics"]
        for concept in concepts.values():
            concept.count_occurences(lyrics, year)

## Create Pandas Dataframe from Results

In [7]:
# Long-format tables: one [year, concept, count] row per year/concept pair,
# built once with absolute counts and once with relative counts.
columns = ["year", "concept", "count"]
concept_names = list(concepts.keys())

collected_data_absolute = []
collected_data_relative = []

# Only the years 1900-1999 are tabulated.
for year in range(1900, 2000):
    for concept_name in concept_names:
        tracked = concepts[concept_name]
        absolute_row = [year, concept_name, tracked.get_absolute_occurences(year)]
        relative_row = [year, concept_name, tracked.get_relative_occurences(year)]
        collected_data_absolute.append(absolute_row)
        collected_data_relative.append(relative_row)

results_abs_df = pd.DataFrame(data=collected_data_absolute, columns=columns)
results_rel_df = pd.DataFrame(data=collected_data_relative, columns=columns)

## Plot Absolute Occurrences

In [10]:
# One line per concept over the years, using the absolute counts; the figure
# is displayed and then exported as a PDF into the output directory.
fig = px.line(
    data_frame=results_abs_df,
    x="year",
    y="count",
    color="concept",
    width=1000,
    height=400,
)
fig.show()
fig.write_image(output_path / "absolute_occurrences.pdf")

## Plot Relative Occurrences

In [11]:
# One line per concept over the years, using the counts relative to the total
# number of words; the figure is displayed and then exported as a PDF into
# the output directory.
fig = px.line(
    data_frame=results_rel_df,
    x="year",
    y="count",
    color="concept",
    width=1000,
    height=400,
)
fig.show()
fig.write_image(output_path / "relative_occurrences.pdf")