# Export Lyrics for Trend Analysis

In [1]:
import json
import pathlib

from collections import defaultdict
import math

# Define Parameters

In [2]:
# Location of the corpus JSON file and the root directory for the exports.
path_to_corpus = "../data/rebetiko_corpus.json"
output_path = pathlib.Path("../data/subcorpora/trend_analysis/")

# Ensure the export root exists: missing parent directories are created and
# an already existing directory is not treated as an error.
output_path.mkdir(parents=True, exist_ok=True)

## Load the Rebetiko Corpus

In [3]:
# Decode the corpus explicitly as UTF-8. Without an explicit encoding, open()
# falls back to the platform's locale encoding, which can fail or corrupt
# non-ASCII text on systems where that default is not UTF-8.
with open(path_to_corpus, encoding="utf-8") as f:
    corpus_data = json.load(f)

# The entries iterated by the export steps live under the "RECORDS" key.
corpus_data = corpus_data["RECORDS"]

## Export Lyrics by Year

In [4]:
# Maps a year to the concatenated lyrics of all songs from that year; each
# song's lyrics are followed by a blank line.
lyrics_by_year = defaultdict(str)

adapted_output_path = output_path / "lyrics_by_year"
adapted_output_path.mkdir(exist_ok=True, parents=True)

for song in corpus_data:
    year = song["year"]
    song_lyrics = song["lyrics"]

    # Songs lacking lyrics or a year cannot be assigned to a year file.
    if song_lyrics is None or year is None:
        continue

    # Only the years 1900 through 2000 (inclusive) are exported.
    if 1900 <= year <= 2000:
        lyrics_by_year[year] += song_lyrics + "\n\n"

# One text file per year, named "<year>.txt".
for year, lyrics in lyrics_by_year.items():
    filename = str(year) + ".txt"

    # `with` closes the file even if the write fails; UTF-8 is set explicitly
    # so the output does not depend on the platform's default encoding.
    with open(adapted_output_path / filename, "w", encoding="utf-8") as output_file:
        output_file.write(lyrics)

## Export Lyrics by Epoch

In [5]:
# Map the first year of a decade / five-year period to the concatenated
# lyrics of all songs falling into that period.
lyrics_by_decade = defaultdict(str)
lyrics_by_quinquennial = defaultdict(str)

for song in corpus_data:
    year = song["year"]
    song_lyrics = song["lyrics"]

    # Skip songs without a year, without lyrics, or with a non-positive year.
    if year is None or song_lyrics is None:
        continue
    if year <= 0:
        continue

    # Round down to the start of the enclosing period, e.g. 1937 -> 1930 for
    # decades and 1937 -> 1935 for five-year periods. Integer floor division
    # yields the same buckets as the former float-based ceil computation.
    decade = int(year) // 10 * 10
    quinquennial = int(year) // 5 * 5

    lyrics_by_decade[decade] += song_lyrics + "\n\n"
    lyrics_by_quinquennial[quinquennial] += song_lyrics + "\n\n"

adapted_output_path = output_path / "decades"
adapted_output_path.mkdir(exist_ok=True, parents=True)

# One file per decade, named "<first year>-<last year>.txt".
for decade, lyrics in lyrics_by_decade.items():
    filename = str(decade) + "-" + str(decade + 9) + ".txt"

    # `with` closes the file even if the write fails; UTF-8 is set explicitly
    # so the output does not depend on the platform's default encoding.
    with open(adapted_output_path / filename, "w", encoding="utf-8") as output_file:
        output_file.write(lyrics)

adapted_output_path = output_path / "quinquennials"
adapted_output_path.mkdir(exist_ok=True, parents=True)

# One file per five-year period, named "<first year>-<last year>.txt".
for quinquennial, lyrics in lyrics_by_quinquennial.items():
    filename = str(quinquennial) + "-" + str(quinquennial + 4) + ".txt"

    with open(adapted_output_path / filename, "w", encoding="utf-8") as output_file:
        output_file.write(lyrics)


## Write Lyrics for Defined Timespans

In [6]:
# Inclusive (first_year, last_year) ranges to export. Ranges may overlap each
# other, but each range must be listed only once: the lyrics are accumulated
# per range, so a repeated entry would duplicate every song in its file.
timespans = [
    (1922, 1932),
    (1906, 1932),
    (1906, 1935),
    (1933, 1935),
    (1936, 1941),
    (1942, 1945),
    (1942, 1946),
    (1946, 1946),
    (1947, 1960),
    (1942, 1960),
    (1946, 1960),
    (1960, 1974),
    (1960, 1979),
    (1947, 1974),
    (1942, 1974),
    (1980, 1992),
    (1974, 1992),
    (1900, 1909),
    (1910, 1919),
    (1920, 1929),
    (1930, 1939),
    (1940, 1949),
    (1950, 1959),
    (1960, 1969),
    (1970, 1979),
    (1980, 1989),
    (1990, 1999),
]

# Maps a (first_year, last_year) tuple to the concatenated lyrics of all
# songs whose year lies inside that range.
lyrics_by_timespan = defaultdict(str)

# dict.fromkeys() drops repeated ranges while keeping their order, so an
# accidentally duplicated entry can never double the lyrics of a range.
for timespan in dict.fromkeys(timespans):
    first_year, last_year = timespan

    matching_lyrics = [
        song["lyrics"] + "\n\n"
        for song in corpus_data
        if song["lyrics"] is not None
        and song["year"] is not None
        and first_year <= song["year"] <= last_year
    ]

    # Ranges without any matching song get no entry and therefore no file.
    if matching_lyrics:
        lyrics_by_timespan[timespan] = "".join(matching_lyrics)

adapted_output_path = output_path / "timespans"
adapted_output_path.mkdir(exist_ok=True, parents=True)

# One file per range, named "<first_year>-<last_year>.txt".
for timespan, lyrics in lyrics_by_timespan.items():
    filename = str(timespan[0]) + "-" + str(timespan[1]) + ".txt"

    # `with` closes the file even if the write fails; UTF-8 is set explicitly
    # so the output does not depend on the platform's default encoding.
    with open(adapted_output_path / filename, "w", encoding="utf-8") as output_file:
        output_file.write(lyrics)

## Write all Lyrics to Single File

In [7]:
# Concatenate the lyrics of every song that has lyrics, each followed by a
# blank line. str.join builds the result in one pass instead of growing a
# string with repeated "+=".
all_lyrics = "".join(
    song["lyrics"] + "\n\n"
    for song in corpus_data
    if song["lyrics"] is not None
)

# `with` closes the file even if the write fails; UTF-8 is set explicitly so
# the output does not depend on the platform's default encoding.
with open(output_path / "all_lyrics.txt", "w", encoding="utf-8") as output_file:
    output_file.write(all_lyrics)