## Import libraries

In [17]:
import re
from argparse import Namespace
from pathlib import Path

import pandas as pd
from rantanplan.core import get_scansion

## Helper functions

In [2]:
def clean_text(string):
    """Remove selected punctuation from ``string`` and normalise its whitespace.

    Steps, in order:

    1. Strip leading/trailing whitespace.
    2. Delete curly and angle quotation marks, ``//`` markers, commas,
       semicolons and full stops.
    3. Replace each run of tabs with one space.
    4. Collapse runs of two or more newlines into a single newline.
    5. Collapse runs of two or more spaces into one space.
    6. Strip leading/trailing whitespace again.

    Args:
        string: The raw text to clean. Must be a ``str``.

    Returns:
        The cleaned text, with single newlines between lines preserved.

    Raises:
        AttributeError: If ``string`` has no ``strip`` method (e.g. a NaN
            float coming from a missing CSV cell).
    """
    output = string.strip()
    # (old, new) pairs applied one after another, in this order.
    # Question marks, exclamation marks and hyphens are left untouched.
    replacements = (
        ("“", ""), ("”", ""), ("//", ""), ("«", ""), ("»", ""),
        (",", ""), (";", ""), (".", ""),
    )
    for old, new in replacements:
        output = output.replace(old, new)
    # Whitespace is normalised per kind rather than with a blanket r"\s+",
    # because that would also swallow the newlines that separate lines.
    output = re.sub(r'\t+', ' ', output)
    # Blank lines collapse to ONE newline. Replacing them with '' would glue
    # the last word of a line to the first word of the next one.
    output = re.sub(r'\n{2,}', '\n', output)
    output = re.sub(r' {2,}', ' ', output)
    return output.strip()

In [3]:
def get_scansion_info(text):
    """Return the rantanplan scansion for ``text``, or ``None`` on failure.

    The text is first passed through ``clean_text`` and the result is handed
    to ``get_scansion`` with ``rhyme_analysis``, ``pos_output`` and
    ``always_return_rhyme`` all enabled.

    This is deliberately best-effort: any ordinary exception raised while
    cleaning or scanning (including a non-string ``text``) yields ``None`` so
    that a single bad row does not abort a long batch run.

    Args:
        text: The raw text to analyse.

    Returns:
        Whatever ``get_scansion`` returns, or ``None`` if an exception occurred.
    """
    try:
        return get_scansion(
            clean_text(text),
            rhyme_analysis=True,
            pos_output=True,
            always_return_rhyme=True,
        )
    # `Exception`, not a bare `except`, so KeyboardInterrupt / SystemExit
    # still propagate and a multi-hour run can be interrupted.
    except Exception:
        return None

## Args

In [28]:
# Run configuration: the input CSV, the output location and the chunk size
# used by the processing cell below.
args = Namespace(
    source_csv=Path("plain_lyrics_dataset.csv"),
    filename="rantanplan-plain-lyrics",  # prefix of each generated parquet file
    save_dir=Path("rantanplan-data", "plain-lyrics"),
    chunksize=10_000,  # number of rows read from the CSV per chunk
)

Read the plain lyrics dataset file in chunks and add the rantanplan scansion of each row as a new column. Then save each chunk to its own `.parquet` file.

From [docs](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-parquet): Apache Parquet provides a partitioned binary columnar serialization for data frames. It is designed to make reading and writing data frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a variety of compression techniques to shrink the file size as much as possible while still maintaining good read performance.

In [4]:
%%time
with pd.read_csv(args.source_csv, chunksize=args.chunksize) as reader:
    for index, chunk in enumerate(reader):
        chunk["id"] = pd.to_numeric(chunk["id"], downcast="unsigned")
        chunk["author"] = chunk["author"].astype("category")
        chunk['scansion'] = [get_scansion_info(text) for text in chunk['text']]
        chunk[['author','title', 'text', 'id', 'scansion']].to_parquet(args.folder / f"{args.filename}-{index}.parquet",
                                                                       compression="gzip")

CPU times: user 2h 23min 52s, sys: 5.3 s, total: 2h 23min 57s
Wall time: 2h 24min


Check the size of the generated files:

In [None]:
%%bash -s "$args.save_dir"
# $1 is the output directory passed in from Python on the magic line above.
# Quoted so that a path containing spaces is not split into several arguments.
ls -lh "$1"