## 1.0 Intro

This notebook converts a Polish morphosyntactic dictionary into a Python-readable form and then uses it to normalize and lemmatize all songs.

In [1]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import json
import os
from tqdm import tqdm
from collections import Counter
import numpy as np
import csv
import string
import re

You need a Polish morphosyntactic dictionary; you can find one at (link needed).

In [2]:
# Root directory of the lyrics corpus. The code below reads from "ok_lyrics/"
# and writes to "stm_lyrics/" underneath it. Paths are built with plain string
# concatenation, so the trailing slash is required.
MAIN_PATH = "/home/jack/datasets/polish_rap/"
# Directory holding the dictionary files named below (trailing slash required).
DICTIONARIES_PATH = "/home/jack/datasets/polish_dictionaries/"

# Source dictionary: read as a ";"-separated CSV without a header row.
MORPHOSYNTACTIC_DICT = "slownik_morfosyntaktyczny.csv"
# JSON cache of the normalized dictionary built from the CSV above.
NORMALIZED_DICT = "slownik_znormalizowany.json"

# JSON file with the strong-language words (input of lemmatize_strongs).
STRONG_LANGUAGE_LIST = "strong_language.json"
# JSON file with the lemmatized strong-language words (output of lemmatize_strongs).
STRONG_LANGUAGE_LEMMATIZED = "strong_language_lem.json"

## 1.1 Normalizing morphosyntactic dictionary 

In [3]:
def get_normalized_dict(force=False):
    """Return the normalized lemmatization dictionary, building it if needed.

    If the JSON cache at ``DICTIONARIES_PATH + NORMALIZED_DICT`` exists and
    ``force`` is false, the cache is loaded and returned unchanged. Otherwise
    the morphosyntactic CSV is read, column 2 is discarded, column 1 is mapped
    to column 0, both sides are passed through ``normalize``, and the result
    is written to the cache file and returned.

    Note: the rebuild path calls the module-level ``normalize`` function, so
    ``normalize`` must already be defined when the cache is missing or
    ``force=True`` is used.

    Args:
        force: Rebuild the dictionary from the CSV even if a cache exists.

    Returns:
        dict mapping normalized column-1 entries to normalized column-0 entries.

    Raises:
        Exception: If the morphosyntactic CSV does not exist.
        AssertionError: If one of the built-in sanity checks fails.
    """
    cache_path = DICTIONARIES_PATH + NORMALIZED_DICT
    csv_path = DICTIONARIES_PATH + MORPHOSYNTACTIC_DICT

    if os.path.exists(cache_path) and not force:
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def lemmatize_df(word):
        # DataFrame-based lookup, only used by the sanity check below.
        t = slownik_pl[slownik_pl[1] == word]
        return t[0].to_string(index=False).split()[0] if not t.empty else word

    if not os.path.exists(csv_path):
        raise Exception("There is no morphosyntactic dictionary in the path you specified.")

    # The CSV is opened with the platform default encoding, exactly as before.
    # NOTE(review): confirm the file's actual encoding before pinning one here.
    with open(csv_path, "r") as f:
        slownik_pl = pd.read_csv(f, sep=";", header=None)

    # Column 2 is not used; only columns 0 and 1 are needed.
    slownik_pl = slownik_pl.drop(2, axis=1)

    # sanity checks
    assert slownik_pl[slownik_pl[1] == "ul"][0].to_string(index=False).split()[0] == "ul"
    assert lemmatize_df("poszukiwani") == "poszukiwany"

    slownik_pl_dict = slownik_pl.set_index(1)[0].to_dict()
    # Keys that normalize to the same string collapse; the last one wins.
    slownik_pl_dict = {normalize(k): normalize(v) for k, v in slownik_pl_dict.items()}

    # Write to the same path the cache is read from (was a hard-coded literal).
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(slownik_pl_dict, f, ensure_ascii=False)

    return slownik_pl_dict

In [4]:
# Load the cached normalized dictionary, or build it from the CSV if no cache exists.
slownik_pl_dict = get_normalized_dict()

In [5]:
def lemmatize(dictionary, word):
    """Look ``word`` up in ``dictionary``; return it unchanged when it is absent."""
    return dictionary.get(word, word)

In [6]:
def remove_noise(word):
    """Return "" for noise tokens, otherwise return ``word`` unchanged.

    A token counts as noise when it contains "google" (e.g. leftover ad
    markup) or is longer than 30 characters.
    """
    is_noise = "google" in word or len(word) > 30
    return "" if is_noise else word

In [7]:
# Characters that survive normalization: ASCII lowercase plus Polish diacritics.
good_letters = set(string.ascii_lowercase + "ąćłóężźńś")


def normalize(word):
    """Lowercase ``word`` and drop every character that is not in ``good_letters``."""
    kept = [char for char in word.lower() if char in good_letters]
    return "".join(kept)

In [8]:
# Sanity check: the normalized dictionary must map "poszukiwani" to "poszukiwać".
assert(lemmatize(slownik_pl_dict, "poszukiwani") == "poszukiwać")

In [9]:
# Spot check: display what the dictionary returns for "zapalisz".
lemmatize(slownik_pl_dict, "zapalisz")

'zapalić'

## 1.2 Lemmatizing all songs 

In [10]:
def optionally_mkdir(path):
    """Create the directory ``path`` unless something already exists there."""
    if os.path.exists(path):
        return
    os.mkdir(path)

In [11]:
def lemmatize_all(force=False):
    """Normalize and lemmatize the lyrics of every song under "ok_lyrics/".

    For each rapper directory in ``MAIN_PATH + "ok_lyrics/"`` and each song
    JSON file inside it, the "lyrics" field is split on newlines and on single
    spaces, every token is passed through ``normalize``, ``lemmatize`` (using
    the module-level ``slownik_pl_dict``) and ``remove_noise``, and the song
    is written under the same relative path in ``MAIN_PATH + "stm_lyrics/"``.
    All other fields of the song JSON are written back unchanged.

    Output format (unchanged from the original implementation): every token,
    including empty ones, is followed by one space, and every line is followed
    by a newline. Lines therefore end with a trailing space and the text ends
    with a newline.

    Args:
        force: Re-process songs even if their output file already exists.
    """
    source_root = MAIN_PATH + "ok_lyrics/"
    target_root = MAIN_PATH + "stm_lyrics/"

    rappers = os.listdir(source_root)
    optionally_mkdir(target_root)

    for rapper in tqdm(rappers):
        songs = os.listdir(source_root + rapper)
        optionally_mkdir(target_root + rapper)
        for song in songs:
            target_path = target_root + rapper + "/" + song
            if os.path.exists(target_path) and not force:
                continue  # already processed

            with open(source_root + rapper + "/" + song, "r", encoding="utf-8") as f:
                song_json = json.load(f)

            # Collect processed lines and join once instead of growing a
            # string with += inside the nested loops.
            new_lines = []
            for line in song_json["lyrics"].split("\n"):
                words = (
                    remove_noise(lemmatize(slownik_pl_dict, normalize(word)))
                    for word in line.split(" ")
                )
                new_lines.append("".join(word + " " for word in words) + "\n")
            song_json["lyrics"] = "".join(new_lines)

            with open(target_path, "w", encoding="utf-8") as f:
                json.dump(song_json, f, ensure_ascii=False)

In [12]:
# %%timeit
# (the cell magic above is left disabled)
# force=True re-processes every song even if its output file already exists.
lemmatize_all(force=True)

100%|██████████| 82/82 [00:07<00:00, 10.85it/s]


## 1.3 Lemmatizing dictionary of strong vocabulary

In [13]:
def lemmatize_strongs():
    """Lemmatize the strong-language word list and save the unique lemmas.

    Reads ``DICTIONARIES_PATH + STRONG_LANGUAGE_LIST``, looks every entry up
    with ``lemmatize`` in the module-level ``slownik_pl_dict``, removes
    duplicates, and writes the result as a JSON list to
    ``DICTIONARIES_PATH + STRONG_LANGUAGE_LEMMATIZED``.

    The lemmas are written in sorted order so that repeated runs produce the
    same file; converting a set to a list directly gives an arbitrary order.
    """
    with open(DICTIONARIES_PATH + STRONG_LANGUAGE_LIST, "r", encoding="utf-8") as f:
        strongs = json.load(f)

    # The set removes duplicates when several entries share one lemma.
    new_strongs = sorted({lemmatize(slownik_pl_dict, strong) for strong in strongs})

    with open(DICTIONARIES_PATH + STRONG_LANGUAGE_LEMMATIZED, "w", encoding="utf-8") as f:
        json.dump(new_strongs, f, ensure_ascii=False)
    

In [14]:
# Write the lemmatized strong-language list to STRONG_LANGUAGE_LEMMATIZED.
lemmatize_strongs()