In [19]:
from collections import Counter
import unicodedata
import pandas
import re
from prettytable import PrettyTable

import pandas as pd

In [45]:
# Load the Brandeis "Yiddish titles export" spreadsheet and preview its first rows.
brandeis_df = pd.read_excel(io='data/dataYiddish/brandeis/Yiddish titles export.xlsx')
brandeis_df.head(n=5)

Unnamed: 0,title,trans
0,אלבאטראס.,Albaṭros.
1,אלע ווערק / פון דוד פרישמאן.,Ale ṿerḳ / fun Daṿid Frishman.
2,עלון המורה : כלי מבטא של מורי תנועת ״השומר הצע...,Alon ha-moreh : Keli mivṭa shel more tenuʻot ʻ...
3,אמאל אין א האלבן יובל / יוסף מענדעלסאן.,Amol in a halbn yoyvl / Yosef Mendelson.
4,אמאל אין א יובל : זאמלבוך פאר בעלעטריסטיק.,Amol in a yoyvl : zamlbukh far beleṭrisṭiḳ.


In [46]:
# Convert the two columns to plain lists for easier operations.
# These two lists are the accumulators that the following cells extend
# with further (Hebrew-script, transcription) data.

heb_titles = [*brandeis_df['title']]
trnscrpt_titles = [*brandeis_df['trans']]

In [47]:
# Read the HeBIS parallel-title file and pull out every `["...", "..."]` pair.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default.
with open('data/dataYiddish/hebis/json_parallels.txt', encoding='utf-8') as f:
    text = f.read()
# Raw string: `\[` and `\]` are regex escapes, not Python string escapes; in a
# plain literal they trigger an "invalid escape sequence" warning.
# NOTE(review): the file name suggests JSON; if it really is JSON, `json.load`
# would also decode escaped characters that this regex leaves untouched --
# confirm the file format.
pairs = re.findall(r'\["(.+?)", "(.+?)"\]', text)
# In each pair the first group goes to the transcription list and the second
# group to the Hebrew-script list.
heb_titles += [pair[1] for pair in pairs]
trnscrpt_titles += [pair[0] for pair in pairs]

In [48]:
# Append the name list: column `yiddish` goes to the Hebrew-script list,
# column `english` to the transcription list.
df = pd.read_csv('data/dataYiddish/names/names.csv')
heb_titles.extend(df['yiddish'])
trnscrpt_titles.extend(df['english'])

In [49]:
# Load the NLI sheet (it has no header row, so column names are supplied here).
df = pd.read_excel('data/dataYiddish/nli/potsdam1.xlsx', header=None, names=['id', 'title', 'trans'])
# Strip every `$$` + lowercase-letter marker from both text columns
# (presumably catalogue subfield delimiters -- TODO confirm).
# The vectorised `.str.replace` leaves missing cells as NaN instead of raising
# TypeError as `re.sub` would, and the raw string avoids the invalid `\$`
# string escape.
df['title_'] = df['title'].str.replace(r'\$\$[a-z]', '', regex=True)
df['trans_'] = df['trans'].str.replace(r'\$\$[a-z]', '', regex=True)
heb_titles += list(df['title_'])
trnscrpt_titles += list(df['trans_'])

In [50]:
# Append the noun corpus: column `yivo` goes to the Hebrew-script list,
# column `romanized` to the transcription list.
df = pd.read_csv('data/dataYiddish/nouns/multi_orthography_parallel_corpus_of_yiddish_nouns.csv')
heb_titles.extend(df['yivo'])
trnscrpt_titles.extend(df['romanized'])

In [51]:
# Append the proverb collection: column `yiddish` goes to the Hebrew-script
# list, column `transliteration` to the transcription list.
df = pd.read_csv('data/dataYiddish/proverbs/yiddish-wit.csv')
heb_titles.extend(df['yiddish'])
trnscrpt_titles.extend(df['transliteration'])

In [52]:
# Append the refoyl word list: column `Yiddish` goes to the Hebrew-script
# list, column `Romanized` to the transcription list.
df = pd.read_csv('data/dataYiddish/refoyl/wordlist.csv')
heb_titles.extend(df['Yiddish'])
trnscrpt_titles.extend(df['Romanized'])

In [53]:
# Append the song-text vocabulary: column `yiddish` goes to the Hebrew-script
# list, column `transliteration` to the transcription list.
df = pd.read_csv('data/dataYiddish/songtexts/lider-verter.csv')
heb_titles.extend(df['yiddish'])
trnscrpt_titles.extend(df['transliteration'])

In [54]:
# Append the YBC catalogue. Titles are added first, then authors, and the
# same order is used on the transcription side, so entries added here stay
# index-aligned between the two lists.
df = pd.read_csv('data/dataYiddish/ybc/all-titles-ybc.csv')
heb_titles.extend(df['title'])
heb_titles.extend(df['author'])
trnscrpt_titles.extend(df['title-trans'])
trnscrpt_titles.extend(df['author-trans'])

In [55]:
# Read the yiddishland file, skipping its first three lines. The encoding is
# given explicitly so that decoding does not depend on the platform's locale
# default.
with open('data/dataYiddish/yiddishland/yiddishland', encoding='utf-8') as f:
    lines = f.readlines()[3:]
# Keep only lines made of exactly three whitespace-separated fields; each
# line is split once.
triplets = [fields for fields in map(str.split, lines) if len(fields) == 3]
# The third field goes to the Hebrew-script list, the first field to the
# transcription list; the middle field is not used.
heb_titles += [fields[2] for fields in triplets]
trnscrpt_titles += [fields[0] for fields in triplets]

In [56]:
# Keep only the entries that really are strings (anything else is dropped).
# Each list is filtered on its own, so the two results are not guaranteed to
# stay index-aligned with each other.

clean_heb_titles = list(filter(lambda entry: isinstance(entry, str), heb_titles))
clean_trnscrpt_titles = list(filter(lambda entry: isinstance(entry, str), trnscrpt_titles))

In [57]:
# Character frequencies for each side, most frequent first.
# Counting the characters of the concatenated titles gives the same result as
# counting them title by title.

heb_chr_freqs = Counter(''.join(clean_heb_titles)).most_common()
trnscrpt_chr_freqs = Counter(''.join(clean_trnscrpt_titles)).most_common()

In [58]:
# Helper that lays the character frequencies out as a readable table,
# including the Unicode name of each character.

def sort_chars(char_list):
    """Build a PrettyTable of frequency, Unicode name and character.

    Despite its name, this function does not reorder anything: rows are added
    in the order in which ``char_list`` yields them.

    Args:
        char_list: iterable of indexable pairs whose item 0 is a single
            character and item 1 is its frequency (for example the result of
            ``Counter.most_common()``).

    Returns:
        A ``(table, no_names)`` tuple. ``table`` has one FREQ/NAME/CHAR row
        per character that has a Unicode name; ``no_names`` is the list of
        entries, passed through unchanged, whose character has none.
    """
    table = PrettyTable(['FREQ', 'NAME', 'CHAR'], align='l', min_width=10)
    unnamed = []
    for entry in char_list:
        freq, char = entry[1], entry[0]
        # With a default, `unicodedata.name` returns it instead of raising
        # ValueError for characters that have no name (e.g. control chars).
        char_name = unicodedata.name(char, None)
        if char_name is None:
            unnamed.append(entry)
            continue
        table.add_row([freq, char_name, char])
    return table, unnamed
        

In [59]:
# Build the table for the Hebrew-script side and show the characters that
# have no Unicode name. It may be worth checking where these come from.

heb_table, heb_no_names = sort_chars(char_list=heb_chr_freqs)
heb_no_names

[('\n', 3)]

In [60]:
# Further down the table there are many non-Hebrew characters.
# Open question: are they encoding issues, part of the transcription system,
# or just noise?

print(heb_table.get_string())

+------------+---------------------------------------+------------+
| FREQ       | NAME                                  | CHAR       |
+------------+---------------------------------------+------------+
| 294141     | HEBREW LETTER AYIN                    | ע          |
| 261698     | HEBREW LETTER YOD                     | י          |
| 243039     | HEBREW LETTER ALEF                    | א          |
| 221995     | SPACE                                 |            |
| 206412     | HEBREW LETTER RESH                    | ר          |
| 146437     | HEBREW LETTER VAV                     | ו          |
| 135459     | HEBREW LETTER NUN                     | נ          |
| 133408     | HEBREW LETTER TET                     | ט          |
| 109306     | HEBREW LETTER LAMED                   | ל          |
| 107155     | HEBREW LETTER QOF                     | ק          |
| 104178     | HEBREW LETTER DALET                   | ד          |
| 100063     | HEBREW LETTER FINAL NUN          

In [61]:
# Same for the transcription side; this one looks cleaner.

trnscrpt_table, trnscrpt_no_names = sort_chars(char_list=trnscrpt_chr_freqs)
trnscrpt_no_names

[('\n', 1), ('\x7f', 1)]

In [62]:
# The table has a long tail of unexpected characters. Open question: noise?

print(trnscrpt_table.get_string())

+------------+----------------------------------------+------------+
| FREQ       | NAME                                   | CHAR       |
+------------+----------------------------------------+------------+
| 341115     | LATIN SMALL LETTER E                   | e          |
| 229626     | LATIN SMALL LETTER N                   | n          |
| 207855     | SPACE                                  |            |
| 200683     | LATIN SMALL LETTER R                   | r          |
| 183370     | LATIN SMALL LETTER I                   | i          |
| 181048     | LATIN SMALL LETTER S                   | s          |
| 159496     | LATIN SMALL LETTER A                   | a          |
| 142730     | LATIN SMALL LETTER T                   | t          |
| 123145     | LATIN SMALL LETTER H                   | h          |
| 116486     | LATIN SMALL LETTER K                   | k          |
| 113045     | LATIN SMALL LETTER O                   | o          |
| 103332     | LATIN SMALL LETTER 

In [66]:
# Whitespace-tokenise every title. Joining the titles with a space and
# splitting once yields the same tokens, in the same order, as splitting each
# title separately. Transcription tokens are lower-cased one by one.
heb_tokens = ' '.join(clean_heb_titles).split()
trns_tokens = [token.lower() for token in ' '.join(clean_trnscrpt_titles).split()]

In [64]:
# Hebrew-script token frequencies, most frequent first (None = no limit, the
# whole list is returned).
Counter(heb_tokens).most_common(None)

[('פון', 11661),
 (':', 8670),
 ('און', 8395),
 ('דער', 5546),
 ('אין', 5464),
 ('די', 4162),
 ('א', 3621),
 ('/', 3494),
 (';', 2488),
 ('...', 1743),
 ('לידער', 1440),
 ('פאר', 1204),
 ('מיט', 1161),
 ('/פון', 1121),
 ('דאס', 991),
 ('ראמאן', 990),
 ('-', 959),
 ('צו', 873),
 ('געשיכטע', 859),
 ('לעבן', 819),
 ('אויף', 778),
 ('משה', 723),
 ('דעם', 715),
 ('יעקב', 715),
 ('דערציילונגען', 676),
 ('ווערק', 670),
 ('מיין', 657),
 ('יידישע', 655),
 ('ישראל', 649),
 ('יצחק', 646),
 ('י.', 632),
 ('אברהם', 604),
 ('יידיש', 580),
 ('זיין', 579),
 ('דוד', 540),
 ('אלע', 537),
 ('שלום', 526),
 ('ספר', 513),
 ('וועגן', 506),
 ('יידישער', 484),
 ('יוסף', 475),
 ('א.', 474),
 ('בן', 465),
 ('איז', 447),
 ('חיים', 443),
 ('אידישע', 428),
 ('איבערזעצט', 427),
 ('בילדער', 416),
 ('בוך', 413),
 ('געקליבענע', 407),
 ('צום', 400),
 ('אנדערע', 399),
 (':א', 394),
 ('וועלט', 389),
 ('זכרונות', 389),
 ('שלמה', 387),
 ('קינדער', 380),
 ('יאר', 376),
 ('יידישן', 376),
 ('מ.', 366),
 ('יידן', 363),
 ('דורך'

In [67]:
# Transcription token frequencies, most frequent first (None = no limit, the
# whole list is returned).
Counter(trns_tokens).most_common(None)

[('fun', 11014),
 ('un', 8840),
 (':', 6086),
 ('der', 5966),
 ('in', 5950),
 ('di', 4900),
 ('a', 4257),
 ('/', 4232),
 ('lider', 1984),
 ('...', 1502),
 ('roman', 1398),
 (';', 1376),
 ('far', 1252),
 ('dos', 1129),
 ('tsu', 975),
 ('dertseylungen', 964),
 ('lebn', 931),
 ('yidishe', 789),
 ('oyf', 782),
 ('mayn', 738),
 ('dem', 715),
 ('zayn', 678),
 ('miṭ', 637),
 ('yidisher', 633),
 ('geshikhṭe', 586),
 ('ale', 572),
 ('ṿerḳ', 561),
 ('b.', 545),
 ('=', 514),
 ('yidn', 474),
 ('tsum', 456),
 ('bilder', 455),
 ('a.', 443),
 ('andere', 442),
 ('yidishn', 439),
 ('iz', 436),
 ('y.', 435),
 ('sholem', 433),
 ('sefer', 432),
 ('yor', 423),
 ('eseyen', 414),
 ('idishe', 406),
 ('ṿegn', 404),
 ('bukh', 399),
 ('ben', 394),
 ('zikhroynes̀', 390),
 ('yidish', 384),
 ('m.', 363),
 ('biz', 360),
 ('geḳlibene', 340),
 ('dray', 339),
 ('leben', 328),
 ('an', 313),
 ('isaac', 304),
 ('leib,', 301),
 ('iber', 294),
 ('yitsḥaḳ', 291),
 ('liṭeraṭur', 291),
 ('poemes', 280),
 ('bay', 279),
 ('miṭ'