# Imports

In [None]:
import numpy as np
import pandas as pd
import spacy
from collections import Counter
import matplotlib.pyplot as plt
from googletrans import Translator

#load one spaCy pipeline per language; each is used further down to tokenize
#the text of that language (English, French, Greek, Chinese)
#NOTE(review): spacy.load presumably needs these four models to be installed
#already - confirm the environment before running
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")
nlp_el = spacy.load("el_core_news_sm")
nlp_zh = spacy.load("zh_core_web_sm")

#single translator instance shared by the French, Greek and Chinese translation loops
translator = Translator()

# English

In [None]:
#build the word rank/frequency table for the English text

#read the text
#the encoding is given explicitly so the read does not depend on the platform default
with open('English/Pride and Prejudice.txt', 'r', encoding='utf-8') as file:###
    lines = file.read().splitlines()

#combine the lines into single lowercase string
text = ' '.join(lines).lower()

#parse the text
doc = nlp_en(text)###

#tokenize the text into strings excluding punctuation, numbers and whitespace
tokens = [
    str(token)
    for token in doc
    if not (token.is_punct or token.like_num or token.is_space)
]

#count the frequency of the tokens
freq = Counter(tokens)

#number of unique tokens
num_unique = len(freq)
#total number of counted tokens; this is the denominator of the relative frequency
num_tokens = len(tokens)

#order the tokens by most frequent to least
freq = freq.most_common()#freq is now a list of (word, count) pairs

#create a dataframe for word, frequency and rank (rank 1 is the most frequent word)
df = pd.DataFrame(freq, columns=['word', 'freq'])
df['rank'] = np.array(df.index) + 1

#compute relative frequency as the share of all counted tokens
#(dividing by num_unique instead would not give a proportion)
df['rel freq'] = df['freq'] / num_tokens

#compute the log rank base 10
df['log rank'] = np.log10(df['rank'])
#compute the log freq base 10
df['log freq'] = np.log10(df['freq'])
#compute the log rel freq base 10
df['log rel freq'] = np.log10(df['rel freq'])

#take just the top 5,000
#copy so that columns added later are written to an independent dataframe
df = df[:5000].copy()

In [None]:
#add english translation
#the words are already English, so the 'trans' column is a copy of 'word';
#the French, Greek and Chinese tables get a 'trans' column as well
df['trans'] = df['word']

In [None]:
#keep the English table under its own name; df is rebound by the later language sections
df_en = df

In [None]:
#preview the first rows of the English table
df_en.head()

In [None]:
#save the English table; index=False leaves out the dataframe index (rank has its own column)
df_en.to_csv('English/Pride_and_Prejudice_Frequency.csv', index=False)

In [None]:
#frequency against rank on linear axes
plt.plot(df_en['rank'], df_en['freq'])

In [None]:
#log10 frequency against log10 rank
plt.plot(df_en['log rank'], df_en['log freq'])

# French

In [None]:
#build the word rank/frequency table for the French text

#read the text
#the encoding is given explicitly so accented characters are read the same on every platform
with open('French/Madame Bovary.txt', 'r', encoding='utf-8') as file:###
    lines = file.read().splitlines()

#combine the lines into single lowercase string
text = ' '.join(lines).lower()

#parse the text
doc = nlp_fr(text)###

#tokenize the text into strings excluding punctuation, numbers and whitespace
tokens = [
    str(token)
    for token in doc
    if not (token.is_punct or token.like_num or token.is_space)
]

#count the frequency of the tokens
freq = Counter(tokens)

#number of unique tokens
num_unique = len(freq)
#total number of counted tokens; this is the denominator of the relative frequency
num_tokens = len(tokens)

#order the tokens by most frequent to least
freq = freq.most_common()#freq is now a list of (word, count) pairs

#create a dataframe for word, frequency and rank (rank 1 is the most frequent word)
df = pd.DataFrame(freq, columns=['word', 'freq'])
df['rank'] = np.array(df.index) + 1

#compute relative frequency as the share of all counted tokens
#(dividing by num_unique instead would not give a proportion)
df['rel freq'] = df['freq'] / num_tokens

#compute the log rank base 10
df['log rank'] = np.log10(df['rank'])
#compute the log freq base 10
df['log freq'] = np.log10(df['freq'])
#compute the log rel freq base 10
df['log rel freq'] = np.log10(df['rel freq'])

#take just the top 5,000
#copy so that the 'trans' column added later is written to an independent dataframe
df = df[:5000].copy()

In [None]:
#keep the French table under its own name; df is rebound by the later language sections
df_fr = df

In [None]:
#preview the first rows of the French table
df_fr.head()

In [None]:
#frequency against rank on linear axes
plt.plot(df_fr['rank'], df_fr['freq'])

In [None]:
#log10 frequency against log10 rank
plt.plot(df_fr['log rank'], df_fr['log freq'])

In [None]:
#start with no translations; this lives in its own cell so the loop cell below
#can be re-run after a crash without throwing away what was already collected
translations = []

In [None]:
#translate every French word to English, one request per word
#k is the checkpoint: words translated before a crash are skipped when this cell is re-run
#i is the running index used for the progress printout
k = i = len(translations)

remaining_words = df_fr['word'].iloc[k:]####
for word in remaining_words:
    response = translator.translate(text=word, src='fr', dest='en')####
    translation = response.text

    translations.append(translation)

    #print the index of every 100th word as a progress marker
    if not i % 100:
        print(i)

    i += 1

#store the translations as a column once every word has been translated
df_fr['trans'] = translations####

In [None]:
#preview again, now with the 'trans' column
df_fr.head()

In [None]:
#save the French table; index=False leaves out the dataframe index (rank has its own column)
df_fr.to_csv('French/Madame_Bovary_Frequency.csv', index=False)

# Greek

In [None]:
#build the word rank/frequency table for the Greek text

#read the text
#the encoding is given explicitly so the Greek characters are read the same on every platform
with open('Greek/The Illiad (Ιλιάδα).txt', 'r', encoding='utf-8') as file:###
    lines = file.read().splitlines()

#combine the lines into single lowercase string
text = ' '.join(lines).lower()

#parse the text
doc = nlp_el(text)###

#tokenize the text into strings excluding punctuation, numbers and whitespace
tokens = [
    str(token)
    for token in doc
    if not (token.is_punct or token.like_num or token.is_space)
]

#count the frequency of the tokens
freq = Counter(tokens)

#number of unique tokens
num_unique = len(freq)
#total number of counted tokens; this is the denominator of the relative frequency
num_tokens = len(tokens)

#order the tokens by most frequent to least
freq = freq.most_common()#freq is now a list of (word, count) pairs

#create a dataframe for word, frequency and rank (rank 1 is the most frequent word)
df = pd.DataFrame(freq, columns=['word', 'freq'])
df['rank'] = np.array(df.index) + 1

#compute relative frequency as the share of all counted tokens
#(dividing by num_unique instead would not give a proportion)
df['rel freq'] = df['freq'] / num_tokens

#compute the log rank base 10
df['log rank'] = np.log10(df['rank'])
#compute the log freq base 10
df['log freq'] = np.log10(df['freq'])
#compute the log rel freq base 10
df['log rel freq'] = np.log10(df['rel freq'])

#take just the top 5,000
#copy so that the 'trans' column added later is written to an independent dataframe
df = df[:5000].copy()

In [None]:
#keep the Greek table under its own name; df is rebound by the later language section
df_el = df####

In [None]:
#preview the first rows of the Greek table
df_el.head()####

In [None]:
#frequency against rank on linear axes
plt.plot(df_el['rank'], df_el['freq'])###

In [None]:
#log10 frequency against log10 rank
plt.plot(df_el['log rank'], df_el['log freq'])###

In [None]:
#start with no translations; this lives in its own cell so the loop cell below
#can be re-run after a crash without throwing away what was already collected
translations = []

In [None]:
#translate every Greek word to English, one request per word
#k is the checkpoint: words translated before a crash are skipped when this cell is re-run
#i is the running index used for the progress printout
k = i = len(translations)

remaining_words = df_el['word'].iloc[k:]####
for word in remaining_words:
    response = translator.translate(text=word, src='el', dest='en')####
    translation = response.text

    translations.append(translation)

    #print the index of every 100th word as a progress marker
    if not i % 100:
        print(i)

    i += 1

#store the translations as a column once every word has been translated
df_el['trans'] = translations####

In [None]:
#preview again, now with the 'trans' column
df_el.head()####

In [None]:
#save the Greek table; index=False leaves out the dataframe index (rank has its own column)
df_el.to_csv('Greek/The_Illiad_Frequency.csv', index=False)###

# Chinese

In [None]:
#build the word rank/frequency table for the Chinese text

#read the text
#the encoding is given explicitly so the Chinese characters are read the same on every platform
with open('Chinese/Journey to the West (西遊記) First Half.txt', 'r', encoding='utf-8') as file:###
    lines = file.read().splitlines()

#combine the lines into single lowercase string
text = ' '.join(lines).lower()

#parse the text
doc = nlp_zh(text)###

#tokenize the text into strings excluding punctuation, numbers and whitespace
tokens = [
    str(token)
    for token in doc
    if not (token.is_punct or token.like_num or token.is_space)
]

#count the frequency of the tokens
freq = Counter(tokens)

#number of unique tokens
num_unique = len(freq)
#total number of counted tokens; this is the denominator of the relative frequency
num_tokens = len(tokens)

#order the tokens by most frequent to least
freq = freq.most_common()#freq is now a list of (word, count) pairs

#create a dataframe for word, frequency and rank (rank 1 is the most frequent word)
df = pd.DataFrame(freq, columns=['word', 'freq'])
df['rank'] = np.array(df.index) + 1

#compute relative frequency as the share of all counted tokens
#(dividing by num_unique instead would not give a proportion)
df['rel freq'] = df['freq'] / num_tokens

#compute the log rank base 10
df['log rank'] = np.log10(df['rank'])
#compute the log freq base 10
df['log freq'] = np.log10(df['freq'])
#compute the log rel freq base 10
df['log rel freq'] = np.log10(df['rel freq'])

#take just the top 5,000
#copy so that the 'trans' column added later is written to an independent dataframe
df = df[:5000].copy()

In [None]:
#keep the Chinese table under its own name, separate from the working name df
df_zh = df####

In [None]:
#preview the first rows of the Chinese table
df_zh.head()####

In [None]:
#frequency against rank on linear axes
plt.plot(df_zh['rank'], df_zh['freq'])###

In [None]:
#log10 frequency against log10 rank
plt.plot(df_zh['log rank'], df_zh['log freq'])###

In [None]:
#start with no translations; this lives in its own cell so the loop cell below
#can be re-run after a crash without throwing away what was already collected
translations = []

In [None]:
#translate every Chinese word to English, one request per word
#k is the checkpoint: words translated before a crash are skipped when this cell is re-run
#i is the running index used for the progress printout
k = i = len(translations)

remaining_words = df_zh['word'].iloc[k:]####
for word in remaining_words:
    #NOTE(review): unlike the French and Greek loops, no src language is passed here,
    #so the source language is presumably auto-detected - confirm this is intended
    response = translator.translate(text=word, dest='en')####
    translation = response.text

    translations.append(translation)

    #print the index of every 100th word as a progress marker
    if not i % 100:
        print(i)

    i += 1

In [None]:
#store the translations as a column; run this only after the loop above has
#finished, so that there is one translation for every row
df_zh['trans'] = translations####

In [None]:
#preview the first 25 rows, now with the 'trans' column
df_zh.head(25)

In [None]:
#save the Chinese table; index=False leaves out the dataframe index (rank has its own column)
df_zh.to_csv('Chinese/Journey_to_the_West_Frequency.csv', index=False)###