# Imports

In [1]:
import pandas as pd

from googletrans import Translator

# Load the Data

In [2]:
languages = ['English', 'French', 'Greek']

# Load one token-count table per language from
# <Language>/<language>_token_counts.csv into a dict keyed by language name.
#
# NOTE(review): the save cells further down write back to these same paths
# (with index=False), so running the notebook a second time would read the
# already-thinned file and thin it again. Keep an untouched copy of the
# original CSVs if the notebook needs to be re-run.
token_counts_dfs = {}
for language in languages:

    language_df = pd.read_csv(language + '/' + language.lower() + '_token_counts.csv')

    # only keep every 10th row (index 0, 10, 20, ...) for processing purposes and graphing.
    # .copy() makes the thinned table an independent frame, so the columns added
    # in later cells are not written into a filtered slice of language_df
    # (which is what triggers pandas' SettingWithCopyWarning).
    token_counts_dfs[language] = language_df[language_df.index % 10 == 0].copy()

# Translate the Lexemes

## English

### (Add an `english_translation` column to the English dataframe too, copying each lexeme unchanged, so it has the same columns as the French and Greek dataframes)

In [3]:
# English lexemes are their own translation: copy the lexeme column into a new
# 'english_translation' column.
# assign() builds a new frame instead of writing a column into a filtered frame
# in place, which avoids pandas' SettingWithCopyWarning and keeps this cell
# safe to re-run.
token_counts_dfs['English'] = token_counts_dfs['English'].assign(
    english_translation=token_counts_dfs['English']['most_recent_unique_lexeme']
)

# show the first rows as the cell output
token_counts_dfs['English'].head()

Unnamed: 0,lexemes_read,unique_lexemes_read,most_recent_unique_lexeme,log_lexemes_read,log_unique_lexemes_read,english_translation
0,1,1,chapter,0.0,0.0,chapter
10,11,10,man,2.397895,2.302585,man
20,21,17,want,3.044522,2.833213,want
30,31,25,view,3.433987,3.218876,view
40,41,30,enter,3.713572,3.401197,enter


In [4]:
# Write the English table to disk, leaving out the DataFrame index.
# NOTE(review): this is the same path the load cell reads from, so the
# file on disk is replaced by the table currently held in memory.
token_counts_dfs['English'].to_csv(
    'English/english_token_counts.csv',
    index=False,
)

## French

In [5]:
# Distinct French lexemes to translate (duplicates removed via a set, so the
# resulting list order is arbitrary).
french_lexemes = list(
    set(token_counts_dfs['French']['most_recent_unique_lexeme'])
)

# French lexeme -> English translation; filled in by the translation cell.
# Re-running this cell empties it again.
fr_en_trans = dict()

In [14]:
translator = Translator()

# Translate every distinct French lexeme into English, caching results in
# fr_en_trans. The cell is resumable: if the translator crashes, simply run the
# cell again. Lexemes already in the cache are skipped by membership rather
# than by list position, so resuming does not depend on the cache size lining
# up with the order of french_lexemes.
for fr_lex in french_lexemes:

    if fr_lex in fr_en_trans:
        continue

    #number of lexemes translated before this one (drives the progress counter)
    translated_so_far = len(fr_en_trans)

    fr_en_trans[fr_lex] = translator.translate(text=fr_lex, src='fr', dest='en').text

    #counter
    if translated_so_far % 100 == 0:
        print(translated_so_far, 'lexemes translated')

# Attach the translation of each row's lexeme. assign() returns a new frame
# instead of writing a column into a filtered frame in place, which avoids
# pandas' SettingWithCopyWarning. A lexeme missing from the cache still raises
# KeyError here rather than silently producing a missing value.
token_counts_dfs['French'] = token_counts_dfs['French'].assign(
    english_translation=[
        fr_en_trans[fr_lex]
        for fr_lex in token_counts_dfs['French']['most_recent_unique_lexeme']
    ]
)

#save to .csv
token_counts_dfs['French'].to_csv('French/french_token_counts.csv', index=False)

# show the last rows as the cell output
token_counts_dfs['French'].tail()

Unnamed: 0,lexemes_read,unique_lexemes_read,most_recent_unique_lexeme,log_lexemes_read,log_unique_lexemes_read,english_translation
116760,116761,9048,envoie,11.667884,9.110299,send
116770,116771,9049,succédé,11.66797,9.11041,succolence
116780,116781,9049,succédé,11.668056,9.11041,succolence
116790,116791,9050,brèche,11.668141,9.11052,breach
116800,116801,9051,protège,11.668227,9.110631,protected


## Greek

In [20]:
# Distinct Greek lexemes to translate (duplicates removed via a set, so the
# resulting list order is arbitrary).
greek_lexemes = list(
    set(token_counts_dfs['Greek']['most_recent_unique_lexeme'])
)

# Greek lexeme -> English translation; filled in by the translation cell.
# Re-running this cell empties it again.
el_en_trans = dict()

In [53]:
translator = Translator()

# Translate every distinct Greek lexeme into English, caching results in
# el_en_trans. The cell is resumable: if the translator crashes, simply run the
# cell again. Lexemes already in the cache are skipped by membership rather
# than by list position, so resuming stays correct even when several lexemes
# are whitespace-only.
for el_lex in greek_lexemes:

    if el_lex in el_en_trans:
        continue

    #number of lexemes translated before this one (drives the progress counter)
    translated_so_far = len(el_en_trans)

    #to fix a really odd bug (original author's note): whitespace-only lexemes
    #are never sent to the translator and simply get an empty translation
    if el_lex.strip() == '':
        en_trans = ''
    else:
        en_trans = translator.translate(text=el_lex, src='el', dest='en').text

    # Cache under the lexeme exactly as it appears in the data, so the lookup
    # below uses the same key that was stored. (Storing the raw lexeme but
    # looking it up stripped raises KeyError for any lexeme with surrounding
    # whitespace.)
    el_en_trans[el_lex] = en_trans

    #counter
    if translated_so_far % 100 == 0:
        print(translated_so_far, 'lexemes translated')

# Attach the translation of each row's lexeme. assign() returns a new frame
# instead of writing a column into a filtered frame in place, which avoids
# pandas' SettingWithCopyWarning. A lexeme missing from the cache still raises
# KeyError here rather than silently producing a missing value.
token_counts_dfs['Greek'] = token_counts_dfs['Greek'].assign(
    english_translation=[
        el_en_trans[el_lex]
        for el_lex in token_counts_dfs['Greek']['most_recent_unique_lexeme']
    ]
)

#save to .csv
token_counts_dfs['Greek'].to_csv('Greek/greek_token_counts.csv', index=False)

# show the last rows as the cell output
token_counts_dfs['Greek'].tail()

Unnamed: 0,lexemes_read,unique_lexemes_read,most_recent_unique_lexeme,log_lexemes_read,log_unique_lexemes_read,english_translation
113810,113811,12558,τούσκαψαν,11.642294,9.438113,stuffed
113820,113821,12558,τούσκαψαν,11.642382,9.438113,stuffed
113830,113831,12559,τέλειωσαν,11.64247,9.438193,They ended
113840,113841,12560,ταχτικάς,11.642558,9.438272,tactics
113850,113851,12561,διόσπαρτος,11.642646,9.438352,dilateral


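I'm never going to use the googletrans package ever again: the translation cells had to be made resumable in case the translator crashed, and some of the translations it returned are nonsense (e.g. 'succédé' → 'succolence', 'διόσπαρτος' → 'dilateral').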