In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [None]:
# Period to process: these two values select the input CSV that is read and
# name the output CSV that is written.
year = 2021
month = 12  # rendered as two digits (``{month:02d}``) when file names are built

In [None]:
# Load the news CSV for the selected year/month from the data/ folder.
df = pd.read_csv(f'data/news-{year}-{month:02d}.csv')

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts) before translating.
df.info()

# Translate

In [None]:
from deep_translator import GoogleTranslator

In [None]:
def _split_into_chunks(text, max_char):
    """Split ``text`` into stripped pieces of at most ``max_char`` characters.

    Each piece ends at the last '.' inside the current window when there is
    one; otherwise at the last space; otherwise the window is cut at
    ``max_char``. The remainder after the final cut is always kept, so no
    part of the input is dropped. Pieces that are empty after stripping are
    skipped.
    """
    if max_char < 1:
        raise ValueError('max_char must be a positive integer')

    chunks = []
    start = 0
    while start < len(text):
        if len(text) - start <= max_char:
            # Whatever is left fits in a single request: keep it whole, even
            # if it does not end with a period.
            end = len(text)
        else:
            window = text[start:start + max_char]
            cut = window.rfind('.') + 1          # prefer a sentence boundary
            if cut == 0:
                cut = window.rfind(' ') + 1      # fall back to a word boundary
            if cut == 0:
                cut = max_char                   # last resort: hard cut
            end = start + cut

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end  # cut >= 1, so the loop always advances
    return chunks


def translate_text(text, max_char=4999):
    """Translate Estonian ``text`` to English with ``GoogleTranslator``.

    Parameters
    ----------
    text : str or None or float('nan')
        Text to translate. ``None`` and NaN (as pandas produces for empty CSV
        cells) are treated like an empty string.
    max_char : int, default 4999
        Largest piece sent in one request; there is a max 5000 char limit
        that can be translated at once. Longer texts are split into pieces,
        translated as a batch and joined with single spaces.

    Returns
    -------
    str
        The translation, ``''`` for empty/missing input, or the marker
        ``'Something went wrong'`` if anything raised an ``Exception``.
        ``KeyboardInterrupt``/``SystemExit`` are deliberately not caught so a
        long-running loop can still be interrupted.
    """
    failure_marker = 'Something went wrong'
    try:
        # Missing value, not a failed translation (NaN is the only float
        # that is not equal to itself).
        if text is None or (isinstance(text, float) and text != text):
            return ''
        if len(text) == 0:
            return ''

        translator = GoogleTranslator(source='et', target='en')
        if len(text) <= max_char:
            return translator.translate(text)

        chunks = _split_into_chunks(text, max_char)
        if not chunks:
            # Over-long input that is nothing but whitespace.
            return ''
        return ' '.join(translator.translate_batch(chunks))
    except Exception:
        # Best effort: callers detect failures by comparing with the marker.
        return failure_marker

In [None]:
# Pull the three Estonian source columns out of the frame as plain lists.
text_list_est = list(df['articleText'])
lead_list_est = list(df['lead'])
head_list_est = list(df['heading'])

# English results, filled row by row in the same order as the source lists.
text_list_eng, lead_list_eng, head_list_eng = [], [], []

total_rows = len(head_list_est)
rows = zip(head_list_est, lead_list_est, text_list_est)

for done, (heading_est, lead_est, body_est) in enumerate(rows, start=1):
    heading_eng = translate_text(heading_est)
    lead_eng = translate_text(lead_est)
    body_eng = translate_text(body_est)

    head_list_eng.append(heading_eng)
    lead_list_eng.append(lead_eng)
    text_list_eng.append(body_eng)

    # Progress line: time, whether the heading translation failed, rows done, percent.
    timestamp = datetime.datetime.now().strftime('%H:%M:%S')
    percent = round(done / total_rows * 100, 4)
    heading_failed = heading_eng == 'Something went wrong'

    print(f'{timestamp} {heading_failed} {done}/{total_rows} {percent}%')

# Attach the translations to the frame as new columns.
df['headingEng'] = head_list_eng
df['leadEng'] = lead_list_eng
df['articleTextEng'] = text_list_eng

# Lemmatize

In [None]:
import spacy

In [None]:
# Load spaCy's 'en_core_web_lg' pipeline once; lemmatize() runs every text
# through this shared `nlp` object.
nlp = spacy.load('en_core_web_lg')

In [None]:
def lemmatize(text):
    """Return the lemmas of the nouns, adjectives, verbs and adverbs in ``text``.

    The text is lower-cased, run through the module-level ``nlp`` pipeline,
    and the ``lemma_`` of every token whose ``pos_`` is NOUN, ADJ, VERB or ADV
    is joined with single spaces.

    Parameters
    ----------
    text : str
        Text to lemmatize. Non-string values (e.g. ``None`` or NaN) give ``''``.

    Returns
    -------
    str
        Space-separated lemmas, or ``''`` when the input is not a string or
        the pipeline raised an ``Exception``. ``KeyboardInterrupt`` and
        ``SystemExit`` are not caught, so a long ``apply`` can be interrupted.
    """
    if not isinstance(text, str):
        return ''

    content_pos = {'NOUN', 'ADJ', 'VERB', 'ADV'}
    try:
        doc = nlp(text.lower())
        return ' '.join(token.lemma_ for token in doc if token.pos_ in content_pos)
    except Exception:
        # Best effort: a text the pipeline cannot process yields no lemmas.
        return ''

In [None]:
# Lemmatize each translated column into its own '...Lemma' column.
lemma_columns = {
    'headingEng': 'headingEngLemma',
    'leadEng': 'leadEngLemma',
    'articleTextEng': 'articleTextEngLemma',
}
for source_col, lemma_col in lemma_columns.items():
    df[lemma_col] = df[source_col].apply(lemmatize)

In [None]:
# Columns written to the output file, in output order.
cols = [
    'id',
    'publishDate',
    'updatedDate',
    'url',
    'headingEng',
    'leadEng',
    'articleTextEng',
    'headingEngLemma',
    'leadEngLemma',
    'articleTextEngLemma',
]

In [None]:
# Write the selected columns for this year/month to CSV, without the index.
output_path = f'data/news-eng-lemmatized-{year}-{month:02d}.csv'
df.loc[:, cols].to_csv(output_path, index=False)

In [None]:
# Rows whose translation failed: translate_text() returns the marker
# 'Something went wrong' when an error occurs.
for translated_col in ('headingEng', 'leadEng', 'articleTextEng'):
    print(int((df[translated_col] == 'Something went wrong').sum()))

# The same check on the lemmatized columns ('go wrong' is presumably what the
# failure marker is reduced to by lemmatize() - confirm).
for lemma_col in ('headingEngLemma', 'leadEngLemma', 'articleTextEngLemma'):
    print(int((df[lemma_col] == 'go wrong').sum()))

# Output recorded from an earlier run (which three counts these are is not stated):
# 0
# 248
# 311