This notebook reads registration data from a CSV file into a Dask dataframe. Each row contains an accountid and the corresponding email address, the user_type (subscriber or registered), the timestamp of the registration, the brand for which the user registered, and a free-text field for a sentence. For both Belgium and Ireland, the notebook checks every row for a subscription to the corresponding brand that started today, i.e. 'user_type' == "subscriber" and the date of 'timestamp' is today. For these rows, a sentence is translated into the corresponding language and stored in the 'sentence' field. When this is done, the dataframe is written to a CSV file.

In [None]:
# Get a Dask client and ask its cluster to scale to 20 workers, because it is a lot of data.
# NOTE(review): `ut` is not imported in any visible cell and this cell sits before the
# imports cell — presumably a project utility module; confirm it is in scope when this runs.
dask_client = ut.setup_worker_dask()
dask_client.cluster.scale(20)

In [None]:
import os
from datetime import date
from datetime import datetime

import dask.dataframe as dd
from google.cloud import translate_v2 as translate

In [None]:
# Initialize the Google Cloud Translation (v2) client.
#
# SECURITY: API secrets must never be hard-coded in a notebook or committed to
# version control. Any key pair that was ever stored in this cell has to be
# considered leaked and should be revoked/rotated.
#
# `translate.Client` expects a google-auth Credentials object (or None) for
# `credentials`; a plain dict of strings is rejected. With None the client
# falls back to the credentials inferred from the environment (Application
# Default Credentials, e.g. the GOOGLE_APPLICATION_CREDENTIALS variable
# pointing at a service-account key file kept outside the repository).
credentials = None
translate_client = translate.Client(credentials=credentials)

In [None]:
# Names of the CSV columns to load; any other column in the file is ignored.
columns_to_read = [
    'accountid',
    'email',
    'user_type',
    'timestamp',
    'brand',
    'sentence',
]

In [None]:
# Per-entity settings, keyed by a two-letter code:
#   language - target language code handed to the translation client
#   brand    - value the 'brand' column must equal for a row to belong to the entity
# NOTE(review): "niewsblad.be" looks like a misspelling of "nieuwsblad.be" —
# confirm against the brand values that actually occur in the CSV.
entities = dict(
    be=dict(language="nl", brand="niewsblad.be"),
    ie=dict(language="en", brand="independent.ie"),
)

In [None]:
# For every entity, find the rows that are a subscription for that entity's
# brand which started today, and store a translated sentence in their
# 'sentence' field. Everything is expressed as lazy, column-wise Dask
# operations: the CSV is read once and written once, instead of triggering a
# full computation (and a full rewrite of the file) for every single row.
csv_path = 'myfiles/file.csv'
# The result is first written next to the input and only then moved over it:
# the dataframe is lazy, so writing straight to `csv_path` would truncate the
# file while later partitions still have to be read from it.
tmp_path = csv_path + '.tmp'

# 'sentence' is forced to an object/string column: if the sampled rows have an
# empty sentence, dtype inference would otherwise pick float64 and storing
# text in it would fail.
df = dd.read_csv(csv_path, usecols=columns_to_read, dtype={'sentence': 'object'})

# Epoch-second bounds of "today" in the local timezone of this process, which
# is the same notion of today as datetime.fromtimestamp(...).date() == today.
today = date.today()
tomorrow = date.fromordinal(today.toordinal() + 1)
day_start = datetime(today.year, today.month, today.day).timestamp()
day_end = datetime(tomorrow.year, tomorrow.month, tomorrow.day).timestamp()

# Timestamps that cannot be parsed become NaN; NaN fails both comparisons, so
# such rows are simply never selected.
# NOTE(review): assumes 'timestamp' holds Unix epoch seconds — confirm.
timestamps = dd.to_numeric(df['timestamp'], errors='coerce')
started_today = (timestamps >= day_start) & (timestamps < day_end)
is_subscriber = df['user_type'] == 'subscriber'

for entity, settings in entities.items():
    # The translation only depends on the entity, so request it once per
    # entity rather than once per matching row.
    try:
        translation = translate_client.translate(
            'Une phrase, que je veux vraiment ajouter au CSV',
            target_language=settings["language"],
            # plain text, so the result is not HTML-escaped
            format_='text',
        )
    except Exception as error:
        # Best effort, as before: report the failure and carry on with the
        # remaining entities instead of aborting the whole run.
        print(f"translation failed for entity {entity!r}: {error}")
        continue

    # translate() returns a dict; only its 'translatedText' belongs in the CSV.
    new_subscription = (df['brand'] == settings["brand"]) & is_subscriber & started_today
    df['sentence'] = df['sentence'].mask(new_subscription, translation['translatedText'])

# Write once, without the index: the index produced by read_csv restarts in
# every partition and would only add a meaningless extra column to the file.
# NOTE(review): assumes this process and the Dask workers see the same
# filesystem, as reading and writing the same relative path already implies.
df.to_csv(tmp_path, single_file=True, index=False)
os.replace(tmp_path, csv_path)