In [9]:
import os
import csv

# File reader function to read files from data source
def file_reader(dir_path, file_path):
    """Read every CSV file of a directory into one list of row dictionaries.

    Args:
        dir_path: Directory whose entries are listed to discover the files.
        file_path: Path prefix joined with each listed entry name to open it.
            The calls in this notebook pass the same directory in relative form.

    Returns:
        A list with one dict per data row across all files. Each dict maps a
        header (taken from the first row of the row's own file) to the cell in
        that column; cells stay strings, exactly as csv.reader yields them.
        An empty directory, empty files and header-only files add no rows.
    """
    data_set = []
    for file in os.listdir(dir_path):
        path = os.path.join(file_path, file)
        # Sub-directories (for example Jupyter's .ipynb_checkpoints) are not
        # data files; opening them would raise, so they are skipped.
        if os.path.isdir(path):
            continue
        # The with-block closes every file, not only the last one opened.
        # newline='' is what the csv module requires so that newlines embedded
        # in quoted cells are parsed correctly.
        with open(path, encoding='utf-8', newline='') as data:
            csv_data = list(csv.reader(data))
        # A completely empty file has no header row to key the cells with.
        if not csv_data:
            continue
        # Key every cell by its column header so that rows from all files
        # share one homogeneous dictionary shape. zip stops at the shorter
        # side: short rows yield partial dicts, surplus cells are ignored.
        headers = csv_data[0]
        for data_row in csv_data[1:]:
            data_set.append(dict(zip(headers, data_row)))
    return data_set


In [10]:
# Relative locations of the two data sources (used to open the files)
file_path_clicks = 'data/clicks'
file_path_users = 'data/users'

# The same locations anchored at the current working directory (used to list the files)
dir_path_clicks = f'{os.getcwd()}/{file_path_clicks}'
dir_path_users = f'{os.getcwd()}/{file_path_users}'

# Load every CSV row of each source into a list of dictionaries
data_set_clicks = file_reader(dir_path_clicks, file_path_clicks)
data_set_users = file_reader(dir_path_users, file_path_users)

# Show both data sets, one per output line
print(data_set_clicks, data_set_users, sep='\n')

[{'date': '2017-12-20', 'screen': 'feed', 'user_id': '3', 'click_target': 'ad'}, {'date': '2017-12-19', 'screen': 'catalog', 'user_id': '3', 'click_target': 'ad'}, {'date': '2017-12-21', 'screen': 'feed', 'user_id': '3', 'click_target': 'profile'}, {'date': '2017-12-10', 'screen': 'catalog', 'user_id': '7', 'click_target': 'ad'}, {'date': '2017-12-12', 'screen': 'feed', 'user_id': '3', 'click_target': 'ad'}, {'date': '2017-12-17', 'screen': 'feed', 'user_id': '3', 'click_target': 'item'}, {'date': '2017-12-13', 'screen': 'feed', 'user_id': '7', 'click_target': 'item'}, {'date': '2017-12-10', 'screen': 'feed', 'user_id': '7', 'click_target': 'profile'}, {'date': '2017-12-20', 'screen': 'feed', 'user_id': '7', 'click_target': 'profile'}, {'date': '2017-12-17', 'screen': 'catalog', 'user_id': '7', 'click_target': 'ad'}, {'date': '2017-12-12', 'screen': 'feed', 'user_id': '7', 'click_target': 'ad'}, {'date': '2017-12-13', 'screen': 'blog', 'user_id': '7', 'click_target': 'ad'}, {'date': '2

In [11]:
# Mapper function responsible for generating key-value dictionary pairs and returning them
def mapper_task_1(data):
    """Turn each click record into a key/value pair keyed by its date.

    Args:
        data: Iterable of click records; each must support ['date'] lookup.

    Returns:
        A list of {'key': <date>, 'value': <the click record itself>} dicts,
        in the same order as the input.
    """
    return [{'key': click_row['date'], 'value': click_row} for click_row in data]

# Reducer function responsible for counting how many clicks there were on each day
def reducer_task_1(key, values):
    """Collapse all values grouped under one date into a single count record.

    Args:
        key: The date the values were grouped under.
        values: Sized collection of the values sharing that key.

    Returns:
        A one-element list: [{'date': key, 'count': <number of values>}].
    """
    clicks_on_day = len(values)
    return [{'date': key, 'count': clicks_on_day}]


In [13]:
# TESTING mapper_task_1 on the clicks data set. The reducer is tested later
# through the map_reduce implementation, once the mapped pairs are grouped by key.
test_m_1 = mapper_task_1(data_set_clicks)
# NOTE(review): this prints every mapped pair; a slice such as test_m_1[:5] would keep the output readable.
print(test_m_1)

[{'key': '2017-12-20', 'value': {'date': '2017-12-20', 'screen': 'feed', 'user_id': '3', 'click_target': 'ad'}}, {'key': '2017-12-19', 'value': {'date': '2017-12-19', 'screen': 'catalog', 'user_id': '3', 'click_target': 'ad'}}, {'key': '2017-12-21', 'value': {'date': '2017-12-21', 'screen': 'feed', 'user_id': '3', 'click_target': 'profile'}}, {'key': '2017-12-10', 'value': {'date': '2017-12-10', 'screen': 'catalog', 'user_id': '7', 'click_target': 'ad'}}, {'key': '2017-12-12', 'value': {'date': '2017-12-12', 'screen': 'feed', 'user_id': '3', 'click_target': 'ad'}}, {'key': '2017-12-17', 'value': {'date': '2017-12-17', 'screen': 'feed', 'user_id': '3', 'click_target': 'item'}}, {'key': '2017-12-13', 'value': {'date': '2017-12-13', 'screen': 'feed', 'user_id': '7', 'click_target': 'item'}}, {'key': '2017-12-10', 'value': {'date': '2017-12-10', 'screen': 'feed', 'user_id': '7', 'click_target': 'profile'}}, {'key': '2017-12-20', 'value': {'date': '2017-12-20', 'screen': 'feed', 'user_id': 

In [16]:
# CUSTOM mapReduce IMPLEMENTATION
# Because the number of datasets differs between the two tasks, map_reduce takes a list of datasets and a list of mappers (a single dataset or mapper must still be wrapped in a list)
def map_reduce(datasets, mappers, reducer):
    """Run a small in-memory map -> group-by-key -> reduce pipeline.

    Args:
        datasets: Datasets to process; the i-th one is fed to mappers[i].
        mappers: Indexable collection of callables. Each takes a dataset and
            returns an iterable of {'key': ..., 'value': ...} dicts.
        reducer: Callable taking (key, list_of_values) and returning an
            iterable of output records.

    Returns:
        A flat list with the records produced by the reducer for every key,
        with keys visited in the order the grouping dict yields them.

    Raises:
        IndexError: If a dataset has no mapper at the same position.
    """
    # Map phase: every dataset goes through the mapper at the same position.
    mapped_pairs = []
    for position, records in enumerate(datasets):
        mapped_pairs.extend(mappers[position](records))

    # Shuffle phase: collect the values that share a key.
    values_by_key = {}
    for pair in mapped_pairs:
        values_by_key.setdefault(pair['key'], []).append(pair['value'])

    # Reduce phase: flatten whatever the reducer emits for each key.
    return [
        record
        for group_key, group_values in values_by_key.items()
        for record in reducer(group_key, group_values)
    ]

In [18]:
# TESTING map_reduce together with reducer_task_1 for the FIRST task:
# the clicks data set is mapped by date, grouped, and reduced to per-date counts.
final_data_1 = map_reduce(datasets=[data_set_clicks], mappers=[mapper_task_1], reducer=reducer_task_1)
# Each element is one {'date': ..., 'count': ...} record produced by reducer_task_1.
for data in final_data_1:
    print(data)

{'date': '2017-12-20', 'count': 13}
{'date': '2017-12-19', 'count': 8}
{'date': '2017-12-21', 'count': 7}
{'date': '2017-12-10', 'count': 4}
{'date': '2017-12-12', 'count': 12}
{'date': '2017-12-17', 'count': 12}
{'date': '2017-12-13', 'count': 9}
{'date': '2017-12-14', 'count': 10}
{'date': '2017-12-16', 'count': 10}
{'date': '2017-12-11', 'count': 4}
{'date': '2017-12-15', 'count': 6}
{'date': '2017-12-18', 'count': 5}
