## 1. URL Access Count:

### Given a log file containing records of URLs accessed and their corresponding timestamps, use MapReduce to count the number of times each URL was accessed within a specific time window.

#### Importing libraries

In [1]:
from functools import reduce
from collections import defaultdict
import datetime

#### Reading a file

In [2]:
# Load the whole access log into memory, one list entry per line
# (line endings are kept as-is).
with open("access_log.txt", "r") as log_file:
    text = list(log_file)

In [3]:
# Display the raw lines read from the log file.
text

['2023-08-01 10:15:23 /page1\n',
 '2023-08-01 10:20:45 /page2\n',
 '2023-08-01 10:30:12 /page1\n',
 '2023-08-01 10:32:56 /page3\n',
 '2023-08-01 10:35:09 /page2']

#### Converting lines to lists

In [4]:
# Break every raw log line into its space-separated fields.
# Splitting on a single space keeps the trailing newline attached to the
# last field; it is removed later when the fields are cleaned.
rows = [log_line.split(' ') for log_line in text]

rows

[['2023-08-01', '10:15:23', '/page1\n'],
 ['2023-08-01', '10:20:45', '/page2\n'],
 ['2023-08-01', '10:30:12', '/page1\n'],
 ['2023-08-01', '10:32:56', '/page3\n'],
 ['2023-08-01', '10:35:09', '/page2']]

#### Converting time fields from strings to `datetime.time` objects and stripping whitespace from URL fields:

In [5]:
# Build one [date_string, time_of_day, url] record per parsed log row.
data_list = [
    [
        fields[0],
        # Parse date and time together, then keep only the time-of-day part.
        datetime.datetime.strptime(
            " ".join((fields[0], fields[1])), "%Y-%m-%d %H:%M:%S"
        ).time(),
        # Remove surrounding whitespace (e.g. the trailing newline) from the URL.
        fields[2].strip(),
    ]
    for fields in rows
]

data_list

[['2023-08-01', datetime.time(10, 15, 23), '/page1'],
 ['2023-08-01', datetime.time(10, 20, 45), '/page2'],
 ['2023-08-01', datetime.time(10, 30, 12), '/page1'],
 ['2023-08-01', datetime.time(10, 32, 56), '/page3'],
 ['2023-08-01', datetime.time(10, 35, 9), '/page2']]

#### Defining start and end time for a specific time window:

In [6]:
# Time-of-day bounds of the window in which URL accesses are counted.
start = datetime.time(hour=10, minute=0, second=0)
end = datetime.time(hour=10, minute=33, second=0)

#### Defining map function:

In [7]:
def map_function(line, window_start=None, window_end=None):
    """Map one parsed log record to a ``(key, 1)`` pair if it is inside the window.

    Parameters
    ----------
    line : sequence
        Parsed record. ``line[1]`` is compared against the window bounds and
        ``line[2]`` is used as the key (the URL in this notebook).
    window_start, window_end : optional
        Inclusive bounds of the window. When left as ``None`` the
        notebook-level ``start`` / ``end`` variables are used, so
        ``map(map_function, records)`` keeps working unchanged.

    Returns
    -------
    dict_items
        A view holding the single pair ``(line[2], 1)`` when ``line[1]`` lies
        within the bounds, otherwise an empty view.
    """
    # Fall back to the notebook-wide window only for bounds not given explicitly.
    lower = start if window_start is None else window_start
    upper = end if window_end is None else window_end

    # Both ends of the window are inclusive.
    if lower <= line[1] <= upper:
        return {line[2]: 1}.items()
    return {}.items()

#### Mapping data:

In [8]:
# Materialize the mapper output as a list: a bare map() object is a one-shot
# iterator, so re-running a later cell that loops over it would silently see
# no data at all.
mapped_data = list(map(map_function, data_list))

#### Shuffling: grouping the mapped values by URL:

In [9]:
# Group every emitted count under its URL key.
intermediate = defaultdict(list)
# Flatten the per-record outputs into one stream of (url, count) pairs.
emitted_pairs = (pair for pairs in mapped_data for pair in pairs)
for url, count in emitted_pairs:
    intermediate[url].append(count)

In [10]:
# Show the grouped result: each URL key maps to the list of counts collected for it.
print(intermediate)

defaultdict(<class 'list'>, {'/page1': [1, 1], '/page2': [1], '/page3': [1]})


#### Defining reduce function:

In [11]:
def reduce_function(map, counts):
    """Collapse the counts collected for one key into a ``(key, total)`` pair.

    Parameters
    ----------
    map
        The key the counts belong to; it is returned unchanged. Note that the
        parameter name shadows the built-in ``map`` inside this function.
    counts : iterable of numbers
        The individual counts to add up.

    Returns
    -------
    tuple
        ``(map, sum(counts))``.
    """
    total = sum(counts)
    return (map, total)

#### Reducing

In [12]:
# Run the reducer once per URL, passing the key and its grouped counts
# directly. (functools.reduce over a two-element (url, counts) tuple only
# happened to make this same single call; calling the reducer explicitly
# states the intent and does not depend on the pair having exactly two items.)
url_counts = [
    reduce_function(url, counts) for url, counts in intermediate.items()
]

In [13]:
# Print each reduced result on its own line.
for reduced_pair in url_counts:
    print(reduced_pair)

('/page1', 2)
('/page2', 1)
('/page3', 1)
