In [None]:
import numpy as np
import gzip
import csv
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Starting year of the study season. The data files read below are named
# after <year> and <year + 1>, so 2010 selects the 2010-2011 files.
year = 2010

## We find the peak and the period of interest from the flu time series

In [None]:
# Weekly influenza values of the study season: second column of the file, header row skipped.
# `skip_header` is used instead of the `skiprows` alias, which newer NumPy releases no longer accept.
flu_series = np.genfromtxt("../Data/Influenza/%d-%d_Italy.txt" % (year, year + 1),
                           skip_header=1, dtype=None, usecols=1)

# Series normalization (values sum to 1). The total is cast to float so that an
# integer-valued column is not floor-divided to zeros under Python 2.
sum_flu = float(sum(flu_series))
normal_flu_series = [x / sum_flu for x in flu_series]

influenza_peak = max(flu_series)  # Flu peak
influenza_index = list(flu_series).index(influenza_peak)  # position of the first occurrence of the peak

# We match the index with the actual week of the year (reporting starts at week 42):
# index 0 -> week 42, index 10 -> week 52, index 11 -> week 1 of the following year.
if influenza_index <= 10:
    week_peak = influenza_index + 42
else:
    week_peak = influenza_index - 10

# Window of interest -> peak +- 2 weeks. Week numbers wrap at the turn of the year
# (..., 51, 52, 1, 2, ...): a peak in week 1 gives 51, 52, 1, 2, 3 instead of -1, 0, 1, 2, 3.
WEEKS_IN_YEAR = 52  # the same 52-week year assumed by the index -> week mapping above
window = [(week_peak + offset - 1) % WEEKS_IN_YEAR + 1 for offset in range(-2, 3)]
window_list = [str(x) for x in window]  # as strings, to compare with the 'week' CSV column

## We find the products of interest - sentinel products


In [None]:
products = []                   # product (subcategory) names, in file order
correlations = []               # Pearson correlation of each product with the flu series, same order as `products`
normal_product_series_dic = {}  # product -> normalized weekly series

# Row layout: subcategory,value_week1, value_week2, ...
# The `with` block closes the archive even when a row fails to parse or to correlate.
with gzip.open("../Data/COOP/coop_flu_weeks_%d_%d.csv.gz" % (year, year + 1)) as f_coop_weeks:
    file_reader = csv.reader(f_coop_weeks)
    for line in file_reader:
        product = line[0]
        products.append(product)
        # Everything after the first column is the weekly series of the product.
        product_series = np.array([float(value) for value in line[1:]])
        sum_product = sum(product_series)
        normal_product_series = [x / sum_product for x in product_series]  # Series normalization
        normal_product_series_dic[product] = normal_product_series  # We keep the time series of each product
        # Pearson correlation coefficient between the flu series and the product series
        correlations.append(pearsonr(normal_flu_series, normal_product_series)[0])

In [None]:
# Report how many products were read from the weekly sales file.
print('Original number of products: %d' % len(correlations))

In [None]:
# Round every correlation to two decimals so that close values share one histogram bin.
rounded_correlation = [round(float(i), 2) for i in correlations]

# Occurrences of each rounded value, counted in one pass over the list
# (calling list.count() for every element rescans the whole list each time).
count = {}
for rounded_value in rounded_correlation:
    count[rounded_value] = count.get(rounded_value, 0) + 1

In [None]:
# Relative frequency of every rounded correlation value:
# P(c) = occurrences of c / total number of products.
keys = list(count.keys())
freq = list(count.values())  # same order as `keys`
sum_freq = sum(freq)
y = [float(occurrences) / sum_freq for occurrences in freq]

In [None]:
# Scatter plot of the empirical distribution of the product/flu correlations.
fig, ax = plt.subplots(figsize=(12, 5))
ax.scatter(keys, y)
ax.set_title('Correlation distribution %d' % year)
ax.set_xlabel('Correlation(c)')
ax.set_ylabel('P(c)')
plt.show()

### We notice that there are many products with a negative or zero correlation, so we filter them out by setting a correlation threshold of 0.2

In [None]:
# Produce the list of final subcategories: the products whose correlation with the flu series is at least 0.2

# Every product is paired with its own correlation. Looking the position up with
# correlations.index(value) returns the FIRST product having that value, so products
# sharing a correlation would repeat the first one and silently drop the others.
flu_products = [product
                for product, correlation in zip(products, correlations)
                if correlation >= 0.2]

In [None]:
# Report how many products passed the correlation threshold.
print('Number of sentinel products: %d out of %d original products'
      % (len(flu_products), len(correlations)))

## We find the customers of interest - sentinel customers and retrieve all their purchases during the period of interest


In [None]:
# Each client is of interest if he/she purchased at least one sentinel product during the period of interest

clients = []  # one entry per matching receipt row, so the same client may appear several times

# Sets give constant-time membership tests; both are checked once per receipt row.
sentinel_products = set(flu_products)
weeks_of_interest = set(window_list)

# Columns: client,year,week,receipt,subcategory,quantity
# The `with` block closes the archive even if reading fails part-way through.
with gzip.open("../Data/COOP/coop_flu_receipts_%d_%d.csv.gz" % (year, year + 1)) as f_coop_receipts:
    file_reader = csv.DictReader(f_coop_receipts, delimiter=',')
    for line in file_reader:
        product = line['subcategory']
        week = line['week']
        # sentinel product bought inside the window of interest
        if product in sentinel_products and week in weeks_of_interest:
            clients.append(line['client'])

In [None]:
# Each basket contains only the flu_products from the same receipt
# NOTE(review): rows are not filtered by week here, so a basket can come from any week
# of the season, not only from the window of interest -- confirm this is intended.

receipts = {}  # receipt id -> sentinel products on that receipt (repeated if listed on several rows)

# Sets give constant-time membership tests; `clients` holds duplicates and is checked for every row.
sentinel_clients = set(clients)
sentinel_products = set(flu_products)

# Columns: client,year,week,receipt,subcategory,quantity
with gzip.open("../Data/COOP/coop_flu_receipts_%d_%d.csv.gz" % (year, year + 1)) as f_coop_receipts:
    # The reader gets its own name so that the `products` list built while reading
    # the weekly series is not overwritten.
    receipt_rows = csv.DictReader(f_coop_receipts, delimiter=',')
    for line in receipt_rows:
        client = line['client']
        receipt = line['receipt']
        product = line['subcategory']
        if client in sentinel_clients and product in sentinel_products:
            receipts.setdefault(receipt, []).append(product)

## We find the sentinel baskets with Apriori and we construct their time series

In [None]:
# Input for the Apriori algorithm: one list of sentinel products per receipt,
# in the iteration order of the `receipts` dictionary.
baskets = list(receipts.values())

In [None]:
# Project-local implementation of the Apriori frequent-itemset algorithm.
from AprioriAlgorithm import apriori

# Support threshold handed to apriori(); presumably the minimum fraction of baskets
# in which an itemset has to occur -- TODO confirm against AprioriAlgorithm.
minsupport = 0.01

# frequent_baskets is iterated further down as a sequence of collections of itemsets.
# support_data is just a dictionary with the support values of our frequent baskets.
frequent_baskets, support_data = apriori(baskets, minsupport)

In [None]:
# Series construction for each of the frequent baskets:
# the series of a basket is the week-by-week sum of the normalized series of its products.

frequent_baskets_series = {}
for baskets_set in frequent_baskets:
    for itemset in baskets_set:
        basket = list(itemset)
        if len(basket) < 2:
            continue  # We only keep the baskets with more than one product
        sum_basket = normal_product_series_dic[basket[0]]
        for member in basket[1:]:
            member_series = normal_product_series_dic[member]
            sum_basket = [a + b for a, b in zip(sum_basket, member_series)]
        frequent_baskets_series[tuple(basket)] = sum_basket

In [None]:
# Normalize every basket series to sum to 1 and correlate it with the flu series

normal_frequent_baskets_series = {}
correlations_baskets = {}
for basket, basket_series in frequent_baskets_series.items():
    sum_series = sum(basket_series)
    normalized_series = [x / sum_series for x in basket_series]  # Series normalization
    normal_frequent_baskets_series[basket] = normalized_series
    # Pearson correlation coefficient between the flu series and the basket series
    correlations_baskets[basket] = pearsonr(normal_flu_series, normalized_series)[0]

In [None]:
# We keep the 5 baskets with the highest correlation (sorted ascending, last five taken)
ranked_baskets = sorted(correlations_baskets.items(), key=lambda item: item[1])
top_5_baskets = dict(ranked_baskets[-5:])

## Now that we have the sentinel baskets for the study year, we construct the corresponding time series for the next year

In [None]:
# Starting year of the following season; selects the <next_year>-<next_year + 1> data files below.
next_year = year + 1

In [None]:
# We obtain the influenza series of the next season (second column of the file, header row skipped)

# `skip_header` is used instead of the `skiprows` alias, which newer NumPy releases no longer accept.
next_flu_series = np.genfromtxt("../Data/Influenza/%d-%d_Italy.txt" % (next_year, next_year + 1),
                                skip_header=1, dtype=None, usecols=1)

# Series normalization (values sum to 1). The total is cast to float so that an
# integer-valued column is not floor-divided to zeros under Python 2.
next_sum_flu = float(sum(next_flu_series))
next_normal_flu_series = [x / next_sum_flu for x in next_flu_series]

In [None]:
# For the sentinel products, we retrieve their time series

next_normal_product_series_dic = {}  # sentinel product -> normalized weekly series of the next season
sentinel_products = set(flu_products)  # constant-time membership test per row

# Row layout: subcategory,value_week1, value_week2, ...
# The `with` block closes the archive even when a row fails to parse.
with gzip.open("../Data/COOP/coop_flu_weeks_%d_%d.csv.gz" % (next_year, next_year + 1)) as next_f_coop_weeks:
    file_reader = csv.reader(next_f_coop_weeks)
    for line in file_reader:
        product = line[0]
        if product not in sentinel_products:
            continue
        # Everything after the first column is the weekly series of the product.
        product_series = np.array([float(value) for value in line[1:]])
        sum_product = sum(product_series)
        if sum_product == 0.0:
            # Series that sums to zero: keep it flat at zero instead of computing 0/0 = NaN.
            # A NaN would propagate into every basket containing this product and would not
            # be caught by the `== 0.0` check applied when the basket series are normalized.
            normal_product_series = [0.0 for x in product_series]
        else:
            normal_product_series = [x / sum_product for x in product_series]  # Series normalization
        next_normal_product_series_dic[product] = normal_product_series

In [None]:
# We construct the time series for the sentinel baskets:
# week-by-week sum of the next-season series of the products in each basket.

next_frequent_baskets_series = {}
for basket in top_5_baskets.keys():
    member_names = [str(member) for member in basket]
    sum_basket = next_normal_product_series_dic[member_names[0]]
    for member_name in member_names[1:]:
        member_series = next_normal_product_series_dic[member_name]
        sum_basket = [a + b for a, b in zip(sum_basket, member_series)]
    next_frequent_baskets_series[tuple(basket)] = sum_basket

In [None]:
# Sentinel basket series normalization (a series that sums to zero stays all zeros)
next_normal_frequent_baskets_series = {}

for basket, basket_series in next_frequent_baskets_series.items():
    sum_series = sum(basket_series)
    if sum_series != 0.0:
        normalized_series = [x / sum_series for x in basket_series]
    else:
        normalized_series = [0.0 for x in basket_series]
    next_normal_frequent_baskets_series[basket] = normalized_series

## Our procedure is over, so we save our results as an input for the forecast models

In [None]:
# File containing each basket and its correlation, one "<basket> : <correlation>" line per basket

# Highest correlation first; equal correlations are ordered by the basket tuple.
# The key function indexes the (basket, correlation) pair instead of unpacking it in the
# parameter list, a syntax that only Python 2 accepts.
ranked_top_baskets = sorted(top_5_baskets.items(),
                            key=lambda item: (item[1], item[0]),
                            reverse=True)

# The `with` block flushes and closes the file even if a write fails.
with open("../Data/Sentinels/sentinel_baskets_correlation_%d-%d.csv" % (next_year, next_year + 1), 'w') as f_baskets_correlation:
    for key, value in ranked_top_baskets:
        f_baskets_correlation.write("%s : %s\n" % (key, value))

In [None]:
# We obtain the lists with the weeks for this year (i.e. 2010-42, 2010-43, ...) and the next year (i.e. 2011-42, 2011-43, ...)
# They are the first column of the influenza files, header row skipped.
# `skip_header` is used instead of the `skiprows` alias, which newer NumPy releases no longer accept.

weeks = np.genfromtxt("../Data/Influenza/%d-%d_Italy.txt" % (year, year + 1),
                      skip_header=1, dtype=None, usecols=0)

next_weeks = np.genfromtxt("../Data/Influenza/%d-%d_Italy.txt" % (next_year, next_year + 1),
                           skip_header=1, dtype=None, usecols=0)


In [None]:
# Files for each of the top_5 sentinel baskets containing their normalized time series (for each week a value)
# from the past and also for the next year
# week1, value_week1
# week2, value_week2
# ...


def _week_label(single_week):
    """Return the "<year>-<week>" label with the leading zero of the week removed.

    "2011-05" becomes "2011-5"; a label whose week part does not start with
    '0' (e.g. "2010-42") is returned unchanged.
    """
    parts = single_week.split("-")
    week_year, week_number = parts[0], parts[1]
    if week_number[0] == '0':
        return week_year + "-" + week_number.lstrip('0')
    return single_week


for basket in top_5_baskets:
    # Only the "_%d-%d.csv" piece is %-formatted; the basket tuple is inserted verbatim.
    time_series_path = ("../Data/Sentinels/time_series_of_sentinel_basket_" + str(basket)
                        + "_%d-%d.csv" % (next_year, next_year + 1))

    # One file per basket. The `with` block closes every file as soon as it is written;
    # a single close() after the loop would only close the last one.
    with open(time_series_path, 'w') as f_time_series:
        f_time_series.write("Week TimeSeries\n")

        # Study season first, then the next season.
        seasons = ((weeks, normal_frequent_baskets_series[basket]),
                   (next_weeks, next_normal_frequent_baskets_series[basket]))
        for season_weeks, season_series in seasons:
            for i in range(len(season_weeks)):
                f_time_series.write(_week_label(season_weeks[i]) + " " + str(season_series[i]) + "\n")