In [386]:
import pandas as pd
import numpy as np
import warnings
from statistics import mean
warnings.filterwarnings("ignore")

<h3>Pre Processing

In [387]:
# Reading the Train Data 
df=pd.read_csv('./Groceries_data_train.csv')

In [388]:
# Extracting the Items Column to better understand the Data
df_item=df['itemDescription']

In [389]:
# Extracting sorted Member Number for calculation 
df_number=sorted(df.Member_number.unique().tolist())

In [390]:
# Dictionary for member number for storing their index value
num_dict={number:index for index,number in enumerate(df_number)}

In [391]:
# Grouping all the purchase of item with Member Number
df_group=df[['Member_number','itemDescription']]

In [392]:
group=df_group.groupby('Member_number').agg(lambda x: x.tolist())

Creating Dictionary to store the Item and the number of times sold to sort the resulting recommendation with the number of times sold order

In [393]:
unique_elements, counts_elements = np.unique(df_item, return_counts=True)

In [394]:
my_pd=pd.DataFrame(data=[unique_elements,counts_elements]).T.rename(columns={0: 'item', 1: 'quantity'})

In [395]:
dictionary = my_pd.groupby('item').apply(lambda dfg: dfg['quantity'].iloc[0]).to_dict()

To create the utility matrix with the purchase history of every customer to items bought 

In [396]:
df_utility=pd.DataFrame()

In [397]:
for items in df_item.unique():
    df_utility[items]=[0]*len(df_number)

In [398]:
array_2d=df_utility.values

In [399]:
num_array=np.arange(len(df_number))
item_dict=dict(zip(df_item.unique(),num_array))

In [400]:
row=0
for i in df_number:
    for j in group.itemDescription[i] :
         array_2d[row][item_dict.get(j)]+=1 
    row+=1 

In [401]:
purchase_matrix=array_2d

<h3>Defining Functions for calculations

In [402]:
# to calculate the Jaccard similarity of two given vectors
def jaccard_similarity(vector1, vector2):
    intersection = np.logical_and(vector1, vector2)
    union = np.logical_or(vector1, vector2)
    similarity = np.sum(intersection) / np.sum(union)
    return similarity

In [403]:
# to calculate the Cosine similarity of two given vectors
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    similarity = dot_product / norm_product
    return similarity

In [404]:
# calculating the Similarity matrix using the utility matrix

def calculate_similarity(purchase_matrix, similarity_metric):
    # Calculate the similarity matrix based on the purchase matrix
    num_customers = purchase_matrix.shape[0]
    similarity_matrix = np.zeros((num_customers, num_customers))

    for i in range(num_customers):
        for j in range(num_customers):
            # Get the purchase vectors of customers i and j
            purchase_vector_i = purchase_matrix[i]
            purchase_vector_j = purchase_matrix[j]

            # Calculate similarity based on the chosen metric
            if similarity_metric == 'jaccard':
                similarity_score = jaccard_similarity(purchase_vector_i, purchase_vector_j)
            elif similarity_metric == 'cosine':
                similarity_score = cosine_similarity(purchase_vector_i, purchase_vector_j)
            else:
                raise ValueError('Invalid similarity metric specified.')

            # Store the similarity score in the similarity matrix
            similarity_matrix[i, j] = similarity_score

    return similarity_matrix

<h3>Calculating Similartiy and Finding the Neighbors

In [405]:
# this function determines the neighors for every member using the similarity matrix and the number of neighbors as parameters
def select_neighbors(similarity_matrix, k):
    # Select the top-k neighbors for each customer
    num_customers = similarity_matrix.shape[0]
    neighbors = []

    for i in range(num_customers):
        # Sort the similarity scores in descending order and get the indices of the top-k neighbors
        top_k_indices = np.argsort(similarity_matrix[i])[::-1][:k]
        neighbors.append(top_k_indices)

    return neighbors

In [406]:
similarity_matrix = calculate_similarity(purchase_matrix, similarity_metric='jaccard')

In [446]:

# Define the size of the neighborhood (e.g., top-k similar customers)
k = 5
# Select the neighborhood for each target customer
neighborhood = []
for i in range(len(similarity_matrix)):
    # Get the similarities for the current customer
    similarities = similarity_matrix[i]

    # Sort the similarities in descending order and get the indices
    sorted_indices = sorted(range(len(similarities)), key=lambda x: similarities[x], reverse=True)

    # Select the top-k similar customers (excluding the target customer itself)
    neighbors = sorted_indices[1:k+1]  # Exclude the target customer (index 0)

    # Add the neighbors to the neighborhood list
    neighborhood.append(neighbors)


In [447]:
# Creating recommendation,a list of lists to store the recommendation of every Member
recommendation=[]
for i,list in enumerate(neighborhood):
    grouped_items=[]
    for items in list:
       a=group.iloc[items] 
       for i in a:
           for b in i:
             if b not in grouped_items:
               grouped_items.append(b)
    recommendation.append(grouped_items)



In [448]:
# Creating Frequency to get all the number of items sold in sequence to the recommendation
frequency=[]
for i in recommendation:
    sort=[]
    for j in i:
       sort.append(dictionary.get(j))
    frequency.append(sort)

In [449]:
# this function can be modified to get the top 5,10 or n recommendations for every customer by using N
# N=5
top_n=[]
recommend=[]
for i in range(len(recommendation)):
   to_sort= [(x, y) for x, y in zip(recommendation[i],frequency[i])]
   sorted_t = sorted(to_sort, key=lambda x: x[1], reverse=True) [:5]#[:N] to get the top n recommendation
   mt=[]
   for items in sorted_t:
      mt.append(items[0])
   top_n.append(mt)
   recommend.append(mt)

<h3>Scoring the Recommendations for every customer

The F1 score is a metric that combines precision and recall into a single value, providing a balanced measure of a model's performance. It is particularly useful in scenarios where you want to consider both precision and recall simultaneously.

In [431]:
ground_truth=[]
for ind,val in group.itemDescription.iteritems():
     ground_truth.append(val)

In [432]:
# Function to Calculate Precision Score
def precision(actual, predicted):
        true_positives = len(set(actual).intersection(predicted))
        return true_positives / len(predicted)
   

In [433]:
# Function to calculate Recall Score
def recall(actual, predicted):
    true_positives = len(set(actual).intersection(predicted))
    return true_positives / len(actual)

In [434]:
# Function to calculate f1 Score
def f1_score(actual, predicted):
    precision_score = precision(actual, predicted)
    recall_score = recall(actual, predicted)
    
    if precision_score + recall_score == 0:
        f1 = 0.0  # or any appropriate default value
    else:
        f1 = 2 * ((precision_score * recall_score) / (precision_score + recall_score))
    
    return f1

In [435]:
# Calculating F1 Score
for i in range(len(ground_truth)):
    f1=f1_score(ground_truth[i],recommendation[i])
    print(f"Evaluation of Recommendation for customer {df_number[i]} : {f1}")


Evaluation of Recommendation for customer 1000 : 0.39999999999999997
Evaluation of Recommendation for customer 1001 : 0.5294117647058824
Evaluation of Recommendation for customer 1002 : 0.4347826086956522
Evaluation of Recommendation for customer 1003 : 0.4444444444444444
Evaluation of Recommendation for customer 1004 : 0.5116279069767441
Evaluation of Recommendation for customer 1005 : 0.5
Evaluation of Recommendation for customer 1006 : 0.5384615384615384
Evaluation of Recommendation for customer 1008 : 0.4
Evaluation of Recommendation for customer 1009 : 0.5263157894736842
Evaluation of Recommendation for customer 1010 : 0.3225806451612903
Evaluation of Recommendation for customer 1011 : 0.5405405405405406
Evaluation of Recommendation for customer 1012 : 0.5625000000000001
Evaluation of Recommendation for customer 1013 : 0.5555555555555556
Evaluation of Recommendation for customer 1014 : 0.7058823529411764
Evaluation of Recommendation for customer 1015 : 0.4444444444444445
Evaluatio

<h3>Taking Pattern As input to Produce more Recommendations

In [436]:
pattern=pd.read_csv('./rules.csv') # Rules/Pattern generated from the first milestone

<h3>Pre-Processing Pattern Data

In [437]:
pattern['antecedents']=pattern['antecedents'].str.replace(r'[\[\],]', '').str.replace("'","")

In [438]:
pattern['consequents']=pattern['consequents'].str.replace(r'[\[\],]', '').str.replace("'", "")

In [439]:
pat_arr=pattern[['antecedents','consequents']].values

In [440]:
pat_dict={row[0]: row[1] for row in pat_arr[0:]}

<h3>Adding new recommendations using pattern

In [441]:
# Recommending more Items based on patterns
top_n
for i in range(len(top_n)):
    for j in top_n[i]:
        comp=pat_dict.get(j)
        if ((comp!=None) and (comp not in top_n[i])):
                top_n[i].append(pat_dict.get(j))

In [444]:
with_pattern=pd.DataFrame(columns=['Member number','Recommendation With Pattern'])
count=0
for items in top_n:
    if len(items)>5:
        with_pattern=with_pattern.append({'Member number':df_number[count],'Recommendation With Pattern':items},ignore_index=True)
        print(df_number[count],items)
    count+=1

with_pattern.to_csv('with_pattern.csv',index=False)

1003 ['other vegetables', 'rolls/buns', 'root vegetables', 'bottled water', 'sausage', 'whole milk']
1005 ['rolls/buns', 'shopping bags', 'whipped/sour cream', 'brown bread', 'margarine', 'soda']
1014 ['whole milk', 'yogurt', 'tropical fruit', 'sausage', 'pip fruit', 'rolls/buns']
1016 ['other vegetables', 'rolls/buns', 'pip fruit', 'shopping bags', 'bottled beer', 'soda', 'whole milk']
1018 ['rolls/buns', 'yogurt', 'root vegetables', 'tropical fruit', 'sausage', 'whole milk']
1019 ['other vegetables', 'shopping bags', 'beef', 'hamburger meat', 'UHT-milk', 'soda']
1024 ['other vegetables', 'root vegetables', 'tropical fruit', 'chicken', 'grapes', 'rolls/buns', 'soda']
1025 ['yogurt', 'sausage', 'shopping bags', 'canned beer', 'coffee', 'whole milk', 'soda']
1029 ['canned beer', 'frankfurter', 'beef', 'fruit/vegetable juice', 'dessert', 'other vegetables']
1034 ['other vegetables', 'soda', 'tropical fruit', 'bottled water', 'bottled beer', 'whole milk']
1036 ['whole milk', 'root vegetab

<h3>Testing the recommendation on the testing set

In [363]:
df_test=pd.read_csv('./Groceries data test.csv')

In [364]:
df_test_group=df_test[['Member_number','itemDescription']]

In [365]:
test_group=df_test_group.groupby('Member_number').agg(lambda x: x.tolist())

<h3>Accuracy of the recommendation for every Customer 

In [366]:
r=[]
# To calculate the accuracy of the recommendation
for i,val in test_group.itemDescription.iteritems():
        if i in num_dict:
            list1 = top_n[num_dict.get(i)]
        list2 = val
        acutal_size=len(list2)
        list2 = [item for item in list2 if item not in list1]
        remind_size=acutal_size-len(list2)
        r.append((remind_size/acutal_size)*100)
        #percentage of items got from recommendation
        print(f"The Accuracy of Prediction for {i}:{(remind_size/acutal_size)*100}")
        

The Accuracy of Prediction for 1000:40.0
The Accuracy of Prediction for 1001:0.0
The Accuracy of Prediction for 1002:0.0
The Accuracy of Prediction for 1003:66.66666666666666
The Accuracy of Prediction for 1004:33.33333333333333
The Accuracy of Prediction for 1006:80.0
The Accuracy of Prediction for 1008:75.0
The Accuracy of Prediction for 1009:33.33333333333333
The Accuracy of Prediction for 1010:0.0
The Accuracy of Prediction for 1011:100.0
The Accuracy of Prediction for 1012:0.0
The Accuracy of Prediction for 1013:12.5
The Accuracy of Prediction for 1014:50.0
The Accuracy of Prediction for 1015:33.33333333333333
The Accuracy of Prediction for 1016:0.0
The Accuracy of Prediction for 1017:0.0
The Accuracy of Prediction for 1018:0.0
The Accuracy of Prediction for 1019:0.0
The Accuracy of Prediction for 1022:100.0
The Accuracy of Prediction for 1023:20.0
The Accuracy of Prediction for 1026:25.0
The Accuracy of Prediction for 1027:50.0
The Accuracy of Prediction for 1028:0.0
The Accuracy