In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from apyori import apriori
warnings.filterwarnings('ignore')
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


In [6]:


class FrequentPatterns:
    # Converting given dataframes into suitable format for modelling
    def dataPreparation(df):
        # Create a result dataframe
        result_df = pd.DataFrame(columns=['member_number', 'date', 'items'])

        # Group the data by member_number and date
        grouped_df = df.groupby(['Member_number', 'Date'])

        # Iterate over each group
        for name, group in grouped_df:
            # Get the items for each group
            items = group['itemDescription'].values

            # Append the result to the result dataframe
            result_df = pd.concat([result_df, pd.DataFrame({'member_number': [name[0]], 'date': [name[1]], 'items': [items]})], ignore_index=True)
    
        # Drop the date column, member_number columns
        result_df.drop(['date', 'member_number'], axis=1, inplace=True)

        # Apriori library requires data to be list of lists
        resultList = []
        for i in range(len(result_df)):
            resultList.append(result_df['items'][i])

        # return the result list
        return resultList
    
    def trainData(df_train, support_threshold, confidence_threshold, lift_threshold, min_length):
        association_rules = apriori(df_train, min_support=support_threshold, min_confidence=confidence_threshold, min_lift=lift_threshold, min_length=min_length)
        association_results = list(association_rules)
        return association_results
    
    def convertResultsToDf(association_results):
        # result df
        result_df = pd.DataFrame(columns=['rules', 'antecedents', 'consequents', 'support', 'confidence', 'lift', 'test_antecedentAppeared', 'test_followedRule', 'test_accuracy'])
    
        # Iterate over each rule
        for item in association_results:
            rules = [x for x in item[0]]
            support = item[1]
            antecedents = [x for x in item[2][0][0]]
            consequents = [x for x in item[2][0][1]]
            confidence = item[2][0][2]
            lift = item[2][0][3]

            # Append the result to the result dataframe
            new_row = pd.DataFrame({'rules': [rules], 'antecedents': [antecedents], 'consequents': [consequents], 'support': [support], 'confidence': [confidence], 'lift': [lift], 'test_antecedentAppeared': [0], 'test_followedRule': [0], 'test_accuracy': [0]})
            result_df = pd.concat([result_df, new_row], ignore_index=True)
    
        # Return the result dataframe
        return result_df

    def testData(df_test, association_results, association_results_df):
        # Initialize variables for counting correct predictions and total instances
        correct_predictions = 0
        i = 0

        # Iterate over the association rules generated by the Apriori algorithm
        for item in association_results:
            antecedents = [x for x in item[2][0][0]]
            consequents = [x for x in item[2][0][1]]
            support = item[1]
            confidence = item[2][0][2]
            lift = item[2][0][3]
    
            # Count the number of times the association rule appears in the test dataset
            count = 0
            total_instances = 0
            for transaction in df_test:
                if set(antecedents).issubset(set(transaction)) and set(consequents).issubset(set(transaction)):
                    count += 1
                    total_instances += 1
                elif (set(antecedents).issubset(set(transaction)) and (len(antecedents)!=0)):
                    total_instances += 1
                
    
            # Update the variables for counting correct predictions and total instances
            correct_predictions += count

            # update the test accuracy in the result dataframe
            association_results_df['test_antecedentAppeared'][i] = total_instances
            association_results_df['test_followedRule'][i] = count
            association_results_df['test_accuracy'][i] = (count/total_instances)

            i=i+1
    
        return association_results_df
    
    def getPatterns(df_train, df_test, support_threshold, confidence_threshold, lift_threshold, min_length):
        # Convert the train and test data into suitable format
        df_train = FrequentPatterns.dataPreparation(df_train)
        df_test = FrequentPatterns.dataPreparation(df_test)

        # Train the model
        association_results = FrequentPatterns.trainData(df_train, support_threshold, confidence_threshold, lift_threshold, min_length)

        # Convert the results into dataframe
        association_results_df = FrequentPatterns.convertResultsToDf(association_results)

        # Test the model
        association_results_df = FrequentPatterns.testData(df_test, association_results, association_results_df)
        association_results_df = association_results_df[association_results_df['antecedents'].apply(len) > 0].reset_index(drop=True)

        return association_results_df

In [7]:
warnings.filterwarnings("ignore")
class Recommendation:
    def __init__(self, data_path=['./Groceries_data_train.csv','./Groceries data test.csv'], similarity_matrix_path=None):
        self.df = pd.concat([pd.read_csv(file_path) for file_path in data_path])
        self.df_item = self.df['itemDescription']
        self.df_number = sorted(self.df.Member_number.unique().tolist())
        self.group = self.df.groupby('Member_number')['itemDescription'].agg(lambda x: x.tolist()).to_frame()
        self.dictionary = self.df.groupby('itemDescription').apply(lambda dfg: dfg.shape[0]).to_dict()
        self.num_dict = {number: index for index, number in enumerate(self.df_number)}
        self.item_dict = dict(zip(self.df_item.unique(), np.arange(len(self.df_number) - 1)))
        self.purchase_matrix = self._create_purchase_matrix()
        if similarity_matrix_path:
            self.similarity_matrix = self._load_similarity_matrix(similarity_matrix_path)
        else:
            self.similarity_matrix = self._calculate_similarity_matrix()
    
    def _create_purchase_matrix(self):
        df_utility = pd.DataFrame(columns=self.df_item.unique())
        for number in self.df_number:
            row = [1 if item in self.group.itemDescription[number] else 0 for item in df_utility.columns]
            df_utility.loc[number] = row
        return csr_matrix(df_utility.values)
    
    def _calculate_similarity_matrix(self):
        num_customers = self.purchase_matrix.shape[0]
        similarity_matrix = np.zeros((num_customers, num_customers))

        for i in range(num_customers):
            purchase_vector_i = self.purchase_matrix[i]
            similarities = cosine_similarity(purchase_vector_i, self.purchase_matrix)
            similarity_matrix[i] = similarities[0]
        return csr_matrix(similarity_matrix)
  
    
    def get_recommendation(self, customer_number, k,method_number):
        a = self.num_dict.get(customer_number)
        similarities = self.similarity_matrix[a].toarray().flatten()
        sort_in = np.argsort(similarities)[::-1]
        neighbors = sort_in[1:k+1]
        
        recommendation_set = set()
        for items in neighbors:
            items_list = self.group.iloc[items].values.tolist()
            for sublist in items_list:
                recommendation_set.update(sublist)
        
        recommendations = list(recommendation_set)
        frequency = [self.dictionary.get(item) for item in recommendations]
        
        sorted_recommendations = [x for _, x in sorted(zip(frequency, recommendations), reverse=True)][:k]
        if method_number==1:
            return sorted_recommendations
        else:
            train = pd.read_csv('./Groceries_data_train.csv')
            test = pd.read_csv('./Groceries data test.csv')
            # rules_df = FrequentPatterns.getPatterns(train, test, 0.001, 0.1, 1, 2)
            # rules_df = FrequentPatterns.getPatterns(train, test, 0.0009, 0.1, 1, 2)
            # rules_df = FrequentPatterns.getPatterns(train, test, 0.0009, 0.08, 1, 2)
            pattern = FrequentPatterns.getPatterns(train,test, 0.0009, 0.08, 0.9, 2)    
            pat_arr=pattern[['antecedents','consequents']].values
            pat_dict = {tuple(row[0]): row[1] for row in pat_arr}
            # Recommending more Items based on patterns
            for j in sorted_recommendations:
                comp=pat_dict.get(j)
                if ((comp!=None) and (comp not in sorted_recommendations)):
                        sorted_recommendations.append(pat_dict.get(j))
            return sorted_recommendations


In [8]:
rec=Recommendation()

In [13]:
rec.get_recommendation(1001,10,1)

['whole milk',
 'other vegetables',
 'rolls/buns',
 'soda',
 'yogurt',
 'tropical fruit',
 'bottled water',
 'sausage',
 'citrus fruit',
 'pastry']

In [14]:
print(rec.get_recommendation(1001,10,2))

['whole milk', 'other vegetables', 'rolls/buns', 'soda', 'yogurt', 'tropical fruit', 'bottled water', 'sausage', 'citrus fruit', 'pastry']
