In [None]:
# from IPython.display import HTML
# HTML('''<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js "></script><script>
# code_show=true; 
# function code_toggle() {
# if (code_show){
# $('div.jp-CodeCell > div.jp-Cell-inputWrapper').hide();
# } else {
# $('div.jp-CodeCell > div.jp-Cell-inputWrapper').show();
# }
# code_show = !code_show
# } 
# $( document ).ready(code_toggle);</script><form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
# ''')

# Importation of Libraries & Defining Functions

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Custom
# from assistant import *

# Misc
import fim
import sqlite3
import re
import os
import joblib
from PIL import Image
from urllib.parse import urljoin
from tqdm.notebook import tqdm, trange

# NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Clustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import (KMeans,
                             AgglomerativeClustering,
                             DBSCAN, OPTICS, cluster_optics_dbscan)
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

from scipy.spatial.distance import euclidean, cityblock, cosine
from sklearn.metrics import (calinski_harabasz_score, davies_bouldin_score,
                             silhouette_score, adjusted_mutual_info_score,
                             adjusted_rand_score, confusion_matrix)

# sns.set_theme('notebook', 'darkgrid', 'colorblind')

## Global Helper Functions

In [None]:
def save_pkl(obj, name, prompt=True):
    """Serialize `obj` with joblib into the local 'pickles' folder.

    Parameters
    ----------
    obj : object
        Object to persist.
    name : str
        Target file name; '.pkl' is appended unless `name` already ends
        with it.
    prompt : bool, default True
        Print a confirmation message once the file has been written.
    """
    folder, ext = 'pickles', '.pkl'

    # Create the cache folder on first use.
    if not os.path.exists(folder):
        os.makedirs(folder)

    file_name = name if name[-4:] == ext else name + ext
    joblib.dump(obj, os.path.join(folder, file_name))

    if prompt:
        print('Object pickled for future use.')

def load_pkl(name, prompt=False):
    """Load an object previously stored in the local 'pickles' folder.

    Parameters
    ----------
    name : str
        File name; '.pkl' is appended unless `name` already ends with it.
    prompt : bool, default False
        Print a confirmation message after loading.

    Returns
    -------
    object
        Whatever `joblib.load` returns for the file.

    Raises
    ------
    ValueError
        If the 'pickles' folder does not exist. Errors raised by
        `joblib.load` (for example a missing file) propagate unchanged.
    """
    folder, ext = 'pickles', '.pkl'
    if not os.path.exists(folder):
        raise ValueError("'pickles' folder does not exist.")

    file_name = name if name[-4:] == ext else name + ext
    # joblib.load unpickles the file, so only load files you trust.
    pkl = joblib.load(os.path.join(folder, file_name))

    if prompt:
        print('Pickle file loaded.')

    return pkl


def lemmatize(series, stop_words, lemmatizer=None):
    """Return a pandas Series of lower-cased, stop-word-free, lemmatized text.

    Parameters
    ----------
    series : pandas.Series
        Series of strings to clean.
    stop_words : collection of str
        Words to discard before lemmatization.
    lemmatizer : object, optional
        Any object exposing `lemmatize(word)`. A `WordNetLemmatizer` is
        created on demand when omitted, so it is no longer instantiated at
        function-definition time.

    Returns
    -------
    pandas.Series
        Same index as `series`; each value is the space-joined list of
        lemmatized tokens.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Hash-based membership instead of a linear scan of a list per token.
    if isinstance(stop_words, (list, tuple)):
        stop_lookup = frozenset(stop_words)
    else:
        stop_lookup = stop_words

    # Tokens are runs of lowercase ASCII letters and hyphens.
    word_pattern = re.compile(r'\b[a-z-]+\b')

    def clean_text(text):
        """Casefold, tokenize, drop stop words and lemmatize one string."""
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in word_pattern.findall(text.casefold())
            if word not in stop_lookup
        )

    return series.apply(clean_text)

def vectorize(corpus, params):
    """Fit a TF-IDF vectorizer on `corpus` and return the weights as a frame.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    params : dict
        Keyword arguments forwarded to `TfidfVectorizer`.

    Returns
    -------
    pandas.DataFrame
        Sparse-backed frame with one row per document and one column per
        vocabulary term.
    """
    tfidf = TfidfVectorizer(**params)
    weights = tfidf.fit_transform(corpus)
    vocabulary = tfidf.get_feature_names_out()

    return pd.DataFrame.sparse.from_spmatrix(weights, columns=vocabulary)

def get_products():
    """Return the product titles for the ids stored in the 'product_ids' pickle.

    The result is loaded from the 'product_titles' pickle when available;
    otherwise it is read from the `products` table of the project database
    and cached.

    Returns
    -------
    pandas.Series
        Product titles indexed by `product_id` (the single-column query
        result after `squeeze()`).
    """
    try:
        product_titles = load_pkl('product_titles')
    except Exception:
        # Cache miss (or unreadable cache): rebuild from the database.
        # `Exception` rather than a bare except so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        product_ids = load_pkl('product_ids')
        # Render the id list without the trailing comma that tuple() adds
        # for a single element, which is invalid SQL.
        id_list = ', '.join(map(repr, product_ids))
        q = f"""
            SELECT product_id, product_title
            FROM products
            WHERE product_id in ({id_list})
        """
        conn = sqlite3.connect(
            '/mnt/processed/private/msds2023/lt5/dmw2-project/amazon.db'
        )
        try:
            product_titles = (
                pd.read_sql(q, conn).set_index('product_id').squeeze()
            )
        finally:
            # Always release the database handle.
            conn.close()
        save_pkl(product_titles, 'product_titles')

    return product_titles

## Procedural Functions

1. `db`  : Database creation
1. `pr`  : Pruning customers and products
1. `vc`  : Creation of TFIDF matrix
1. `cl`  : Clustering

### Database Creation Functions

In [None]:
def db_get_files(amazon_dir='/mnt/data/public/amazon-reviews/', cat_list=None):
    """Get the file paths of the chosen Amazon review categories.

    Parameters
    ----------
    amazon_dir : str, default '/mnt/data/public/amazon-reviews/'
        Directory that holds the review files.
    cat_list : list of str, optional
        Category names to keep, matched against the text between 'us_' and
        '_v1' in each file name. Defaults to Home, Home_Improvement,
        Furniture and Major_Appliances.

    Returns
    -------
    dict
        Mapping of category name to file path. If several files map to the
        same category, the one listed last by `os.listdir` wins.
    """
    if cat_list is None:
        cat_list = ['Home', 'Home_Improvement', 'Furniture',
                    'Major_Appliances']
    wanted = set(cat_list)

    cat_paths = {}
    for fname in os.listdir(amazon_dir):
        match = re.search(r'us_(.*)_v1', fname)
        if match is None:
            continue
        category = match.group(1)
        if category in wanted:
            # os.path.join instead of urljoin: urljoin silently drops the
            # last path segment when the base has no trailing slash.
            cat_paths[category] = os.path.join(amazon_dir, fname)
    return cat_paths


def db_create_products(cat_paths, conn):
    """Build the `products` table from the category review files.

    Parameters
    ----------
    cat_paths : dict
        Mapping of category name to gzip-compressed TSV path.
    conn : database connection
        Connection passed to `DataFrame.to_sql`; an existing `products`
        table is replaced.

    Returns
    -------
    list
        Unique product ids written to the table.
    """
    wanted_cols = ['product_id', 'product_title', 'product_category']

    # One frame per category file, read in the order given by `cat_paths`.
    frames = [
        pd.read_csv(
            tsv_path,
            sep='\t',
            compression='gzip',
            on_bad_lines='skip',
            low_memory=False,
            usecols=wanted_cols,
        )
        for tsv_path in cat_paths.values()
    ]

    # Remove incomplete rows, then keep the first row seen per product.
    prod_df = (
        pd.concat(frames)
        .dropna()
        .drop_duplicates(subset=['product_id', 'product_title'])
        .drop_duplicates(subset=['product_id'])
    )
    products = prod_df['product_id'].unique().tolist()

    prod_df.to_sql('products', conn, if_exists='replace', index=False)

    return products


def db_create_rates(cat_paths, conn, products):
    """Build the `ratings` table from the category review files.

    Parameters
    ----------
    cat_paths : dict
        Mapping of category name to gzip-compressed TSV path.
    conn : database connection
        Connection passed to `DataFrame.to_sql`; an existing `ratings`
        table is replaced.
    products : list
        Product ids to keep; ratings of other products are discarded.

    Returns
    -------
    list
        Unique review ids written to the table.
    """
    rating_cols = ['customer_id', 'product_id', 'review_id', 'star_rating']
    frames = [
        pd.read_csv(
            tsv_path,
            sep='\t',
            compression='gzip',
            on_bad_lines='skip',
            low_memory=False,
            usecols=rating_cols,
        )
        for tsv_path in cat_paths.values()
    ]
    # ignore_index gives every row a unique label; the per-file labels
    # repeat across files.
    rate_df = pd.concat(frames, ignore_index=True)
    rate_df = (
        rate_df[rate_df['product_id'].isin(products)]
        .dropna()
        .drop_duplicates(subset=['product_id', 'review_id'])
    )

    # Malformed rows carry non-numeric text (e.g. a date) in `star_rating`.
    # Filter them with a boolean mask: dropping by index label would also
    # remove valid rows from other files that share the same label.
    stars = pd.to_numeric(rate_df['star_rating'], errors='coerce')
    is_numeric = stars.notna()
    rate_df = rate_df.loc[is_numeric].assign(
        star_rating=stars[is_numeric].astype('int')
    )
    rates = rate_df['review_id'].unique().tolist()

    rate_df.to_sql('ratings', conn, if_exists='replace', index=False)

    return rates
 

def db_create_reviews(cat_paths, conn, rates):
    """Build the `reviews` table from the category review files.

    Parameters
    ----------
    cat_paths : dict
        Mapping of category name to gzip-compressed TSV path.
    conn : database connection
        Connection passed to `DataFrame.to_sql`; an existing `reviews`
        table is replaced.
    rates : list
        Review ids to keep; other reviews are discarded.
    """
    review_cols = ['review_id', 'review_headline', 'review_body',
                   'review_date']
    frames = [
        pd.read_csv(
            tsv_path,
            sep='\t',
            compression='gzip',
            on_bad_lines='skip',
            low_memory=False,
            usecols=review_cols,
        )
        for tsv_path in cat_paths.values()
    ]
    all_reviews = pd.concat(frames)

    # Keep only the reviews whose id is in `rates`.
    kept = all_reviews[all_reviews['review_id'].isin(rates)]
    kept.to_sql('reviews', conn, if_exists='replace', index=False)
    
    
def db_create(cat_paths):
    """Create the project database from the given category file paths.

    Builds the `products` table and then the `ratings` table, restricted
    to the products that were kept. The connection is closed when done,
    including when one of the steps raises.

    Parameters
    ----------
    cat_paths : dict
        Mapping of category name to review file path.
    """
    conn = sqlite3.connect(
        '/mnt/processed/private/msds2023/lt5/dmw2-project/amazon.db'
    )
    try:
        products = db_create_products(cat_paths, conn)
        rates = db_create_rates(cat_paths, conn, products)
        # The reviews step is currently disabled; `rates` is its input.
        # db_create_reviews(cat_paths, conn, rates)
    finally:
        conn.close()

### Pruning Functions

In [None]:
def pr_prune_data(cust_thresh=62, prod_thresh=610):
    """Return the ratings of customers with more than `cust_thresh` ratings.

    The result is cached as 'df_pruned_{cust_thresh}_{prod_thresh}'.

    NOTE(review): `prod_thresh` only takes part in the cache name. The
    original code computed a list of products with more than `prod_thresh`
    ratings but never applied it to the final query, so products are not
    pruned. That behaviour is preserved here; confirm whether a product
    filter was intended before adding one, because it changes every
    downstream result.

    Parameters
    ----------
    cust_thresh : int, default 62
        A customer is kept when their number of ratings (among the
        products listed in the 'product_ids' pickle) exceeds this value.
    prod_thresh : int, default 610
        Used in the cache name only (see note above).

    Returns
    -------
    pandas.DataFrame
        Columns `customer_id`, `product_id`, `star_rating` for all ratings
        made by the kept customers.
    """
    cache_name = f'df_pruned_{cust_thresh}_{prod_thresh}'
    try:
        df_pruned = load_pkl(cache_name)
    except Exception:
        # Cache miss: rebuild from the database. `Exception` keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        product_ids = load_pkl('product_ids')
        conn = sqlite3.connect(
            '/mnt/processed/private/msds2023/lt5/dmw2-project/amazon.db'
        )
        try:
            # Join the values ourselves: tuple() renders a one-element
            # list as "(x,)", which is not valid SQL.
            product_list = ', '.join(map(repr, product_ids))
            query = f'''
                SELECT customer_id, product_id
                FROM ratings
                WHERE product_id in ({product_list})
            '''
            ratings_df = pd.read_sql(query, conn)

            cust_count = ratings_df.customer_id.value_counts()
            considered_customers = (
                cust_count[cust_count > cust_thresh].index.tolist()
            )

            customer_list = ', '.join(map(repr, considered_customers))
            query = f'''
                SELECT
                    ratings.customer_id,
                    ratings.product_id,
                    ratings.star_rating
                FROM ratings
                WHERE ratings.customer_id IN ({customer_list})
            '''
            df_pruned = pd.read_sql(query, conn)
        finally:
            conn.close()
        save_pkl(df_pruned, cache_name)

    return df_pruned

### Clustering Functions

In [None]:
def cl_get_features(product_titles, params):
    """Lemmatize `product_titles` and vectorize them for clustering.

    Both stages are cached ('lemmd_titles' and 'product_profiles'). The
    cache names do not depend on `params`, so delete the pickles after
    changing the stop words or vectorizer settings.

    Parameters
    ----------
    product_titles : pandas.Series
        Product titles; its index becomes the index of the result.
    params : dict
        Keyword arguments for the TF-IDF vectorizer. `params['stop_words']`
        is also used as the stop-word list for lemmatization.

    Returns
    -------
    pandas.DataFrame
        TF-IDF product profiles indexed like `product_titles`.
    """
    # Lemmatize
    try:
        lemmd_titles = load_pkl('lemmd_titles')
    except Exception:
        # Cache miss; a bare except would also trap KeyboardInterrupt.
        lemmd_titles = lemmatize(product_titles, params['stop_words'])
        save_pkl(lemmd_titles, 'lemmd_titles')

    # Vectorize
    try:
        product_profiles = load_pkl('product_profiles')
    except Exception:
        product_profiles = vectorize(lemmd_titles, params)
        product_profiles.index = product_titles.index
        save_pkl(product_profiles, 'product_profiles')

    return product_profiles
            

def cl_cluster_range(tfidf_matrix, drop_list, n_clusters=9):
    """Run KMeans on the input matrix for a range of cluster counts.

    The result is cached as 'cluster_dict'; the cache name does not depend
    on the arguments.

    Parameters
    ----------
    tfidf_matrix : pandas.DataFrame
        Feature matrix, one row per item.
    drop_list : list
        Column labels removed from `tfidf_matrix` before clustering.
    n_clusters : int, default 9
        Number of different k values to try. k runs from 2 to
        `n_clusters + 1` inclusive (2..10 by default).

    Returns
    -------
    dict
        Mapping of k to the array of cluster labels from
        `KMeans(n_clusters=k, random_state=1337).fit_predict`.
    """
    try:
        cluster_dict = load_pkl('cluster_dict')
    except Exception:
        # Cache miss; `Exception` lets KeyboardInterrupt propagate.
        features = tfidf_matrix.drop(drop_list, axis=1)
        cluster_dict = {}
        for k in tqdm(range(2, n_clusters + 2)):
            kmeans = KMeans(n_clusters=k, random_state=1337)
            cluster_dict[k] = kmeans.fit_predict(features)
        save_pkl(cluster_dict, 'cluster_dict')

    return cluster_dict


def cl_plot_all(cluster_dict, product_profiles, wc):
    """Plot one figure of word clouds per clustering in `cluster_dict`.

    Each figure is a grid with five word clouds per row, one per cluster.

    Parameters
    ----------
    cluster_dict : dict
        Mapping of cluster count to an array of cluster labels; labels are
        expected to be the integers 0..k-1.
    product_profiles : pandas.DataFrame
        Term weights; rows are positionally aligned with the label arrays.
    wc : WordCloud
        Word cloud generator exposing `generate_from_frequencies`.
    """
    n_cols = 5
    for n_clusters, labels in cluster_dict.items():
        # Ceiling division: the previous `n // 5 + 1` allocated an extra,
        # entirely blank row whenever n_clusters was a multiple of 5.
        n_rows = -(-n_clusters // n_cols)
        fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, n_rows*3))
        ax = ax.flatten()

        ax[0].set_title(f'No of Clusters: {n_clusters}',
                        loc='left',
                        fontsize=16)
        for i in range(n_clusters):
            idx = np.argwhere(labels == i).flatten()
            filtered_weights = product_profiles.iloc[idx].sum()
            ax[i].imshow(wc.generate_from_frequencies(filtered_weights),
                         interpolation='spline16')
            ax[i].set_axis_off()

        # Remove the unused axes at the end of the last row.
        for spare_ax in ax[n_clusters:]:
            fig.delaxes(spare_ax)


def cl_plot_cluster(cluster_dict, profiles, n_clusters, wc,
                    cluster_names=None):
    """Plot a word cloud and a top-word bar chart for every cluster.

    Parameters
    ----------
    cluster_dict : dict
        Mapping of cluster count to an array of cluster labels.
    profiles : pandas.DataFrame
        Term weights; rows are positionally aligned with the labels.
    n_clusters : int
        Key of `cluster_dict` to plot; also the number of subplot rows.
    wc : WordCloud
        Word cloud generator exposing `generate_from_frequencies`.
    cluster_names : sequence of str, optional
        Titles for the word clouds, in sorted-label order.
    """
    labels = cluster_dict[n_clusters]
    fig, ax = plt.subplots(n_clusters, 2, figsize=(6.4*2, 4.8*n_clusters),
                           dpi=100)
    fig.subplots_adjust(wspace=0.4)
    ax = ax.flatten()

    for row, label in enumerate(np.unique(labels)):
        # Left panel: word cloud, right panel: 14 heaviest terms.
        cloud_ax, bar_ax = ax[2*row], ax[2*row + 1]

        members = np.argwhere(labels == label).flatten()
        weights = profiles.iloc[members].sum()

        cloud_ax.imshow(wc.generate_from_frequencies(weights),
                        interpolation='spline16')
        if cluster_names is not None:
            cloud_ax.set_title(cluster_names[row])
        cloud_ax.set_axis_off()

        top_words = weights.sort_values(ascending=False).head(14)
        sns.barplot(x=top_words.to_numpy(), y=top_words.index, ax=bar_ax,
                    color='gold')
        
        
def cl_plot_feat(tfidf_matrix, cluster_dict, num_feats=15, return_fig=False):
    """Project the TF-IDF matrix onto its SVD components and plot them.

    The left panel scatters the items on the two components with the
    largest explained-variance ratio; the right panel draws the loadings
    of the `num_feats` features with the largest total weight.

    NOTE(review): the scatter uses the components re-ordered by explained
    variance, while the arrows use the first two rows of `components_` in
    their original order. Confirm both panels refer to the same components.

    Parameters
    ----------
    tfidf_matrix : pandas.DataFrame
        Feature matrix, one row per item.
    cluster_dict : dict
        Not used by this function.
    num_feats : int, default 15
        Number of feature arrows to draw.
    return_fig : bool, default False
        Return the figure instead of the projected data.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        The figure when `return_fig` is true, otherwise the projected
        items with columns ordered by explained-variance ratio.
    """
    svd = TruncatedSVD(n_components=tfidf_matrix.shape[1],
                       random_state=1337)
    projected = pd.DataFrame(
        svd.fit_transform(tfidf_matrix),
        index=tfidf_matrix.index
    )
    # Order the projected columns by decreasing explained-variance ratio.
    variance_order = np.argsort(svd.explained_variance_ratio_)[::-1]
    projected = projected.iloc[:, variance_order]

    loadings = svd.components_.T
    feature_names = tfidf_matrix.columns
    fig, (scatter_ax, arrow_ax) = plt.subplots(1, 2, figsize=(6.4*2, 4.8))

    sns.scatterplot(ax=scatter_ax, x=projected.iloc[:, 0],
                    y=projected.iloc[:, 1])
    scatter_ax.set(xlabel='SV1', ylabel='SV2')

    # Features with the largest column sums.
    top_feats = np.argsort(tfidf_matrix.sum().to_numpy())[::-1][:num_feats]

    for feature, vec in zip(feature_names[top_feats], loadings[top_feats]):
        arrow_ax.arrow(0, 0, vec[0], vec[1], lw=6, ec='none', fc='r')
        arrow_ax.text(vec[0], vec[1], feature, ha='center', color='r',
                      fontsize=12)

    return fig if return_fig else projected
    
    
def cl_plot_wc(cluster_dict, profiles, n_clusters, label, wc,
               cluster_name=None):
    """Plot the word cloud of a single cluster.

    Parameters
    ----------
    cluster_dict : dict
        Mapping of cluster count to cluster labels. The entry may be a
        plain label array (as produced by `cl_cluster_range`) or a dict /
        DataFrame that holds the array under the key 'labels'.
    profiles : pandas.DataFrame
        Term weights; rows are positionally aligned with the labels.
    n_clusters : int
        Key of `cluster_dict` to use.
    label : int
        Cluster label to plot.
    wc : WordCloud
        Word cloud generator exposing `generate_from_frequencies`.
    cluster_name : str, optional
        Figure title.

    Returns
    -------
    matplotlib.figure.Figure
    """
    entry = cluster_dict[n_clusters]
    # Previously only the {'labels': ...} layout worked; the bare arrays
    # stored by cl_cluster_range raised on the 'labels' lookup.
    if isinstance(entry, (dict, pd.DataFrame)):
        entry = entry['labels']
    idx = np.argwhere(np.asarray(entry) == label).flatten()

    filtered_weights = profiles.iloc[idx].sum()

    fig = plt.figure(figsize=(6.4*2, 4.8*2))
    plt.imshow(wc.generate_from_frequencies(filtered_weights),
               interpolation='spline16')
    if cluster_name:
        plt.title(cluster_name)
    plt.axis('off')
    fig.canvas.draw()

    return fig

## Recommender System Functions

1. `fim` : Frequent-Itemset Mining (FIM-based) recommender system
1. `cb`  : Content-based recommender system
1. `pop`  : Popularity-based recommender system

###  FIM-based Functions

In [None]:
def fim_get_amazon():
    """Load the rating records used to build the FIM transactions.

    Returns the 'fim_df_raw' pickle when available. Otherwise joins the
    `ratings` and `reviews` tables on `review_id`, restricted to the ids in
    the 'product_ids' pickle, and caches the result.

    Returns
    -------
    pandas.DataFrame
        Columns `review_id`, `customer_id`, `product_id`, `star_rating`
        and `review_date` (parsed as dates).
    """
    try:
        fim_df_raw = load_pkl('fim_df_raw')
        print('Pickle file loaded!')
    except Exception:
        # Cache miss; `Exception` lets KeyboardInterrupt propagate.
        product_ids = load_pkl('product_ids')
        # tuple() would render a single id as "(x,)", which is invalid SQL.
        id_list = ', '.join(map(repr, product_ids))
        q = f"""
            select
                rate.review_id
                ,rate.customer_id
                ,rate.product_id
                ,rate.star_rating
                ,rev.review_date
            from
                ratings as rate
                ,reviews as rev
            where
                rate.review_id = rev.review_id
                and rate.product_id in ({id_list})
        """
        conn = sqlite3.connect(
            '/mnt/processed/private/msds2023/lt5/dmw2-project/amazon.db'
        )
        try:
            fim_df_raw = pd.read_sql(q, conn, parse_dates=['review_date'])
        finally:
            conn.close()
        save_pkl(fim_df_raw, 'fim_df_raw')

    return fim_df_raw


def fim_tune_period(df, n_reviews=None):
    """Summarize transaction statistics for several resampling periods.

    A transaction is the set of products that one customer rated at or
    above their own mean rating within one period. For each period the
    function records the largest transaction size, the number of
    transactions and the number of single-item transactions. The result
    is cached as 'fim_df_res'; the cache name does not depend on the
    arguments.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `customer_id`, `product_id`, `star_rating` and a datetime
        `review_date` column.
    n_reviews : int, optional
        Use only the first `n_reviews` rows; all rows when None.

    Returns
    -------
    pandas.DataFrame
        Indexed by period, with columns `max_lens`, `n_transactions` and
        `single_itemsets`.
    """
    try:
        fim_df_res = load_pkl('fim_df_res')
        print('Pickle file loaded!')
    except Exception:
        # Cache miss; `Exception` lets KeyboardInterrupt propagate.
        periods = ['2W', '1M', '3M', '6M', 'Y']
        max_lens = []
        n_transactions = []
        single_itemsets = []

        # Slice once, then copy only the slice per period (the original
        # copied the whole frame on every iteration before slicing).
        df_head = df.iloc[:n_reviews]
        for period in tqdm(periods):
            df_try = df_head.copy()

            df_try['review_date'] = (
                df_try.loc[:, 'review_date'].dt.to_period(period)
            )
            # One row per (customer, period), one column per product.
            df_try = (
                df_try.pivot_table(
                    index=['customer_id', 'review_date'],
                    columns='product_id',
                    values='star_rating',
                )
            )
            # Center each row on its mean; unrated products stay NaN.
            means = df_try.mean(axis=1)
            df_db = (df_try
                     .sub(means, axis=0)
                     .reset_index()
                     .melt(id_vars=['customer_id', 'review_date'],
                           value_name='star_rating'))
            # Keep ratings at or above the row mean; NaN fails the test.
            mask = df_db['star_rating'] >= 0
            df_db = df_db[mask]
            df_db = (
                df_db.groupby(['customer_id', 'review_date'])
                ['product_id'].unique()
            )

            transaction_lens = df_db.apply(len)

            single_itemsets.append((transaction_lens == 1).sum())
            max_lens.append(transaction_lens.max())
            n_transactions.append(df_db.shape[0])

        fim_df_res = pd.DataFrame(
            {'max_lens': max_lens,
             'n_transactions': n_transactions,
             'single_itemsets': single_itemsets},
            index=periods
        )
        save_pkl(fim_df_res, 'fim_df_res')

    return fim_df_res


def fim_plot_tuning(df_res):
    """Plot each transaction statistic against the resampling period.

    Draws one line plot per column of `df_res`, stacked vertically, with
    the index of `df_res` on the x axis.

    Parameters
    ----------
    df_res : pandas.DataFrame
        One column per statistic; titles are assigned by column position.
    """
    n_panels = df_res.shape[1]
    fig, axs = plt.subplots(n_panels, 1, figsize=(15, 4*n_panels))
    fig.tight_layout(h_pad=5)

    titles = ['Maximum Number of Products in a Transaction',
              'Number of Transactions',
              'Number of Single-Itemset Transactions']
    for position, column in enumerate(df_res.columns):
        panel = axs[position]
        sns.lineplot(y=df_res[column], x=df_res.index, ax=panel)
        panel.set_title(titles[position], fontsize=15)
        

def fim_get_rules(df, supp, conf):
    """Get the pairwise association rules for the recommender system.

    Returns the pickle 'fim_df_{supp}_{conf}' when it exists. Otherwise the
    transaction database, the frequent 2-itemsets (`num`) and the frequent
    1-itemsets (`denom`) are loaded from their own pickles or rebuilt, the
    rule confidence is computed as pair support / antecedent support, and
    rules whose confidence is at least `conf`/100 are kept.

    NOTE(review): the intermediate pickles 'df_db', 'num' and 'denom' are
    stored under fixed names, so a call with a different `df` or `supp`
    appears to reuse itemsets mined for the earlier arguments -- confirm
    and clear the pickles when changing them.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `customer_id` and `product_id` columns.
    supp : number
        Minimum support forwarded to `fim.fpgrowth`.
    conf : number
        Minimum confidence in percent.

    Returns
    -------
    pandas.DataFrame
        Confidence matrix with antecedents as rows and consequents as
        columns; pairs without a rule are filled with 0.
    """
    try:
        fim_df = load_pkl(f'fim_df_{supp}_{conf}')
    except:
        # Single-iteration trange: runs the body once and shows a progress
        # bar for the step.
        for _ in trange(1):
            try:
                df_db = load_pkl('df_db')
                print('Transaction database loaded')
            except:
                print('Transaction database loading failed')
                # One transaction per customer: the list of distinct
                # products in that customer's rows.
                df_db = (
                    df
                    .groupby(['customer_id'])['product_id']
                    .unique()
                    .apply(lambda x: list(x))
                    .tolist()
                )
                # Single-item transactions cannot contribute to a pair.
                df_db = [x for x in df_db if len(x) > 1]
                save_pkl(df_db, 'df_db')
        for _ in trange(1):
            try:
                num = load_pkl('num')
                denom = load_pkl('denom')
            except:
                # Itemsets of exactly two items (zmin = zmax = 2).
                # presumably report='a' yields absolute support counts --
                # verify against the PyFIM documentation.
                num = fim.fpgrowth(
                    df_db,
                    target='s',
                    zmin=2,
                    zmax=2,
                    supp=supp,
                    report='a'
                )
                save_pkl(num, 'num')
                # Itemsets of exactly one item (zmin = zmax = 1).
                denom = fim.fpgrowth(
                    df_db,
                    target='s',
                    zmin=1,
                    zmax=1,
                    supp=supp,
                    report='a'
                )
                save_pkl(denom, 'denom')

            # item -> reported support of the single item
            dict_denom = {x[0][0]: x[1] for x in denom}
            results = []
            for itemset, value in num:
                # NOTE(review): because of the `elif`, the reverse rule is
                # only recorded when the forward rule misses the
                # confidence threshold -- confirm this is intended.
                if value/dict_denom[itemset[0]] >= conf/100:
                    results.append({
                        'antecedent': itemset[0],
                        'consequent': itemset[1],
                        'confidence': value/dict_denom[itemset[0]]
                    })
                elif value/dict_denom[itemset[1]] >= conf/100:
                    results.append({
                        'antecedent': itemset[1],
                        'consequent': itemset[0],
                        'confidence': value/dict_denom[itemset[1]]
                    })

        # Long rule list -> antecedent x consequent confidence matrix.
        fim_df = (pd.DataFrame()
                  .from_dict(results)
                  .pivot_table(index='antecedent',
                               columns='consequent',
                               values='confidence',
                               fill_value=0))
        save_pkl(fim_df, f'fim_df_{supp}_{conf}')

    return fim_df

### Content-based Functions

In [None]:
def cb_get_user_profiles(df_user_rates, product_profiles, cust_thresh):
    """Get the profiles of all users whose `n_rate` is above the threshold.

    A user profile is the column-wise mean of the profiles of the products
    listed for that user. The result is cached as 'user_profiles'; the
    cache name does not depend on `cust_thresh`.

    Parameters
    ----------
    df_user_rates : pandas.DataFrame
        Indexed by customer id, with a `product_id` column holding a list
        of product ids and an `n_rate` column.
    product_profiles : pandas.DataFrame
        Product profiles indexed by product id.
    cust_thresh : int
        Only users with `n_rate > cust_thresh` get a profile.

    Returns
    -------
    pandas.DataFrame
        One row per kept user, one column per profile feature.
    """
    try:
        user_profiles = load_pkl('user_profiles')
    except Exception:
        # Cache miss; `Exception` lets KeyboardInterrupt propagate.
        mask = df_user_rates['n_rate'] > cust_thresh
        indices = df_user_rates[mask].index

        user_profiles = pd.DataFrame(index=indices,
                                     columns=product_profiles.columns)

        for cust_id in indices:
            prod_list = df_user_rates.loc[cust_id, 'product_id']

            # Average the profiles of every product this user rated.
            user_profiles.loc[cust_id, :] = (
                product_profiles.loc[prod_list].mean(axis=0)
            )

        save_pkl(user_profiles, 'user_profiles')

    return user_profiles


def cb_get_util(df_prune):
    """Build the customer-by-product utility matrix.

    Parameters
    ----------
    df_prune : pandas.DataFrame
        Needs `customer_id`, `product_id` and `star_rating` columns.

    Returns
    -------
    pandas.DataFrame
        Rows are customers, columns are products, values are the mean
        `star_rating` of each pair; pairs without a rating are NaN.
    """
    return pd.pivot_table(
        df_prune,
        values='star_rating',
        index='customer_id',
        columns='product_id',
        aggfunc='mean',
    )

### Popularity-based Functions

In [None]:
def pop_get_weights(df_all):
    """Get the weighted rating of each product.

    For every product, each star value is weighted by the share of that
    product's ratings that carry it, and the weighted values are summed.
    This equals the product's mean star rating (up to floating-point
    rounding).

    The original call passed the string 'rates' as the first positional
    argument of `value_counts`. That parameter is `normalize`, so the
    truthy string already selected relative frequencies; it is now spelled
    out as `normalize=True` and the result is unchanged.

    NOTE(review): if raw counts were intended (total stars per product),
    use `normalize=False` -- confirm the intended popularity measure.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Needs `product_id` and `star_rating` columns.

    Returns
    -------
    pandas.Series
        Named 'wRate', indexed by `product_id`.
    """
    rating_share = (
        df_all
        .groupby('product_id')['star_rating']
        .value_counts(normalize=True)
        .rename('rates')
        .reset_index()
    )
    rating_share['wRate'] = (
        rating_share['star_rating'] * rating_share['rates']
    )

    return rating_share.groupby('product_id')['wRate'].sum()

### Rating Checker Function

In [None]:
def get_user_rates():
    """Get every user's list of rated products and the size of that list.

    Returns the 'df_user_rates' pickle when available. Otherwise joins the
    `ratings` and `reviews` tables on `review_id`, restricted to the ids in
    the 'product_ids' pickle, groups the rows by customer and caches the
    result.

    Returns
    -------
    pandas.DataFrame
        Indexed by `customer_id`, with `product_id` (list of distinct
        product ids) and `n_rate` (length of that list).
    """
    try:
        df_user_rates = load_pkl('df_user_rates')
    except Exception:
        # Cache miss; `Exception` lets KeyboardInterrupt propagate.
        product_ids = load_pkl('product_ids')
        # tuple() would render a single id as "(x,)", which is invalid SQL.
        id_list = ', '.join(map(repr, product_ids))
        q = f"""
            SELECT
                rate.review_id
                ,rate.customer_id
                ,rate.product_id
                ,rate.star_rating
                ,rev.review_date
            FROM
                ratings as rate
                ,reviews as rev
            WHERE
                rate.review_id = rev.review_id
                AND rate.product_id in ({id_list})
        """
        conn = sqlite3.connect(
            '/mnt/processed/private/msds2023/lt5/dmw2-project/amazon.db'
        )
        try:
            df = pd.read_sql(q, conn, parse_dates=['review_date'])
        finally:
            conn.close()

        df_user_rates = (
            df
            .groupby(['customer_id'])['product_id']
            .unique()
            .apply(list)
            .to_frame()
        )
        df_user_rates['n_rate'] = df_user_rates['product_id'].apply(len)
        save_pkl(df_user_rates, 'df_user_rates')

    return df_user_rates

### RecSys360 Class Functions

In [None]:
class RecSys360():
    """Hybrid recommender: FIM rules first, then content or popularity.

    Rule-based (FIM) recommendations are produced first. Remaining slots
    are filled with content-based items for users who rated more than
    `cust_thresh` products, and with cluster-stratified popular items for
    everyone else.
    """

    def __init__(self,
                 fim_df_raw,
                 product_profiles,
                 prod_clusters,
                 cust_thresh=62,
                 prod_thresh=610,
                 supp=-5,
                 conf=60):
        """Initialize the recommender system.

        Parameters
        ----------
        fim_df_raw : pandas.DataFrame
            Rating records used to mine the FIM rules.
        product_profiles : pandas.DataFrame
            Product profiles indexed by product id.
        prod_clusters : array-like
            One cluster label per row of `product_profiles`.
        cust_thresh, prod_thresh : int
            Thresholds forwarded to `pr_prune_data`; `cust_thresh` also
            separates content-based from popularity-based users.
        supp, conf : number
            Support and confidence (percent) for `fim_get_rules`.
        """
        self.cust_thresh = cust_thresh
        self.prod_thresh = prod_thresh
        self.supp = supp
        self.conf = conf
        # Use the thresholds given to the constructor; the call used to
        # take no arguments and always fell back to the function defaults.
        self.df_prune = pr_prune_data(cust_thresh=cust_thresh,
                                      prod_thresh=prod_thresh)
        self.df_util = cb_get_util(self.df_prune)
        self.product_profiles = product_profiles
        self.df_user_rates = get_user_rates()
        self.user_profiles = cb_get_user_profiles(self.df_user_rates,
                                                  product_profiles,
                                                  cust_thresh=cust_thresh)

        self.fim_df = fim_get_rules(fim_df_raw, supp, conf)
        # Popularity is computed over the unpruned data (thresholds of 0).
        self.pop_df = pop_get_weights(pr_prune_data(cust_thresh=0,
                                                    prod_thresh=0))
        self.df_gen = pd.DataFrame(prod_clusters,
                                   columns=['labels'],
                                   index=product_profiles.index)

    def fim_recommend(self, user, L):
        """Recommend up to `L` distinct items from the FIM rules.

        The user's rated products that appear as rule antecedents are
        looked up, and their consequents are ranked by confidence. Each
        consequent is returned once, at its highest confidence; the same
        consequent used to be returned once per matching antecedent.
        """
        likes = self.df_user_rates.loc[user, 'product_id']
        w_rules = [item for item in likes if item in self.fim_df.index]
        df_reco = (
            self.fim_df
            .loc[w_rules]
            .melt()
            .set_index('consequent')
            .sort_values('value', ascending=False)
        )
        df_reco = df_reco[df_reco['value'] > 0]
        # Keep the first (highest-confidence) occurrence of each item.
        ranked = df_reco.index[~df_reco.index.duplicated(keep='first')]

        return ranked.to_list()[:L]

    def cb_recommend(self, user, M):
        """Recommend `M` unrated items closest to the user's profile.

        Candidates are the products the user has not rated in the utility
        matrix and that have a product profile. They are ranked by cosine
        distance to the user profile, with ties broken by product id.
        Products whose profile sums to zero are skipped.
        """
        user_ratings = self.df_util.loc[user]
        # Use the instance's profiles, not a notebook-level global.
        known_products = self.product_profiles.index
        unrated_idx = [
            idx for idx in user_ratings[user_ratings.isna()].index
            if idx in known_products
        ]
        user_profile = self.user_profiles.loc[user]

        distances = []
        for prod_id, row in (self.product_profiles
                             .loc[unrated_idx].iterrows()):
            if row.sum() == 0:
                continue
            distances.append((prod_id, cosine(user_profile, row)))

        recommended = sorted(distances, key=lambda x: (x[1], x[0]))[:M]

        return [id_ for id_, dist in recommended]

    def pop_recommend(self, user, M):
        """Recommend `M` popular items spread evenly over the clusters.

        Every cluster contributes `M // n_clusters` of its top-weighted
        products, and the first `M % n_clusters` clusters contribute one
        more. Cluster labels are assumed to be 0..n_clusters-1. `user` is
        not used.
        """
        n_labels = self.df_gen['labels'].nunique()
        n_reco, remainder = divmod(M, n_labels)

        recos = []
        for label in range(n_labels):
            quota = n_reco + 1 if label < remainder else n_reco
            members = [
                idx for idx
                in self.df_gen[self.df_gen['labels'] == label].index
                if idx in self.pop_df.index
            ]
            recos.extend(
                self.pop_df.loc[members]
                .sort_values(ascending=False)[:quota]
                .index
                .to_list()
            )

        return recos[:M]

    def _plot_user_wordcloud(self, user):
        """Show a word cloud of the user's content profile, if one exists."""
        if user not in self.user_profiles.index:
            # Profiles only exist for users above the rating threshold.
            print(f'No content profile for user {user}; '
                  f'word cloud skipped.')
            return

        # No stop-word list is passed: the cloud is built from precomputed
        # frequencies, so nothing here depends on notebook-level globals.
        wc = WordCloud(
            width=800, height=800,
            mask=None,
            background_color='darkslategray',
            colormap='Wistia',
            max_words=400,
            collocations=False,
            random_state=1337,
        )
        user_profile = self.user_profiles.loc[user]
        with sns.axes_style('white'):
            plt.figure(figsize=(15, 4.8*2))
            plt.imshow(wc.generate_from_frequencies(user_profile),
                       interpolation='spline16')
            # f-string: the title used to show the literal '{user}'.
            plt.title(f'User: {user}')
            plt.axis('off')
            plt.show()

    def _fetch_titles(self, recos):
        """Return the titles of `recos`, in recommendation order."""
        unique_ids = list(dict.fromkeys(recos))
        # Placeholders keep the statement valid for zero or one id.
        placeholders = ', '.join('?' * len(unique_ids))
        query = f'''
            SELECT product_id, product_title
            FROM products
            WHERE product_id IN ({placeholders})
        '''
        conn = sqlite3.connect(
            '/mnt/processed/private/msds2023/lt5/dmw2-project/amazon.db'
        )
        try:
            titles_df = pd.read_sql(query, conn, params=unique_ids)
        finally:
            conn.close()

        title_by_id = dict(zip(titles_df['product_id'],
                               titles_df['product_title']))
        return [title_by_id[pid] for pid in unique_ids
                if pid in title_by_id]

    def recommend(self, user, L=5, return_titles=True, print_dist=False,
                  print_wc=False):
        """Recommend `L` items with the hybrid strategy.

        Parameters
        ----------
        user : hashable
            Customer id present in `df_user_rates`.
        L : int, default 5
            Number of recommendations.
        return_titles : bool, default True
            Return product titles (looked up in the database) instead of
            product ids. Titles follow the recommendation order.
        print_dist : bool, default False
            Print how many items each recommender contributed.
        print_wc : bool, default False
            Show a word cloud of the user's content profile. This now
            also happens when `return_titles` is true; the early return
            used to skip it.

        Returns
        -------
        list
            Product titles or product ids.
        """
        recos = self.fim_recommend(user, L=L)

        n_rate = self.df_user_rates.loc[user, 'n_rate']
        M = L - len(recos)
        if n_rate > self.cust_thresh and M > 0:
            recos.extend(self.cb_recommend(user, M))
            if print_dist:
                print(f'Recommended Items:\n'
                      f'{L-M} FIM-based items, {M} CB-based items')

        elif n_rate <= self.cust_thresh and M > 0:
            recos.extend(self.pop_recommend(user, M))
            if print_dist:
                print(f'Recommended Items:\n'
                      f'{L-M} FIM-based items, {M} Popularity-based items')

        elif M <= L and print_dist:
            print(f'Recommended Items:\n'
                  f'{L-M} FIM-based items')

        # Plot before returning so the title branch cannot skip it.
        if print_wc:
            self._plot_user_wordcloud(user)

        if return_titles:
            return self._fetch_titles(recos)

        return recos

# Methodology

## Data Collection and Database Creation

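The raw Amazon review files for the Home, Home Improvement, Furniture, and Major Appliances categories are read from the shared data directory and loaded into a SQLite database. `db_create` builds the `products` and `ratings` tables; the step that builds the `reviews` table is defined in `db_create_reviews` but is currently commented out inside `db_create`.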

In [None]:
# # Get chosen categories' filepaths
# f_paths = db_get_files()
# # Run Database Creation Function
# db_create(f_paths)

## Data Preprocessing

In [None]:
# Get all available products: titles indexed by product_id, read from the
# 'product_titles' pickle or, on a cache miss, from the database.
product_titles = get_products()

In [None]:
# Extra, domain-specific stop words added on top of the NLTK and wordcloud
# lists. Repeated entries are harmless: the combined list is de-duplicated
# with set() below.
drop_words = ['let', 'one', 'two', 'three', 'four', 'five', 'star', 'product',
              'year', 'inch', 'warranty', 'x', 'recommend', 'well', 'great',
              'size', 'expected', 'best', 'excellent', 'inch', 'pack', 'px',
              'pk', 'ct', 'quality', 'recommend', 'item', 'wonderful', 'size',
              'must', 'ordered', 'order', 'wanted', 'want', 'super', 'star',
              'purchase', 'useful', 'arrived', 'arrive', 'awesome', 'super',
              'fantastic', 'quite', 'definitely', 'worked', 'pleased', 'sure',
              'know', 'every', 'second', 'purchased', 'lb', 'pound', 'inch',
              'cleaner', 'ounce', 'oz', 'pack', 'product', 'quality']

# TF-IDF feature columns that are removed from the matrix before clustering
# (passed to cl_cluster_range as `drop_list`).
drop_list = ['white', 'black', 'blue', 'red', 'home', 'set', 'piece', 'color']

# Instantiate the stop words: union of the NLTK English list, the wordcloud
# STOPWORDS set and the custom drop words.
stop_words = list(set(
    stopwords.words('english') +
    list(STOPWORDS) +
    drop_words
))

# Define params for tf-idf vectorization: unigrams of at least two
# lowercase letters, with document-frequency bounds set by min_df / max_df.
new_params = dict(
    ngram_range=(1, 1),
    token_pattern=r'[a-z]{2,}',
    stop_words=stop_words,
    max_df=0.10,
    min_df=0.005,
    max_features=None
)

# Get product profiles (lemmatized titles -> TF-IDF matrix). Both stages
# are cached in pickles by cl_get_features.
product_profiles = cl_get_features(product_titles, new_params)

## Clustering

In [None]:
# Get cluster labels for each k in range: a dict mapping k to the KMeans
# label array (k = 2..10 with the default n_clusters=9), computed after
# dropping the `drop_list` columns.
cluster_dict = cl_cluster_range(product_profiles, drop_list)

In [None]:
# # Plot all wordclouds for each k-clustering
# wc = WordCloud(
#     font_path='AmazonEmberCdRC_Rg.ttf',
#     width=800, height=800,
#     background_color='white',
#     colormap='CMRmap_r',
#     stopwords=stop_words,
#     random_state=1337,
# )
# cl_plot_all(cluster_dict, product_profiles, wc)

In [None]:
# # Label clusters for n_clusters = 8
# n = 8
# cluster_names = ['Christmas Decorations',
#                  'Wood Furnitures',
#                  'Light Fixtures',
#                  'Bed Essentials',
#                  'Movie Posters',
#                  'Wall Decorations', 
#                  'Bed Features',
#                  'Kitchen Fixtures']
# cl_plot_cluster(cluster_dict,
#                 product_profiles.drop(drop_list, axis=1),
#                 n,
#                 wc,
#                 cluster_names=cluster_names)

## FIM Hyperparameter Tuning

We first try to reduce the number of transactions by defining a transaction as the products a `customer_id` reviewed within one resampled `review_date` period.

In [None]:
# Load the joined ratings / review-date records used to build transactions.
fim_df_raw = fim_get_amazon()
# Optional tuning step (disabled): transaction statistics per resampling
# period on the first 10,000 reviews, followed by their plots.
# fim_df_res = fim_tune_period(fim_df_raw, n_reviews=10_000)
# fim_plot_tuning(fim_df_res)

Resampling the `review_date`s has no significant effect; hence, we can simply treat each unique `customer_id` as one transaction of liked products.

## Recommender System

Initialize the RecSys360 system with the following necessary databases:

1. FIM Algorithm
    * Transaction Database
    * FIM Rules with hyperparameters: `supp` & `conf`


2. Content-based Algorithm
    * Item Profiles
    * User Profiles


3. Popularity-based Algorithm
    * Weighted product/item ratings

In [None]:
# Build the hybrid recommender with the default thresholds, support and
# confidence, using the k=8 KMeans labels as product clusters.
recsys360 = RecSys360(fim_df_raw, product_profiles, cluster_dict[8])

### FIM

In [None]:
# Example for the FIM section: default L=5 recommendations returned as
# titles, printing the recommender mix and requesting the word cloud.
recsys360.recommend(52228204, print_dist=True, print_wc=True)

### FIM-CB

In [None]:
# Example for the FIM-CB section: default L=5 recommendations returned as
# titles, printing how many items each recommender contributed.
recsys360.recommend(10804961, print_dist=True)

### FIM-Popularity

In [None]:
# Example for the FIM-Popularity section: default L=5 recommendations
# returned as titles, printing how many items each recommender contributed.
recsys360.recommend(10032, print_dist=True)