## FP Growth - Algorithm

### Imports & Loading preprocessed files

In [81]:
# import files
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pickle


In [4]:
# Load the preprocessed book metadata (path is relative to the notebook's
# working directory).
books_df = pd.read_csv('preprocessed_data/books.csv')
 
# Load the preprocessed user reviews; fpgrowth_algorithm() reads the
# 'user_id', 'title' and 'score' columns from this frame.
reviews_df = pd.read_csv('preprocessed_data/reviews.csv')


### Calculating FP Growth Association

In [62]:
def calc_review_counts(df):
    """
    Attach a per-book review tally to every row of the given DataFrame.

    The column is written onto ``df`` itself (the input is modified in
    place) and the same object is handed back for convenience.

    Args:
        df (pd.DataFrame): Review rows; must contain a 'title' column.

    Returns:
        pd.DataFrame: ``df`` with an extra 'count' column holding, for each
                      row, the number of rows that share its title.
    """
    # Group the title column by itself so each group is one book's reviews.
    reviews_by_title = df.groupby('title')['title']

    # transform() broadcasts each group's size back onto its member rows,
    # so the result is aligned with df's index.
    per_row_totals = reviews_by_title.transform('count')

    df['count'] = per_row_totals
    return df


def fpgrowth_algorithm(reviews_df, *, min_reviews=75, min_support=0.0005, min_lift=1.3):

    """
    Generate book association rules using the FP-Growth algorithm.

    A review with ``score >= 4.0`` is treated as the user having "read"
    (liked) the book. Each user becomes one transaction made of the books
    they liked, and FP-Growth mines itemsets of books that are liked together.

    Args:
        reviews_df (pd.DataFrame):
            A DataFrame containing at minimum:
                - user_id: identifier for users
                - title: book title
                - score: rating given by the user
        min_reviews (int, optional):
            A book is kept only if it has strictly more than this many
            reviews (of any score) among the retained users. Defaults to 75.
        min_support (float, optional):
            Minimum support passed to ``fpgrowth``: the fraction of user
            transactions an itemset must appear in. Defaults to 0.0005.
        min_lift (float, optional):
            Minimum lift a rule must reach to be returned. Defaults to 1.3.

    Returns:
        pd.DataFrame:
            A DataFrame of association rules, containing:
                - antecedents: frozenset of book title(s) on the "if" side
                - consequents: frozenset of recommended book title(s)
                - support: frequency of itemset
                - confidence: likelihood of co-occurrence
                - lift: strength of association
                (and other metrics generated by association_rules)
    """

    df = reviews_df[['user_id', 'title', 'score']].copy()

    # Mark as read: 1 when the user rated the book 4.0 or higher, else 0
    df['read'] = (df['score'] >= 4.0).astype(int)

    # Keep only users with at least two liked books; a single-book
    # transaction cannot contribute to any co-occurrence.
    liked_per_user = df[df['read'] == 1].groupby('user_id').size()
    active_users = liked_per_user[liked_per_user >= 2].index

    # .copy() so that the 'count' column added next is written to an
    # independent frame rather than to a slice of the previous one
    # (avoids pandas' SettingWithCopyWarning).
    df = df[df['user_id'].isin(active_users)].copy()

    # Add review counts
    df = calc_review_counts(df)

    # Filter books with more than `min_reviews` reviews
    df = df[df['count'] > min_reviews]

    # Build transactions: one row per user, one column per title.
    # Duplicate (user, title) reviews are aggregated by pivot_table; any
    # non-zero aggregate becomes True in the boolean cast below.
    pivot = df.pivot_table(index='user_id', columns='title', values='read', fill_value=0)

    # FP-growth; use_colnames=True keeps book titles in the itemsets
    frequent_itemsets = fpgrowth(pivot.astype(bool), min_support=min_support, use_colnames=True)

    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)

    # # filter 1→1 rules
    # rules = rules[
    #     rules['antecedents'].apply(lambda x: len(x) == 1) &
    #     rules['consequents'].apply(lambda x: len(x) == 1)
    # ]

    return rules


In [63]:
# Mine the association rules from the full set of loaded reviews.
rules = fpgrowth_algorithm(reviews_df)
# Two-column view holding only the itemset sides of each rule (the metric
# columns are left out).
books_associations = rules[['antecedents', 'consequents']]
#books_associations = books_associations.rename(columns={'antecedents': 'title', 'consequents': 'titles'})

### Saving rules using pickle

In [82]:
# Save rules to a file (written to the current working directory) so they
# can be reloaded later without re-running FP-Growth.
with open('rules.pkl', 'wb') as f:
    pickle.dump(rules, f)


### Loading pickle and recommending books

In [83]:
# Later, load them back.
# NOTE: unpickling can execute arbitrary code, so only load a rules.pkl
# that was produced by this notebook (or another trusted source).
with open('rules.pkl', 'rb') as f:
    rules = pickle.load(f)


In [84]:
def recommend_books(rules, book_name, rating):
    """
    Recommend books based on association rules.

    Every rule whose antecedent set contains ``book_name`` contributes its
    consequents. The result is deterministic: titles from the strongest
    rules (highest 'lift') come first when ``rules`` has a 'lift' column;
    otherwise rules are used in their existing row order. Titles inside a
    single consequent set are taken in alphabetical order.

    Args:
        rules (pd.DataFrame): DataFrame of association rules (from
            fpgrowth_algorithm). Must have 'antecedents' and 'consequents'
            columns holding collections of titles; 'lift' is optional.
        book_name (str): Book the user has read/rated. Matching is exact
            (case-sensitive) against the titles stored in the rules.
        rating (float): User rating for the book

    Returns:
        list: List of unique recommended book names (from consequents),
              never containing ``book_name`` itself. Empty when the rating
              is below 4 or no rule matches.
    """

    # Only recommend if rating >= 4
    if rating < 4:
        return []

    # Use lift as the ranking key when available; a constant keeps the
    # original row order otherwise (the sort below is stable).
    if 'lift' in rules.columns:
        strengths = rules['lift']
    else:
        strengths = [0.0] * len(rules)

    # Collect (strength, consequents) for rules triggered by the given book.
    # Plain iteration is used instead of a boolean mask so that an empty
    # rules frame simply yields no matches.
    matches = [
        (strength, consequents)
        for antecedents, consequents, strength
        in zip(rules['antecedents'], rules['consequents'], strengths)
        if book_name in antecedents
    ]
    matches.sort(key=lambda match: match[0], reverse=True)

    # A dict preserves insertion order and de-duplicates titles, keeping
    # each title at the position of the strongest rule that suggested it.
    recommended_books = {}
    for _, consequents in matches:
        # Sets have no defined iteration order; sort for reproducibility.
        for title in sorted(consequents):
            # Skip the original book if it's somehow included
            if title != book_name:
                recommended_books.setdefault(title, None)

    return list(recommended_books)


In [92]:
# Example call: a rating of 4 meets the threshold checked in recommend_books,
# and the title must match the stored title exactly (membership test).
recommend_books(rules,'THE CATCHER IN THE RYE', 4)

['To Kill A Mockingbird',
 'The Old Man and the Sea',
 "The grapes of wrath (The Collector's library of the world's best-loved books)",
 'Animal Farm (Signet Classics, CT304)',
 'Night',
 'Fahrenheit 451',
 '1984',
 'Lord of the flies',
 'Fiesta [The Sun Also Rises].',
 'Of Mice and Men (Penguin Audiobooks)',
 "Slaughterhouse-Five : Or the Children's Crusade",
 "Harry Potter and The Sorcerer's Stone",
 'The Great Gatsby']

### Inspecting the rules for a single book (exploratory)

In [93]:
# Select every rule whose antecedent set contains the given title ...
i = rules[rules['antecedents'].apply(lambda x: "THE CATCHER IN THE RYE" in x)]
# ... and display only the two itemset columns of those rules.
i[['antecedents','consequents']]

Unnamed: 0,antecedents,consequents
7,(THE CATCHER IN THE RYE),(Night)
39,(THE CATCHER IN THE RYE),(The Great Gatsby)
45,(THE CATCHER IN THE RYE),(Fahrenheit 451)
51,(THE CATCHER IN THE RYE),(The Old Man and the Sea)
148,(THE CATCHER IN THE RYE),(Of Mice and Men (Penguin Audiobooks))
163,(THE CATCHER IN THE RYE),(The grapes of wrath (The Collector's library ...
171,(THE CATCHER IN THE RYE),(Harry Potter and The Sorcerer's Stone)
173,(THE CATCHER IN THE RYE),(To Kill A Mockingbird)
175,(THE CATCHER IN THE RYE),(1984)
237,(THE CATCHER IN THE RYE),"(Animal Farm (Signet Classics, CT304))"


### Code explanation

def calc_review_counts(df):

    # Groups df by title and counts how many rows (reviews) each title has.
    # transform('count') broadcasts that per-title total back onto every row
    # of the same book, so the result lines up with df's own index.
    # The 'count' column is added to the DataFrame that was passed in, and
    # that same DataFrame is returned.
    df['count'] = df.groupby('title')['title'].transform('count')
    return df


def fpgrowth_algorithm(reviews_df):

    """
    Generate book association rules using the FP-Growth algorithm.

    Args:
        reviews_df (pd.DataFrame):
            A DataFrame containing at minimum:
                - user_id: identifier for users
                - title: book title
                - score: rating given by the user

    Returns:
        pd.DataFrame:
            A DataFrame of association rules, containing:
                - antecedents: the input book(s)
                - consequents: recommended book(s)
                - support: frequency of itemset
                - confidence: likelihood of co-occurrence
                - lift: strength of association
                (and other metrics generated by association_rules)
    """

    # Keep only the relevant columns (on a copy, so reviews_df is untouched)
    df = reviews_df[['user_id', 'title', 'score']].copy()

    # Mark as read: 1 -> score of 4.0 or higher (liked it); 0 -> otherwise
    df['read'] = (df['score'] >= 4.0).astype(int)
    # This converts ratings into binary transactions for FP-Growth.

    # Keep users with at least 2 'read' books: count the read == 1 rows per
    # user, take the user_ids whose count is >= 2, and keep all rows
    # (including read == 0 rows) belonging to those users.
    df = df[df['user_id'].isin(
    df[df['read'] == 1].groupby('user_id').size()
        .pipe(lambda s: s[s>=2].index)
    )]


    # Add review counts (number of remaining rows per title, any score)
    df = calc_review_counts(df)

    # Filter books with >75 reviews
    df = df[df['count'] > 75]

    # Build transactions: group by user → set of books they "read"
    # rows are the user_id; columns are the titles; values are the read flag.
    # (user, title) pairs with no review are filled with 0.

    pivot = df.pivot_table(index='user_id', columns='title', values='read', fill_value=0)

    # FP-growth
    # astype(bool) converts the 1/0 values to True/False
    # min_support = minimum fraction of users (rows of the pivot) whose
    #               transactions must contain an itemset for it to be kept
    # use_colnames=True makes the itemsets contain the pivot's column names
    # (the book titles) instead of column indices
    frequent_itemsets = fpgrowth(pivot.astype(bool), min_support=0.0005, use_colnames=True)

    # Converts frequent itemsets into rules, keeping rules with lift >= 1.5
    # NOTE(review): the executed definition earlier in this notebook uses
    # min_threshold=1.3 — confirm which value is intended.
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)


    return rules
