In [43]:
import pandas as pd

# Load the ratings dataset
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

# Filter ratings above 2
filtered_ratings_df = ratings_df[ratings_df['rating'] > 2]

# Group by 'userId' and count the number of movies rated by each user
user_movie_counts = filtered_ratings_df.groupby('userId')['movieId'].count().reset_index()
user_movie_counts.columns = ['userId', 'num_movies_rated']

# Filter users who have rated more than 10 movies
active_users = user_movie_counts[user_movie_counts['num_movies_rated'] > 10]

# Filter the transactional data based on active users
transactional_data = filtered_ratings_df[filtered_ratings_df['userId'].isin(active_users['userId'])]

# Group by 'userId' and aggregate movies
transactional_data = transactional_data.groupby('userId')['movieId'].apply(list).reset_index()
transactional_data.columns = ['userId', 'movies_rated_above_2']

     userId                               movies_rated_above_2
0         1  [1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,...
1         2  [318, 333, 1704, 3578, 6874, 8798, 46970, 4851...
2         3  [849, 1275, 1371, 1587, 2288, 2851, 3024, 3703...
3         4  [21, 45, 52, 58, 106, 125, 162, 171, 176, 215,...
4         5  [1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232...
..      ...                                                ...
602     606  [1, 7, 11, 15, 17, 18, 28, 29, 32, 36, 46, 47,...
603     607  [1, 11, 25, 34, 36, 86, 110, 150, 153, 165, 18...
604     608  [1, 10, 16, 21, 31, 32, 34, 39, 47, 50, 70, 88...
605     609  [1, 10, 110, 116, 137, 150, 161, 185, 208, 231...
606     610  [1, 6, 16, 32, 47, 50, 70, 95, 110, 111, 112, ...

[607 rows x 2 columns]


In [44]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Split data into training and test sets
train_data = []
test_data = []

for _, row in transactional_data.iterrows():
    user_id = row['userId']
    movies_rated_above_2 = row['movies_rated_above_2']
    
    train_movies, test_movies = train_test_split(movies_rated_above_2, test_size=0.2, random_state=42)
    
    train_data.append({'userId': user_id, 'movies_rated_above_2': train_movies})
    test_data.append({'userId': user_id, 'test_movies': test_movies})

# Create DataFrames for training and test data after the loop
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

Training Data:
     userId                               movies_rated_above_2
0         1  [1136, 1031, 2054, 2090, 3450, 1270, 1408, 151...
1         2  [86345, 106782, 71535, 77455, 80489, 333, 6874...
2         3  [1587, 6835, 26409, 7991, 5764, 1371, 5181, 70...
3         4  [4273, 52, 2174, 3083, 1265, 3317, 937, 593, 3...
4         5  [608, 58, 474, 153, 261, 588, 592, 150, 300, 4...
..      ...                                                ...
602     606  [2181, 3640, 8964, 31309, 30707, 632, 4310, 30...
603     607  [2571, 3020, 3476, 724, 2995, 2114, 1291, 3109...
604     608  [1831, 4874, 7445, 383, 1193, 3082, 196, 1265,...
605     609  [592, 339, 208, 329, 292, 892, 231, 454, 1, 83...
606     610  [71341, 1240, 76077, 116799, 122900, 6063, 107...

[607 rows x 2 columns]

Test Data:
     userId                                        test_movies
0         1  [157, 2947, 2028, 3441, 2387, 3639, 2916, 1278...
1         2          [60756, 115713, 58559, 99114, 318, 74458]
2   

In [48]:
# import pandas as pd

# dataset = {
#     "userId": ["1", "2", "3", "4", "5"],
#     "movies_rated_above_2": [[5, 11, 13, 14, 15, 25],
#                [4, 5, 11, 14, 15, 25],
#                [1, 5, 11, 13],
#                [3, 11, 13, 21, 25],
#                [3, 5, 9, 11, 15]]
# }

# train_df = pd.DataFrame(dataset)

In [50]:
# Create an empty dictionary to store movie support
movie_support = {}

# Iterate through each user's movies in the training set
for _, row in train_df.iterrows():
    train_movies = row['movies_rated_above_2']
    
    # Count the occurrences of each movie and update support
    for movie in train_movies:
        if movie in movie_support:
            movie_support[movie] += 1
        else:
            movie_support[movie] = 1

# Convert the movie support dictionary to a DataFrame
support_df = pd.DataFrame(list(movie_support.items()), columns=['movieId', 'support'])

# Sort the DataFrame by support in descending order
support_df = support_df.sort_values(by='support', ascending=False)

Movie Support:


Unnamed: 0,movieId,support
344,318,250
172,356,238
603,296,234
116,593,215
58,2571,215
...,...,...
5590,6705,1
5591,55729,1
5599,52867,1
5600,5401,1


In [51]:
# Define the minimum support threshold
minsup = 8

# Filter the DataFrame to keep only movies with support >= minsup
trimmed_support_df = support_df[support_df['support'] >= minsup]

Trimmed Support DataFrame (minsup=15):


Unnamed: 0,movieId,support
344,318,250
172,356,238
603,296,234
116,593,215
58,2571,215
...,...,...
2963,4388,15
2050,69,15
1166,933,15
45,1224,15


In [52]:

# Filter movies in train_df based on minsup
train_df['movies_rated_above_2'] = train_df['movies_rated_above_2'].apply(
    lambda movies: [movie for movie in movies if movie in trimmed_support_df['movieId'].values]
)

# Function to get the support count of a movie
def get_support_count(movie):
    return trimmed_support_df[trimmed_support_df['movieId'] == movie]['support'].values[0]

# Sort movies by support count in descending order
train_df['movies_rated_above_2'] = train_df['movies_rated_above_2'].apply(
    lambda movies: sorted(movies, key=get_support_count, reverse=True)
)


Modified Training Data (sorted by support count):
     userId                               movies_rated_above_2
0         1  [356, 2571, 593, 480, 110, 2959, 527, 2858, 11...
1         2  [1704, 3578, 79132, 6874, 48516, 68157, 109487...
2         3                           [2288, 3703, 1371, 1587]
3         4  [593, 1196, 2858, 1198, 588, 457, 2762, 608, 1...
4         5  [318, 110, 527, 150, 589, 50, 588, 592, 457, 6...
..      ...                                                ...
602     606  [296, 2571, 593, 260, 480, 110, 1196, 2858, 11...
603     607  [2571, 593, 260, 110, 527, 1196, 150, 2858, 58...
604     608  [318, 356, 296, 593, 2571, 260, 110, 2959, 119...
605     609  [318, 480, 110, 150, 589, 592, 457, 1, 590, 10...
606     610  [318, 356, 296, 593, 2571, 260, 480, 110, 2959...

[607 rows x 2 columns]


In [53]:
# Define a function to get the support count of a movie from the trimmed_support_df
def get_support(movie_id):
    support_row = trimmed_support_df[trimmed_support_df['movieId'] == movie_id]
    if not support_row.empty:
        return support_row['support'].values[0]
    else:
        return 0  # Default support count for movies not found in trimmed_support_df

# Sort the movies_rated_above_2 column in train_df based on trimmed_support count
train_df['movies_rated_above_2'] = train_df['movies_rated_above_2'].apply(
    lambda movies: sorted(movies, key=get_support, reverse=True)
)

Modified Training Data (sorted by trimmed_support count):


Unnamed: 0,userId,movies_rated_above_2
0,1,"[356, 2571, 593, 480, 110, 2959, 527, 2858, 11..."
1,2,"[1704, 3578, 79132, 6874, 48516, 68157, 109487..."
2,3,"[2288, 3703, 1371, 1587]"
3,4,"[593, 1196, 2858, 1198, 588, 457, 2762, 608, 1..."
4,5,"[318, 110, 527, 150, 589, 50, 588, 592, 457, 6..."
...,...,...
602,606,"[296, 2571, 593, 260, 480, 110, 1196, 2858, 11..."
603,607,"[2571, 593, 260, 110, 527, 1196, 150, 2858, 58..."
604,608,"[318, 356, 296, 593, 2571, 260, 110, 2959, 119..."
605,609,"[318, 480, 110, 150, 589, 592, 457, 1, 590, 10..."


In [54]:
class TrieNode:
    def __init__(self):
        self.char = -1
        self.count = 0
        self.children = []

root = TrieNode()



In [55]:
def insert(word):
    node = root
    for char in word:
        found = False
        for child in node.children:
            if child.char == char:
                node = child
                node.count += 1
                found = True
                break

        if not found:
            new_node = TrieNode()
            new_node.char = char
            new_node.count = 1
            node.children.append(new_node)
            node = new_node

In [56]:
for _, row in train_df.iterrows():
    transaction = row['movies_rated_above_2']
    insert(transaction)

In [58]:
condPatBase = {}
def dfs(node, pattern, item, patBase):
    if node.char == item:
        patBase[tuple(pattern.copy())] = node.count
        return

    for child in node.children:
        new_pattern = pattern.copy()
        if node.char != -1:
            new_pattern.append(node.char)
        dfs(child, new_pattern, item, patBase)

In [59]:
for _, trimmed_row in trimmed_support_df.iterrows():
    pattern = []
    patBase = {}
    movie_id = trimmed_row['movieId']  # Use the correct column name 'movieId'
    dfs(root, pattern, int(movie_id), patBase)
    condPatBase[int(movie_id)] = patBase.copy()


In [61]:
conditional_fp_trees = []

for movie_id, patterns in condPatBase.items():
    count = 0
    intersection_list = []
    for idx, (pattern, support) in enumerate(patterns.items()):
        count += support
        if idx == 0:
            intersection_list = list(pattern)
        else:
            list2 = intersection_list.copy()
            intersection_list = [item for item in pattern if item in list2]
    if len(intersection_list) != 0:
        conditional_fp_trees.append(
            {movie_id : {
                tuple(intersection_list) : count
            }}
        )

[{41997: {(4226,): 16}}, {82461: {(7153,): 15}}]


In [63]:
import itertools
def findsubsets(s, n):
    return list(itertools.combinations(s, n))
 
frequent_pattern_itemsets = []

for cond_fp_tree in conditional_fp_trees:
    for movie_id, patterns in cond_fp_tree.items():
        for pattern, count in patterns.items():
            # print(pattern, count)
            subset = []
            l = len(pattern)
            for n in range(1, l+1):
                subset.extend(list(findsubsets(pattern, n)))
            # print(movie_id)
            subset = [{tuple(list(item) + [movie_id]): count} for item in subset]
            frequent_pattern_itemsets.append(
                {
                    movie_id : tuple(subset)
                }
            )

print(frequent_pattern_itemsets)

[{41997: ({(4226, 41997): 16},)}, {82461: ({(7153, 82461): 15},)}]
