In [1]:
import polars as pl

In [2]:
train_data = pl.read_parquet('data/otto-reduced/train.parquet')
# test_data = pl.read_parquet('otto-reduced/test.parquet')
print(train_data)

shape: (3_340_193, 2)
┌─────────┬───────────────────────────────────┐
│ session ┆ events                            │
│ ---     ┆ ---                               │
│ i64     ┆ list[struct[3]]                   │
╞═════════╪═══════════════════════════════════╡
│ 1185618 ┆ [{1020908,1659386928785,"clicks"… │
│ 7575896 ┆ [{1440931,1660340322363,"clicks"… │
│ 4293917 ┆ [{1679269,1659737781324,"clicks"… │
│ 462188  ┆ [{69116,1659351401895,"clicks"},… │
│ 1028060 ┆ [{1540638,1659380043176,"clicks"… │
│ …       ┆ …                                 │
│ 4365092 ┆ [{1361831,1659769672889,"clicks"… │
│ 2210007 ┆ [{1508734,1659498037473,"clicks"… │
│ 7272969 ┆ [{636101,1660301315572,"clicks"}… │
│ 7569078 ┆ [{320776,1660338958474,"clicks"}… │
│ 7712086 ┆ [{801774,1660384723701,"clicks"}… │
└─────────┴───────────────────────────────────┘


In [3]:
pl.Config.set_fmt_str_lengths(1000)
# print(test_data)


polars.config.Config

In [4]:
from polars.datatypes import Struct

class Recommender:
    def fit(self, data: pl.DataFrame) -> None:
        # fit the model to the training data
        pass
        
    def recommend(self, events: list[Struct]) -> list[int]:
        # return a list of k item ids
        pass

In [5]:
class BaselineRecommender(Recommender):
    def __init__(self):
        self.top_k = None
        
    def fit(self, data):
        data = data.explode( "events" ).unnest( "events" )
        print(data)
        self.top_k = data.group_by("aid").len().sort("len", descending=True).head(20)["aid"].to_list()
        
    def recommend(self, events):
        return self.top_k


bs_model = BaselineRecommender()
bs_model.fit(train_data)
print(bs_model.recommend([1, 2, 3]))

shape: (68_667_257, 4)
┌─────────┬─────────┬───────────────┬────────┐
│ session ┆ aid     ┆ ts            ┆ type   │
│ ---     ┆ ---     ┆ ---           ┆ ---    │
│ i64     ┆ i64     ┆ i64           ┆ str    │
╞═════════╪═════════╪═══════════════╪════════╡
│ 1185618 ┆ 1020908 ┆ 1659386928785 ┆ clicks │
│ 1185618 ┆ 1356523 ┆ 1659386944620 ┆ clicks │
│ 1185618 ┆ 130973  ┆ 1659387015519 ┆ clicks │
│ 1185618 ┆ 1392955 ┆ 1659387070030 ┆ clicks │
│ 1185618 ┆ 199292  ┆ 1659387078709 ┆ clicks │
│ …       ┆ …       ┆ …             ┆ …      │
│ 7712086 ┆ 1464627 ┆ 1660386545258 ┆ clicks │
│ 7712086 ┆ 1145198 ┆ 1660386575785 ┆ clicks │
│ 7712086 ┆ 1734182 ┆ 1660386592436 ┆ clicks │
│ 7712086 ┆ 627378  ┆ 1660386603044 ┆ clicks │
│ 7712086 ┆ null    ┆ 1661330073456 ┆ end    │
└─────────┴─────────┴───────────────┴────────┘
[None, 1460571, 29735, 108125, 231487, 1733943, 1502122, 756588, 832192, 986164, 670006, 1586171, 322370, 409620, 166037, 620545, 77440, 554660, 801774, 184976]


In [6]:
def evaluation(model, test_data):
    # evaluate the model on the test data
    right_predictions = 0
    i = 0
    print(len(test_data))
    for sequence in test_data.iter_rows():
        if i % 100000 == 0:
            print(i)
        recommendations = model.recommend(sequence[2])
        label = sequence[1]
        if label in recommendations:
            right_predictions += 1
        i += 1
        
    print(f"Accuracy: {right_predictions / len(test_data) * 100} %")

# evaluation(bs_model, test_data)

In [7]:
# smaller_train_data = train_data.head(1000)

In [8]:
from fastai.collab import CollabDataLoaders, collab_learner

class FastAIRecommender(Recommender):
    def __init__(self):
        print("init")
        recommender = None

    def fit(self, data: pl.DataFrame) -> None:
        data = data.explode( "events" ).unnest( "events" )
        print("exploded")
        data = data.to_pandas()
        print("to pandas")
        dls = CollabDataLoaders.from_df(data, bs=64)
        print("dls")
        learn = collab_learner(dls, y_range=(0, 1))
        print("learner")
        print(learn.fit_one_cycle(1))
    
    def get_next_click(self, session_history):
        """
        Predicts the next product click for a given session history.
        """
        user_embedding = model.user_factors[sessions.index(session_history)]
        scores = model.item_factors @ user_embedding.T
        _, prediction_idx = scores.topk(1)  # Get index of most probable item
        return prediction_idx.item()  



In [9]:
# FastAIRecommender().fit(smaller_train_data)

init


NameError: name 'smaller_train_data' is not defined

In [10]:
def transform_train_data_to_session_sequences(train_data):
    output = []
    print("Train len: ", len(train_data))
    i = 0
    for row in train_data.iter_rows():
        if i % 100000 == 0:
            print(i)
        row_output = []
        events = row[1]
        for event in events:
            row_output.append(event["aid"])
        output.append(row_output)
        i += 1
    print("Output len: ", len(output))
    return output

sessions = transform_train_data_to_session_sequences(train_data[:500000])

Train len:  500000
0
100000
200000
300000
400000
Output len:  500000


In [19]:
smaller_train = train_data.head(1000)
data = smaller_train.explode( "events" ).unnest( "events" )
data = data.to_pandas()
data = data.drop(columns=["ts", "type"])
data["rating"] = 1
dls = CollabDataLoaders.from_df(data, user_name='session', item_name='aid', bs=64, shuffle=False)
dls.show_batch()

learn = collab_learner(dls, n_factors=10)
learn.fit_one_cycle(5)



Unnamed: 0,session,aid,rating
0,2296652,956575.0,1
1,8167590,1672727.0,1
2,7498558,516731.0,1
3,3358924,1849337.0,1
4,7542339,577962.0,1
5,5906856,1131757.0,1
6,3251307,265336.0,1
7,5065716,460553.0,1
8,4244107,1579175.0,1
9,3707990,#na#,1


epoch,train_loss,valid_loss,time
0,0.988495,0.97962,00:01
1,0.922574,0.842664,00:01
2,0.777482,0.607599,00:01
3,0.588488,0.37951,00:01
4,0.414778,0.23348,00:01
5,0.285737,0.161146,00:01
6,0.198007,0.118486,00:01
7,0.139997,0.089901,00:01
8,0.102577,0.071675,00:01
9,0.078905,0.060241,00:01


In [13]:
print(learn.model)
print(learn)
print(type(learn))

EmbeddingDotBias(
  (u_weight): Embedding(1001, 10)
  (i_weight): Embedding(9339, 10)
  (u_bias): Embedding(1001, 1)
  (i_bias): Embedding(9339, 1)
)
<fastai.learner.Learner object at 0x7ddbf39d95d0>
<class 'fastai.learner.Learner'>
