Create the data frame

In [31]:
import sys
# 'sys.path' is a list of absolute path strings
sys.path.append('/Users/garethmiskimmin/streamline-app/api')
import user_movie_interactions.user_movie_interaction_service as user_movie_interaction_service
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

def __get_synthetic_rating(row):
    """Map one interaction row to a single numeric rating.

    Explicit ``RATING`` rows use their own ``rating`` value cast to float;
    ``LIKE`` and ``WATCHED`` rows get fixed stand-in ratings. Any other
    interaction type yields ``None``.
    """
    interaction = row["interaction_type"]
    if interaction == "RATING":
        return float(row["rating"])

    # Implicit feedback has no rating of its own, so a fixed value is used.
    for implicit_type, stand_in in (("LIKE", 8.5), ("WATCHED", 6.5)):
        if interaction == implicit_type:
            return stand_in
    return None

# External interactions are read from a CSV file; internal ones come from the
# API's interaction service.
df_external = pd.read_csv("../data/external_interactions_test.csv")
df_internal = pd.DataFrame(
    user_movie_interaction_service.get_all_user_interactions()
)

df_all = pd.concat([df_internal, df_external], ignore_index=True)

# Cast both id columns to str so ids from the two sources share one dtype.
df_all['user_id'] = df_all['user_id'].astype(str)
df_all['movie_id'] = df_all['movie_id'].astype(str)


Build the user–movie ratings matrix and predict ratings with truncated SVD

In [None]:
# Collapse every interaction into one numeric "rating"; interaction types with
# no synthetic rating come back as None and are dropped.
df_all["rating"] = df_all.apply(__get_synthetic_rating, axis=1)
df_all = df_all.dropna(subset=["rating"])

# Wide user x movie matrix. pivot_table averages duplicate (user, movie) pairs
# by default; pairs with no interaction are filled with 0.
ratings_matrix = (
    df_all.pivot_table(index="user_id", columns="movie_id", values="rating")
    .astype(float)
    .fillna(0)
)

# Apply SVD. The default solver is randomized, so pin the seed to make the
# factorisation (and everything derived from it) reproducible across runs.
svd = TruncatedSVD(n_components=50, random_state=42)
user_features = svd.fit_transform(ratings_matrix)
movie_features = svd.components_

# Predict ratings (dot product of user factors and movie factors)
predicted_ratings = np.dot(user_features, movie_features)
predicted_df = pd.DataFrame(
    predicted_ratings, index=ratings_matrix.index, columns=ratings_matrix.columns
)

In [32]:
import numpy as np
from sklearn.decomposition import TruncatedSVD

# Wide user x movie matrix of ratings; pairs without a rating become 0.
ratings_by_user_movie = df_all.pivot_table(
    index="user_id",
    columns="movie_id",
    values="rating",
)
ratings_matrix = ratings_by_user_movie.astype(float).fillna(0)

In [33]:
# Apply SVD. The default solver is randomized, so pin the seed to keep the
# factors reproducible across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=42)
user_features = svd.fit_transform(ratings_matrix)
movie_features = svd.components_

In [58]:
# Predict ratings: multiplying the user factors by the movie factors rebuilds
# a dense score for every (user, movie) pair.
predicted_ratings = user_features @ movie_features
predicted_df = pd.DataFrame(
    data=predicted_ratings,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

In [99]:
# Save with partitioning

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as py_dataset
import shutil
import os

# predicted_df is indexed by user_id (taken from the pivot), so the Arrow table
# is expected to carry it as a regular column that can be partitioned on.
table = pa.Table.from_pandas(predicted_df)

# Single source of truth for the output directory: the cleanup and the write
# below must always target the same location.
path = 'parquet_predictions'

# Start from an empty directory so partitions from a previous run do not linger.
if os.path.exists(path):
    shutil.rmtree(path)

pq.write_to_dataset(
    table,
    root_path=path,
    partition_cols=["user_id"],  # one sub-directory per user
)

In [101]:
user_id = 9

# Open only this user's partition directory instead of scanning the whole
# dataset. (Alternative: open "parquet_predictions" as one dataset and pass
# filter=(py_dataset.field("user_id") == "9") to to_table().)
dataset = py_dataset.dataset(f"parquet_predictions/user_id={user_id}", format="parquet")
table: pa.Table = dataset.to_table()
new_predicted_df: pd.DataFrame = table.to_pandas()

# First row of the partition: one predicted score per movie column.
something = new_predicted_df.iloc[0]

# Movie ids of the ten highest predicted scores for this user.
top10 = something.sort_values(ascending=False).head(10).index.tolist()

# Two equivalent look-ups of the same movie's score; both are single values.
movie_score_series = something["6798244c6243f72901adb47f"]
movie_score = new_predicted_df.iloc[0]["6798244c6243f72901adb47f"]

In [80]:
# Ten best-scoring movie ids for user '9', taken from the in-memory predictions.
user_9_scores = predicted_df.loc['9']
top10 = user_9_scores.sort_values(ascending=False).head(10).index.tolist()

In [129]:
import datetime
import numpy as np
import pandas as pd
import sys
# 'sys.path' is a list of absolute path strings
sys.path.append('/Users/garethmiskimmin/streamline-app/api')
from user_movie_interactions import user_movie_interaction_service


def _fixed_weight(weight):
    """Return a scorer that ignores the row and always yields ``weight``."""
    return lambda row: weight


# Maps interaction_type -> callable(row) that produces the row's raw score.
INTERACTION_WEIGHTS = {
    "RATING": lambda row: row["rating"],  # explicit ratings keep their own value
    "LIKE": _fixed_weight(1.5),
    "WATCHED": _fixed_weight(1.0),
    "REVIEW": _fixed_weight(1.2),
}

def create_ratings_matrix(
    external_csv_path: str = "../data/external_interactions_test.csv",
) -> pd.DataFrame:
    """Build a dense user x movie matrix of time-decayed interaction scores.

    Args:
        external_csv_path: CSV file of external interactions. Defaults to the
            test export this notebook has always read.

    Returns:
        A float DataFrame indexed by ``user_id`` with one column per
        ``movie_id``; pairs without any interaction are 0.
    """
    df_external = pd.read_csv(external_csv_path)
    df_internal = pd.DataFrame(
        user_movie_interaction_service.get_all_user_interactions()
    )
    df_all = pd.concat([df_internal, df_external], ignore_index=True)

    # Cast both id columns to str so ids from the two sources share one dtype.
    df_all["user_id"] = df_all["user_id"].astype(str)
    df_all["movie_id"] = df_all["movie_id"].astype(str)

    df_all["interaction_score"] = df_all.apply(
        __compute_interaction_score, axis=1
    ).astype(float)

    df_all = __add_time_decay(df_all)

    # Several interactions on the same (user, movie) pair add up to one score.
    df_grouped = (
        df_all.groupby(["user_id", "movie_id"])["adjusted_score"]
        .sum()
        .reset_index()
        .rename(columns={"adjusted_score": "final_score"})
    )

    # NOTE(review): __normalize_scores writes its result to a "z_score" column,
    # but the pivot below still reads "final_score", so the normalisation has
    # no effect on the returned matrix. Confirm which column is intended.
    df_grouped = __normalize_scores(df_grouped)
    df_grouped["final_score"] = df_grouped["final_score"].astype(float)

    ratings_matrix = (
        df_grouped.pivot_table(
            index="user_id", columns="movie_id", values="final_score"
        )
        .fillna(0)
        .astype(float)
    )

    return ratings_matrix


def __compute_interaction_score(row):
    """Return the raw score for one interaction row.

    Looks up the scorer registered for the row's ``interaction_type`` in
    ``INTERACTION_WEIGHTS`` and applies it; unknown types score 0.
    """
    scorer = INTERACTION_WEIGHTS.get(row["interaction_type"])
    return scorer(row) if scorer else 0


def __add_time_decay(
    user_interactions: pd.DataFrame, decay_rate: float = 0.97
) -> pd.DataFrame:
    """Add an exponentially time-decayed score to every interaction.

    The frame is modified in place and also returned. Columns written:
    ``created_at`` (parsed as UTC, missing values replaced by the current
    time), ``days_ago``, ``time_decay`` and ``adjusted_score``.

    Args:
        user_interactions: Frame with ``created_at`` and ``interaction_score``.
        decay_rate: Multiplier applied once per whole day of age.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    now = datetime.datetime.now(tz=datetime.timezone.utc)

    # utc=True treats naive timestamps as UTC and converts tz-aware ones to UTC,
    # so input that is already tz-aware no longer fails the way a plain
    # tz_localize("UTC") does.
    created_at = pd.to_datetime(user_interactions["created_at"], utc=True)

    # Interactions without a timestamp are treated as happening right now.
    user_interactions["created_at"] = created_at.fillna(now)

    # Whole days since the interaction; future timestamps are clamped to 0.
    user_interactions["days_ago"] = (
        now - user_interactions["created_at"]
    ).dt.days.clip(lower=0)

    user_interactions["time_decay"] = decay_rate ** user_interactions["days_ago"]
    user_interactions["adjusted_score"] = (
        user_interactions["interaction_score"] * user_interactions["time_decay"]
    )

    return user_interactions


def __normalize_scores(df_grouped: pd.DataFrame) -> pd.DataFrame:
    """Add a per-user z-score of ``final_score`` as a ``z_score`` column.

    The frame is modified in place and also returned; ``final_score`` itself
    is left untouched. (An earlier variant rescaled ``final_score`` to a
    0-10 range per user instead.)

    Args:
        df_grouped: Frame with ``user_id`` and ``final_score`` columns.

    Returns:
        The same DataFrame object with ``z_score`` added.
    """
    user_grouped = df_grouped.groupby("user_id")["final_score"]
    per_user_mean = user_grouped.transform("mean")

    # The sample std is 0 when all of a user's scores are equal and NaN when
    # the user has a single row; use 1 in both cases so the z-score is 0
    # rather than inf/NaN.
    per_user_std = user_grouped.transform("std").replace(0, 1).fillna(1)

    df_grouped["z_score"] = (df_grouped["final_score"] - per_user_mean) / per_user_std

    return df_grouped

# Build the matrix with the create_ratings_matrix defined in this cell; this
# rebinds the `ratings_matrix` name used by earlier cells.
ratings_matrix = create_ratings_matrix()

In [6]:
from recommendation import ratings_matrix as rating_matrix_service
from recommendation import recommendation_service
import pandas as pd

# NOTE(review): `cbf_df` and `cf_df` are only produced by the commented-out
# calls below, so this cell raises NameError on a fresh kernel. Presumably they
# were computed in an earlier session — confirm and restore before Run All.
# ratings_matrix = rating_matrix_service.create_ratings_matrix("./data/external_interactions_test.csv")

# cbf_df = recommendation_service.get_content_based_filtering_model(ratings_matrix)
# cf_df = recommendation_service.get_collaborative_filtering_model(ratings_matrix)

# Outer merge keeps (user, movie) pairs scored by only one model; the missing
# score is filled with 0.
hybrid_df = pd.merge(cbf_df, cf_df, on=["user_id", "movie_id"], how="outer").fillna(0)

# Combine scores with a weighted average: alpha weights the collaborative
# score, (1 - alpha) the content-based score.
alpha = 0.7
hybrid_df["final_score"] = (
    alpha * hybrid_df["cf_score"] + (1 - alpha) * hybrid_df["content_score"]
)

In [13]:
from common.utils.utils import user_recommendations
from pymongo import UpdateOne
import datetime

def __store_predictions(predicted_df: pd.DataFrame):
    """Debug version of the bulk upsert of per-user predictions.

    As written, the function returns the first batch (at most ``batch_size``
    rows) as a list of record dicts and stops; for an empty frame it returns
    None. Everything after that first ``return`` — building the ``UpdateOne``
    operations and calling ``user_recommendations.bulk_write`` — is unreachable.

    NOTE(review): the early ``return`` statements look like temporary debugging
    probes. Remove them before relying on this function to persist anything.
    """
    batch_size = 100  # Define the batch size
    operations = []
    
    # NOTE(review): attribute access only — reset_index is never called, so
    # this line has no effect.
    predicted_df.reset_index

    # Process the DataFrame in batches
    for start in range(0, len(predicted_df), batch_size):
        end = start + batch_size
        batch: pd.DataFrame = predicted_df.iloc[start:end]
        # Debug short-circuit: exits on the first batch; nothing below runs.
        return batch.to_dict(orient="records")
        for user_id, row in batch.iterrows():
            
            # Second debug short-circuit (unreachable because of the one above).
            return batch, batch.iterrows(), user_id, row

            row_df = row.to_frame().T

            row_df = row_df.drop(columns=["user_id"], errors="ignore")

            # Convert to dictionary format
            predictions = row_df.to_dict(orient="records")

            # One upsert per row, keyed on the row's index label.
            operations.append(
                UpdateOne(
                    {"user_id": user_id},
                    {
                        "$set": {
                            "user_id": user_id,
                            "predictions": predictions,
                            "last_updated": datetime.datetime.now(
                                tz=datetime.timezone.utc
                            ),
                        }
                    },
                    upsert=True,
                )
            )

        # Execute the batch of operations
        if operations:
            # Third debug short-circuit: returns before the write would happen.
            return operations
            user_recommendations.bulk_write(operations)
            operations = []

# Earlier debugging call that unpacked the four-value debug return:
# batch, iter_rows, index, rows = __store_predictions(hybrid_df)
# With the function's current early return this receives the first batch of
# hybrid_df as a list of record dicts; nothing is written to the database.
dict_rows = __store_predictions(hybrid_df)

In [23]:
from recommendation import recommendation_service
# Keep only rows whose user_id is purely numeric — presumably the internal
# users, since the ids are cast with int() below; TODO confirm.
internal_preds = hybrid_df[hybrid_df['user_id'].str.isnumeric()]

# Take the timestamp once so every row of the batch carries the same value
# instead of a slightly different "now" per row.
batch_created_at = datetime.datetime.now(tz=datetime.timezone.utc)

# One tuple per prediction row:
# (user_id, movie_id, final_score, cf_score, content_score, timestamp)
batch = [
    (
        int(row.user_id),
        row.movie_id,
        round(row.final_score, 9),
        round(row.cf_score, 9),
        round(row.content_score, 9),
        batch_created_at,
    )
    for row in internal_preds.itertuples(index=False)
]

In [None]:
from recommendation import ratings_matrix as new_rating_matrix

# The packaged create_ratings_matrix is unpacked into three values here: the
# ratings matrix plus a user lookup and a movie lookup. Presumably the lookups
# map ids to row/column positions (a later cell inverts them) — TODO confirm.
ratings_sparse, user_id_lookup, movie_id_lookup = new_rating_matrix.create_ratings_matrix()

Function 'get_all_user_interactions' executed in 0.05 seconds.
Function '__compute_first_interaction_score' executed in 0.10 seconds.
Function '__add_time_decay' executed in 0.03 seconds.
Function '__normalize_scores' executed in 0.01 seconds.
Function 'create_ratings_matrix' executed in 11.18 seconds.


In [4]:
import pandas as pd

# Invert both lookups so a matrix position can be turned back into its id.
user_index_to_id = {position: uid for uid, position in user_id_lookup.items()}
movie_index_to_id = {position: mid for mid, position in movie_id_lookup.items()}

# Label the sparse matrix's rows and columns with the original ids.
n_users, n_movies = ratings_sparse.shape
df = pd.DataFrame.sparse.from_spmatrix(
    ratings_sparse,
    index=[user_index_to_id[pos] for pos in range(n_users)],
    columns=[movie_index_to_id[pos] for pos in range(n_movies)],
)

In [6]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_array

# Factorise the sparse ratings matrix. The default solver is randomized, so
# pin the seed to keep the factors reproducible across runs.
svd = TruncatedSVD(n_components=50, random_state=42)
user_features = svd.fit_transform(ratings_sparse)
movie_features = svd.components_

In [7]:
# Dense score matrix: user factors multiplied by movie factors.
predicted_ratings = np.matmul(user_features, movie_features)

In [31]:
# Toy data for the weighted-average experiment: nine rows of eleven
# consecutive integers, and one weight per row.
weights = [6, 1, 4, 10, 3, 6, 2, 5, 5]
vector = []

for i in range(1, 10):
    row = list(range(i, i + 11))  # i, i+1, ..., i+10
    vector.append(row)

In [32]:
# 2-D array: one row per entry of `vector`, one column per feature.
vectors = np.asarray(vector)
# 1-D array with one weight per row of `vectors`. It has to be reshaped to a
# column before it can be multiplied row-wise against `vectors`; the plain
# product (vectors * weights).sum(axis=0) / weights.sum() does not work as-is.
weights_array = np.asarray(weights)

In [34]:
# Turn the 1-D weights into a column so they broadcast across each row's features.
weights_reshaped = weights_array[:, np.newaxis]
print(weights_reshaped.shape)

(9, 1)


In [36]:
# Weighted average of the rows: scale each row by its weight, add the rows up,
# then divide by the total weight.
weighted_row_sum = (vectors * weights_reshaped).sum(axis=0)
total_weight = weights_reshaped.sum()
user_vector = weighted_row_sum / total_weight

In [19]:
# Scale each row of `vectors` by its own weight. The weights must be a column:
# multiplying the 2-D `vectors` by the flat 1-D `weights_array` cannot
# broadcast (the row length differs from the number of weights) and raises
# ValueError.
multi_array = vectors * weights_array.reshape(-1, 1)

In [44]:
# Add a leading axis so the 1-D user vector becomes a single-row 2-D array.
user_vector_re = user_vector[np.newaxis, :]
# Reshaping the single-row array again leaves its shape unchanged.
user_vector_re_re = user_vector_re.reshape(1, -1)