In [None]:
import ast
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import umap.umap_ as umap

from modules import processor

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
# Data directory. Set the SBER_MARKET_DATA_DIR environment variable to run the
# notebook on another machine; the default keeps the original location.
PATH = os.environ.get('SBER_MARKET_DATA_DIR', '/Users/kaledinaoa/data/data_sber_market_test_ml/')

# `processor.load_csv` is a project helper (implementation not shown here);
# it is called with a file stem and the data directory.
train_raw_data = processor.load_csv("train", PATH)
# NOTE: the variable keeps its original spelling because later cells reference it.
sample_sumbission = processor.load_csv("sample_submission", PATH)

In [None]:
# Work on a copy so the raw frame stays untouched for later cells.
test_df = train_raw_data.copy()

# Preview of the raw data. It must be the last expression of the cell,
# otherwise the notebook does not render it.
train_raw_data.head(20)

In [None]:
test_df.head()

In [None]:
# user_item_matrix

# User x cart interaction matrix: each cell holds the number of
# `order_completed_at` entries for that (user_id, cart) pair, 0 when absent.
user_item_matrix = (
    test_df
    .pivot_table(
        index='user_id',
        columns='cart',
        values='order_completed_at',
        aggfunc='count',
        fill_value=0,
    )
    .astype(float)
)

# Sparse (CSR) copy of the same matrix.
sparse_user_item = csr_matrix(user_item_matrix)

user_item_matrix.head(3)


In [None]:
# Size of the interaction data: distinct users and distinct carts.
count_unique_users = len(test_df["user_id"].unique())
count_unique_carts = len(test_df["cart"].unique())

print("unique users = ", count_unique_users)
print("unique carts = ", count_unique_carts)

### model ALS

In [None]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight

In [None]:
# Build the four id lookup tables with the project helper (implementation not shown here).
# NOTE(review): presumably `itemid_to_id` / `userid_to_id` map raw cart / user ids to
# matrix column / row positions and `id_to_itemid` / `id_to_userid` are the inverses —
# confirm in `processor.some_ids_preparation_to_als`.
itemid_to_id, userid_to_id, id_to_itemid, id_to_userid = processor.some_ids_preparation_to_als(user_item_matrix)

In [None]:
%%time 

def building_model(factors, regularization=0.001, iterations=30, random_state=None):
    """Train an implicit ALS model on the notebook-level `user_item_matrix`.

    Args:
        factors (int): Number of latent factors.
        regularization (float): Regularization strength (guards against
            overfitting). Defaults to the value previously hard-coded here.
        iterations (int): Number of ALS iterations. Defaults to the value
            previously hard-coded here.
        random_state (int | None): Seed forwarded to the model so a run can be
            reproduced. ``None`` keeps the previous, unseeded behaviour.

    Returns:
        AlternatingLeastSquares: The fitted model.
    """
    model_als = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        calculate_training_loss=True,
        random_state=random_state,
    )

    # csr_matrix(...) already yields CSR, so no extra conversion is needed.
    model_als.fit(csr_matrix(user_item_matrix), show_progress=True)
    return model_als

model_als = building_model(10)

In [None]:
model = model_als

# Item embeddings and user embeddings as DataFrames (one row per item / user).
item_factors = pd.DataFrame(model.item_factors)
user_factors = pd.DataFrame(model.user_factors)

# MODEL INFORMATION: shapes of the learned factor matrices.
print('item_factors:', item_factors.shape)
print('user_factors:', user_factors.shape)
# user_factors.head(3)

In [None]:
# Project the ALS embeddings to 2 dimensions for plotting.
# A fixed seed keeps the layout reproducible across kernel restarts.
UMAP_RANDOM_STATE = 42
reducer = umap.UMAP(n_components=2, random_state=UMAP_RANDOM_STATE)  # to 2-dimension

umap_embeddings_users = reducer.fit_transform(user_factors)
# fit_transform refits the reducer, so after this line `reducer` holds the item fit only.
umap_embeddings_items = reducer.fit_transform(item_factors)

In [None]:
processor.figure_umap_embeddings(model, umap_embeddings_users, 'UMAP Visualization of User Embeddings')

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from modules import processor

### test user

In [None]:

TEST_USER_ID = 2

def reccomendation(model, n,  userid, user_item_matrix, userid_to_id):
    """Return the ids of the top-`n` items recommended to one user.

    Args:
        model: Fitted model exposing `recommend(userid, user_items, N, ...)`.
        n (int): Number of recommendations to request.
        userid: Raw user id; must be a label of `user_item_matrix.index` and a
            key of `userid_to_id`.
        user_item_matrix (DataFrame): Interaction matrix indexed by raw user id.
        userid_to_id (dict): Maps a raw user id to the id the model expects.

    Returns:
        The first element of `model.recommend(...)` (the recommended item ids).
        Items the user already interacted with are not filtered out.
    """
    # The history row is looked up by the *raw* user id label. The original code
    # used the global TEST_USER_ID here and passed the model id to `.loc`.
    user_row = user_item_matrix.loc[userid]

    recs = model.recommend(
        userid=userid_to_id[userid],
        # recommend() gets a single-row (1 x n_items) sparse matrix for this user
        user_items=csr_matrix(user_row.to_numpy().reshape(1, -1)),
        N=n,
        recalculate_user=True,
        filter_already_liked_items=False,
    )
    return recs[0]

recs = reccomendation(model_als, 10, TEST_USER_ID, user_item_matrix, userid_to_id)
recs

In [None]:
user_df = test_df[test_df["user_id"]==TEST_USER_ID]

In [None]:
def calculate_counttable_by_columnname(df, column_name):
    """
    Build a frequency table for one column of a DataFrame.

    Args:
        df (DataFrame): Source frame.
        column_name (str): Column whose values are counted.

    Returns:
        DataFrame: Two columns, `column_name` and 'count', ordered by
        ascending 'count'.
    """
    frequency = (
        df[column_name]
        .value_counts()
        .rename_axis(column_name)
        .reset_index(name='count')
    )
    return frequency.sort_values(by='count')

calculate_counttable_by_columnname(user_df, 'cart').sort_values(by="count",ascending=False).head(5)

In [None]:
# user_df

### recommend

In [None]:
# Number of recommendations requested per user.
k = 30
# Score threshold; not referenced by the cells shown in this notebook section.
score_more = 0.99

# Distinct users and carts present in the interaction data.
test_users = test_df["user_id"].unique()
test_carts = test_df["cart"].unique()
# k = test_carts.shape[0]


In [None]:
def calculate_recommendations_for_user(userid, model, k, user_item_matrix=user_item_matrix, userid_to_id=userid_to_id):
    """Return the top-`k` recommendations for one user.

    Args:
        userid: Raw user id; must be a label of `user_item_matrix.index` and a
            key of `userid_to_id`.
        model: Fitted model exposing `recommend(userid, user_items, N, ...)`.
        k (int): Number of recommendations to request.
        user_item_matrix (DataFrame): Interaction matrix indexed by raw user id.
            The default is bound when this cell is executed.
        userid_to_id (dict): Maps a raw user id to the id the model expects.
            The default is bound when this cell is executed.

    Returns:
        tuple: `(ids, scores)` as returned by `model.recommend`. Items the user
        already interacted with are not filtered out.
    """
    # Look the history row up by the raw user id label; the original passed the
    # model id to `.loc`, which is only right when both ids happen to coincide.
    user_row = user_item_matrix.loc[userid]

    ids, scores = model.recommend(
        userid=userid_to_id[userid],
        # recommend() gets a single-row (1 x n_items) sparse matrix for this user
        user_items=csr_matrix(user_row.to_numpy().reshape(1, -1)),
        N=k,
        recalculate_user=True,
        filter_already_liked_items=False,
    )
    return ids, scores


In [None]:
userid = 1

# Dry run for a single user: one result row per recommended item.
ids, scores = calculate_recommendations_for_user(userid, model, k)

# NOTE(review): the original loop body was not indented (SyntaxError) and used
# undefined `cart_id` / `target`; they are taken here from `ids` and `scores`.
# `ids` are the item ids returned by the model — confirm whether they must be
# translated back to raw cart ids (e.g. via `id_to_itemid`).
test_records = [
    {
        'user_id': userid,
        'cart_id': int(cart_id),
        'target': float(target),
    }
    for cart_id, target in zip(ids, scores)
]

# Build the frame once instead of concatenating row by row.
test_results = pd.DataFrame(test_records, columns=['user_id', 'cart_id', 'target'])

In [None]:
ids

In [None]:
scores

## submit df

In [None]:
submit_df = sample_sumbission.copy()

# Split every `id` cell ("<user>;<cart>") into two integer columns.
id_parts = submit_df['id'].str.split(';', expand=True)
submit_df[['user_id', 'cart_id']] = id_parts.astype(int)

In [None]:
submit_df.head(2)

In [None]:
# create reccomended df

def create_and_save_df_of_reccomended_lists(submit_df):
    """Collect, per user, the list of cart ids to score and save it as CSV.

    Args:
        submit_df (DataFrame): Frame with `user_id` and `cart_id` columns.

    Returns:
        DataFrame: One row per user with columns `user_id` and `carts` (that
        user's cart ids in their original row order), sorted by `user_id`
        with a fresh 0..n-1 index. The same frame is written to
        'lists_rec.csv' in the current working directory.
    """
    # One pass over the groups replaces the per-user filter plus concat-in-loop,
    # which was quadratic in the number of users. groupby sorts by user_id and
    # keeps the original row order inside each group.
    records = [
        {'user_id': user_id, 'carts': user_carts.tolist()}
        for user_id, user_carts in submit_df.groupby('user_id', sort=True)['cart_id']
    ]
    lists_rec = pd.DataFrame(records, columns=['user_id', 'carts'])

    # The index is written on purpose: the loading code drops the resulting
    # 'Unnamed: 0' column after reading the file back.
    lists_rec.to_csv('lists_rec.csv', encoding='utf-8')
    return lists_rec

# Reuse the cached CSV when it exists; otherwise build it first so the notebook
# also runs from a fresh checkout (Restart Kernel -> Run All).
LISTS_REC_CSV = 'lists_rec.csv'
if not os.path.exists(LISTS_REC_CSV):
    create_and_save_df_of_reccomended_lists(submit_df)

# Always read back from disk so `carts` is the string form in both paths;
# later cells parse that string.
lists_rec = (
    pd.read_csv(LISTS_REC_CSV, encoding='utf-8')
    .drop(columns=['Unnamed: 0'])
    .set_index('user_id')
)

In [None]:
lists_rec.head(5)

In [None]:
# NOTE: this redefines the function from the earlier "recommend" section and
# shadows it for all following cells.
def calculate_recommendations_for_user(userid, model, k, user_item_matrix=user_item_matrix, userid_to_id=userid_to_id):
    """Return the top-`k` recommendations for one user.

    Args:
        userid: Raw user id; must be a label of `user_item_matrix.index` and a
            key of `userid_to_id`.
        model: Fitted model exposing `recommend(userid, user_items, N, ...)`.
        k (int): Number of recommendations to request.
        user_item_matrix (DataFrame): Interaction matrix indexed by raw user id.
            The default is bound when this cell is executed.
        userid_to_id (dict): Maps a raw user id to the id the model expects.
            The default is bound when this cell is executed.

    Returns:
        tuple: `(ids, scores)` as returned by `model.recommend`. Items the user
        already interacted with are not filtered out.
    """
    # Look the history row up by the raw user id label; the original passed the
    # model id to `.loc`, which is only right when both ids happen to coincide.
    user_row = user_item_matrix.loc[userid]

    ids, scores = model.recommend(
        userid=userid_to_id[userid],
        # recommend() gets a single-row (1 x n_items) sparse matrix for this user
        user_items=csr_matrix(user_row.to_numpy().reshape(1, -1)),
        N=k,
        recalculate_user=True,
        filter_already_liked_items=False,
    )
    return ids, scores


In [None]:
# `carts` comes back from the CSV as the string form of a Python list.
carts_value = lists_rec.loc[0, 'carts']
# literal_eval parses list literals only; unlike eval it cannot execute
# arbitrary code that may have ended up in the file.
carts_list = ast.literal_eval(carts_value)
# carts_list
# eval(lists_rec.loc[0, 'carts'])

In [None]:
# users_ids = lists_rec["user_id"].unique()

# A cart gets target 1 only when its recommendation score exceeds this level.
score_level = 0.7
# Users scored in this run.
users_ids = [0]

# Empty result frame with the output columns.
df_test_results = pd.DataFrame(columns=['user_id', 'cart_id', 'target'])


def concat_to_df(df, userid, cartid, target):
    """Return a new frame with one result row placed in front of `df`.

    Args:
        df (DataFrame): Existing results.
        userid: Value for the 'user_id' column.
        cartid: Value for the 'cart_id' column.
        target: Value for the 'target' column.

    Returns:
        DataFrame: The new row followed by the rows of `df`, re-indexed from 0.
    """
    new_row = pd.DataFrame({
        'user_id': [userid],
        'cart_id': [cartid],
        'target': [target],
    })
    return pd.concat([new_row, df], ignore_index=True)

# Score every requested (user, cart) pair: target is 1 when the cart is among
# the user's recommendations with a score above `score_level`, otherwise 0.
result_records = []

for userid in users_ids:

    # Request as many recommendations as the user has distinct carts in the data.
    unique_cart_items = test_df.loc[test_df["user_id"] == userid, "cart"].unique().tolist()
    k = len(unique_cart_items)

    # `carts` is stored as the string form of a list; literal_eval parses it
    # without executing arbitrary code (unlike eval).
    carts_for_recs = ast.literal_eval(lists_rec.loc[userid, 'carts'])

    ids, scores = calculate_recommendations_for_user(userid, model, k)
    ids = ids.tolist()
    # NOTE(review): carts are matched directly against the ids returned by the
    # model; this assumes model item ids coincide with raw cart ids — confirm,
    # or translate through `id_to_itemid` first.
    score_by_item = dict(zip(ids, scores))

    for cart in carts_for_recs:
        score = score_by_item.get(cart)
        # Computed fresh for every cart. The original `terget = 0` typo left
        # `target` stuck at 1 after the first cart that passed the threshold.
        target = int(score is not None and float(score) > score_level)
        result_records.append({'user_id': userid, 'cart_id': cart, 'target': target})

# Build the frame once, in request order (the per-row concat prepended each row).
df_test_results = pd.DataFrame(result_records, columns=['user_id', 'cart_id', 'target'])

In [None]:
df_test_results.head(30)