## Python for Recommender Deployment Project ##

**Data Preparation**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import joblib

# Read the article metadata and the user interaction log from CSV
articles_df = pd.read_csv('shared_articles.csv')
interactions_df = pd.read_csv('users_interactions.csv')

# Weight assigned to each interaction type; a larger number is treated as a
# stronger rating
event_type_strength = {
    'VIEW': 1,
    'LIKE': 2,
    'FOLLOW': 3,
    'BOOKMARK': 4,
    'COMMENT CREATED': 5,
}

# Translate every eventType into its weight (an event type missing from the
# table above raises KeyError)
interactions_df['eventStrength'] = interactions_df['eventType'].apply(
    lambda event: event_type_strength[event]
)

# Collapse repeated interactions: keep only the strongest one per
# (personId, contentId) pair
grouped_interactions = interactions_df.groupby(
    ['personId', 'contentId'], as_index=False
)['eventStrength'].max()

# Reshape to a user x item table; pairs with no interaction become 0
user_item_matrix = (
    grouped_interactions
    .pivot(index='personId', columns='contentId', values='eventStrength')
    .fillna(0)
)
joblib.dump(user_item_matrix, './backend/models/user_item_matrix.sav')

**Collaborative Filtering Model**

In [None]:
# Calculate user similarity matrix: pairwise cosine similarity between the
# rows of user_item_matrix (one row per user)
user_similarity = cosine_similarity(user_item_matrix)
# Wrap the raw result in a DataFrame labelled with the user index on both
# axes, so similarities can be looked up by user label instead of position
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

def get_cf_recommendations(user_id, user_item_matrix, user_similarity_df, n=5):
    """
    Generate collaborative filtering recommendations for a given user.

    Each item the target user has not interacted with is scored by summing
    the similarity of every positively-similar user who did interact with it.

    Args:
        user_id: Label of the target user; looked up in the index of
            ``user_similarity_df`` and ``user_item_matrix``.
        user_item_matrix: DataFrame with one row per user and one column per
            item; a value greater than 0 marks an interaction.
        user_similarity_df: Square DataFrame of user-to-user similarity,
            labelled by user on both axes.
        n: Maximum number of recommendations to return.

    Returns:
        A dict mapping item id to accumulated similarity score, ordered from
        highest to lowest score. An empty dict (after printing a message)
        when ``user_id`` is not present in ``user_similarity_df``.
    """
    if user_id not in user_similarity_df.index:
        print(f"User {user_id} not found in the dataset")
        return {}

    # Similarity of the target user to everyone else (self excluded)
    user_similarities = user_similarity_df.loc[user_id].drop(user_id)

    # Items the target user already interacted with. Stored as a set so the
    # membership test inside the nested loop below is O(1) rather than a
    # linear scan of a list for every candidate item.
    target_row = user_item_matrix.loc[user_id]
    seen_items = set(target_row[target_row > 0].index.tolist())

    scores = {}
    for similar_user, similarity in user_similarities.items():
        if similarity <= 0:  # Skip users with no similarity
            continue

        neighbour_row = user_item_matrix.loc[similar_user]
        for item in neighbour_row[neighbour_row > 0].index.tolist():
            if item not in seen_items:
                scores[item] = scores.get(item, 0) + similarity

    # sorted() is stable, so items with equal scores keep the order in which
    # they were first encountered
    top_items = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:n]
    return dict(top_items)

# Save the model: persist the user-user similarity table and the user-item
# matrix with joblib, at paths relative to the current working directory.
# NOTE(review): user_item_matrix is also dumped to
# './backend/models/user_item_matrix.sav' in the data-preparation cell —
# confirm which of the two locations the consumer actually loads.
joblib.dump(user_similarity_df, 'cf_model.sav')
joblib.dump(user_item_matrix, 'user_item_matrix.sav')

['user_item_matrix.sav']

**Content-Based Filtering Model**

In [None]:
# Build one text document per article: the title, a space, then the body text
articles_df['content'] = articles_df['title'] + ' ' + articles_df['text']

# Vectorise the documents with TF-IDF, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(articles_df['content'])

# Item-item similarity as the dot product of every pair of TF-IDF rows
# (with a single argument, linear_kernel compares the matrix against itself).
# NOTE(review): a plain dot product equals cosine similarity only if the rows
# are unit-length; this presumably relies on TfidfVectorizer's default L2
# normalisation — confirm before changing the vectoriser's `norm` setting.
cosine_sim = linear_kernel(tfidf_matrix)

def get_content_recommendations(item_id, cosine_sim, articles_df, n=5):
    """
    Generate content-based recommendations for a given item.

    Args:
        item_id: ``contentId`` of the article to find neighbours for.
        cosine_sim: Square item-item similarity matrix whose row/column ``i``
            corresponds to the ``i``-th row (by position) of ``articles_df``.
        articles_df: DataFrame with a ``contentId`` column.
        n: Maximum number of recommendations to return.

    Returns:
        A dict mapping ``contentId`` to similarity score for the most similar
        other articles, ordered from most to least similar. An empty dict
        (after printing a message) when ``item_id`` is not in ``articles_df``.
    """
    content_ids = articles_df['contentId'].to_numpy()

    # Locate the item by *position*, not by index label: cosine_sim is a
    # positional array, so a label only works when the DataFrame happens to
    # carry a default 0..N-1 index.
    matches = np.flatnonzero((articles_df['contentId'] == item_id).to_numpy())
    if matches.size == 0:
        print(f"Item {item_id} not found in the dataset")
        return {}
    position = int(matches[0])

    # Rank every article by similarity to the query item (stable sort, so
    # ties keep their original row order)
    ranked = sorted(
        enumerate(cosine_sim[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query item explicitly instead of assuming it sorted first:
    # an article with identical content ties at the top and could otherwise
    # push the query item itself into the results.
    neighbours = [(row, score) for row, score in ranked if row != position]

    return {content_ids[row]: score for row, score in neighbours[:max(n, 0)]}

# Save the model: persist the item-item similarity matrix with joblib, at a
# path relative to the current working directory
joblib.dump(cosine_sim, 'content_model.sav')

['content_model.sav']