# MIND Recommender 

In [25]:
# Importing required libraries
import numpy as np
import pandas as pd

# Paths to the two tab-separated input files
news_path = "../datasets/MINDsmall_dev/news.tsv"
behaviors_path = "../datasets/MINDsmall_dev/behaviors.tsv"

# Column names applied to each file, in file order
news_columns = [
    "itemId",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]
behaviors_columns = [
    "impressionId",
    "userId",
    "timestamp",
    "click_history",
    "impressions",
]

# Read both files; every line is treated as data and labelled with the
# column names above (header=None is what passing names= already implies)
news_df = pd.read_table(news_path, header=None, names=news_columns)
behaviors_df = pd.read_table(behaviors_path, header=None, names=behaviors_columns)

# Keep the leading rows of each frame for a quick visual check
news_sample, behaviors_sample = news_df.head(), behaviors_df.head()

In [None]:
# Preview of the news table: news_sample holds news_df.head()
news_sample

In [None]:
# Preview of the behaviors table: behaviors_sample holds behaviors_df.head()
behaviors_sample

In [32]:
# Importing required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Replace NaN titles/abstracts with empty strings so the concatenation
# below always produces a string
news_df[['title', 'abstract']] = news_df[['title', 'abstract']].fillna('')

# One text field per article: "<title> <abstract>"
news_df['content'] = news_df['title'].str.cat(news_df['abstract'], sep=' ')

# TF-IDF representation of the article text (English stop words removed,
# vocabulary capped at 5000 terms)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
news_tfidf = vectorizer.fit_transform(news_df['content'])

# Pairwise cosine similarity between all articles. With a single argument
# the matrix is compared against itself, giving one row and one column per
# article -- note that this grows quadratically with the number of articles.
cosine_sim = cosine_similarity(news_tfidf)

# Function to get similar news articles based on article ID
def get_similar_articles(article_id, num_recommendations=5, *, news=None, similarity=None):
    """Return the articles most similar to ``article_id``.

    Parameters
    ----------
    article_id : str
        Value to look up in the ``itemId`` column of ``news``.
    num_recommendations : int, default 5
        Maximum number of rows to return. Values below 1 give an empty frame.
    news : pandas.DataFrame, optional
        Article table with ``itemId``, ``title``, ``abstract``, ``category``
        and ``subcategory`` columns. Defaults to the module-level ``news_df``.
    similarity : array-like, optional
        Square similarity matrix whose row/column ``i`` corresponds to row
        *position* ``i`` of ``news``. Defaults to the module-level
        ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemId``, ``title``, ``abstract``, ``category``,
        ``subcategory`` and ``similarity_score``, ordered by descending
        score (ties keep table order). The query article itself is never
        included.

    Raises
    ------
    IndexError
        If ``article_id`` does not occur in ``news['itemId']``.
    """
    news = news_df if news is None else news
    similarity = cosine_sim if similarity is None else similarity

    # Use the row *position* rather than the index label: both the similarity
    # matrix and ``iloc`` are positional, so this stays correct even when the
    # frame does not carry a default 0..n-1 index.
    matches = np.flatnonzero((news['itemId'] == article_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"article id {article_id!r} not found in news data")
    query_pos = int(matches[0])

    scores = np.asarray(similarity[query_pos], dtype=float).ravel()

    # Stable descending sort, so equal scores keep their table order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query article explicitly. Assuming it is always ranked first
    # breaks when another article ties with it (e.g. a duplicate with
    # similarity 1.0) or when its own self-similarity is 0.
    limit = max(int(num_recommendations), 0)
    ranked = ranked[ranked != query_pos][:limit]

    picked = news.iloc[ranked]
    columns = {
        name: picked[name].to_numpy()
        for name in ('itemId', 'title', 'abstract', 'category', 'subcategory')
    }
    columns['similarity_score'] = scores[ranked]
    return pd.DataFrame(columns)

# Function to get recommended articles for a user based on click history
def get_recommended_articles_for_user(user_id, num_recommendations=5, *,
                                      news=None, behaviors=None, similarity=None):
    """Recommend articles for a user from the articles they clicked before.

    Each candidate article is scored by its average similarity to the
    articles in the user's click history; the highest-scoring articles that
    the user has not already clicked are returned.

    Parameters
    ----------
    user_id : str
        Value to look up in the ``userId`` column of ``behaviors``. When the
        user has several rows, the click history of the first one is used.
    num_recommendations : int, default 5
        Maximum number of rows to return. Values below 1 give an empty frame.
    news : pandas.DataFrame, optional
        Article table with ``itemId``, ``title``, ``abstract``, ``category``
        and ``subcategory`` columns. Defaults to the module-level ``news_df``.
    behaviors : pandas.DataFrame, optional
        Table with ``userId`` and a whitespace-separated ``click_history``
        column. Defaults to the module-level ``behaviors_df``.
    similarity : array-like, optional
        Square similarity matrix whose row/column ``i`` corresponds to row
        *position* ``i`` of ``news``. Defaults to the module-level
        ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemId``, ``title``, ``abstract``, ``category``,
        ``subcategory`` and ``similarity_score``, ordered by descending
        score. Empty when the user has no click history or none of the
        clicked ids occur in ``news``.

    Raises
    ------
    IndexError
        If ``user_id`` does not occur in ``behaviors['userId']``.
    """
    news = news_df if news is None else news
    behaviors = behaviors_df if behaviors is None else behaviors
    similarity = cosine_sim if similarity is None else similarity

    result_columns = ['itemId', 'title', 'abstract', 'category', 'subcategory',
                      'similarity_score']

    histories = behaviors.loc[behaviors['userId'] == user_id, 'click_history']
    if histories.empty:
        raise IndexError(f"user id {user_id!r} not found in behaviors data")

    # A missing history is stored as NaN (a float), which has no .split().
    raw_history = histories.iloc[0]
    clicked_ids = raw_history.split() if isinstance(raw_history, str) else []

    # Build the id -> first row position map in one pass instead of scanning
    # the whole frame once per clicked article.
    position_of = {}
    for pos, item_id in enumerate(news['itemId'].to_numpy()):
        position_of.setdefault(item_id, pos)
    click_positions = [position_of[a] for a in clicked_ids if a in position_of]

    limit = max(int(num_recommendations), 0)
    if not click_positions or limit == 0:
        # Nothing to average over: return an empty result rather than
        # NaN-scored rows.
        return pd.DataFrame(columns=result_columns)

    # Average similarity of every article to the clicked articles.
    avg_scores = np.asarray(similarity)[click_positions].mean(axis=0)

    # Rank by descending score, then remove what the user has already read;
    # each clicked article has similarity 1.0 with itself and would otherwise
    # dominate the top of the list.
    ranked = np.argsort(-avg_scores, kind='stable')
    ranked = ranked[~np.isin(ranked, click_positions)][:limit]

    picked = news.iloc[ranked]
    columns = {name: picked[name].to_numpy() for name in result_columns[:-1]}
    columns['similarity_score'] = avg_scores[ranked]
    return pd.DataFrame(columns)

In [None]:
# Look up the nearest neighbours of one example article and show them
query_article_id = 'N55528'
similar_articles_df = get_similar_articles(query_article_id)
print(f"Similar Articles for Article ID '{query_article_id}':")
similar_articles_df

In [None]:
# Produce recommendations for one example user and show them
query_user_id = 'U80234'
recommended_articles_df = get_recommended_articles_for_user(query_user_id)
print(f"\nRecommended Articles for User ID '{query_user_id}':")
recommended_articles_df