# MIND Recommender 

In [25]:
# Importing required libraries
import pandas as pd

# Load the provided datasets
news_path = "../datasets/MINDsmall_dev/news.tsv"
behaviors_path = "../datasets/MINDsmall_dev/behaviors.tsv"

# Define column names for each dataset
news_columns = ["itemId","category","subcategory","title","abstract","url","title_entities","abstract_entities"]
behaviors_columns = ["impressionId","userId","timestamp","click_history","impressions"]

# Load datasets
news_df = pd.read_csv(news_path, sep='\t', names=news_columns)
behaviors_df = pd.read_csv(behaviors_path, sep='\t', names=behaviors_columns)

# Display the first few rows of each dataset
news_sample = news_df.head()
behaviors_sample = behaviors_df.head()

In [26]:
news_sample

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N18955,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [27]:
behaviors_sample

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U80234,11/15/2019 12:37:50 PM,N55189 N46039 N51741 N53234 N11276 N264 N40716...,N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5...
1,2,U60458,11/15/2019 7:11:50 AM,N58715 N32109 N51180 N33438 N54827 N28488 N611...,N20036-0 N23513-1 N32536-0 N46976-0 N35216-0 N...
2,3,U44190,11/15/2019 9:55:12 AM,N56253 N1150 N55189 N16233 N61704 N51706 N5303...,N36779-0 N62365-0 N58098-0 N5472-0 N13408-0 N5...
3,4,U87380,11/15/2019 3:12:46 PM,N63554 N49153 N28678 N23232 N43369 N58518 N444...,N6950-0 N60215-0 N6074-0 N11930-0 N6916-0 N248...
4,5,U9444,11/15/2019 8:25:46 AM,N51692 N18285 N26015 N22679 N55556,N5940-1 N23513-0 N49285-0 N23355-0 N19990-0 N3...


In [32]:
# Importing required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fill NaN values with empty string for title and abstract
news_df['title'] = news_df['title'].fillna('')
news_df['abstract'] = news_df['abstract'].fillna('')

# Create a combined column of title and abstract for TF-IDF vectorization
news_df['content'] = news_df['title'] + " " + news_df['abstract']

# Create TF-IDF vectors for the news content
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
news_tfidf = vectorizer.fit_transform(news_df['content'])

# Calculate cosine similarity matrix for the news content
cosine_sim = cosine_similarity(news_tfidf, news_tfidf)

# Function to get similar news articles based on article ID
def get_similar_articles(article_id, num_recommendations=5):
    # Find index of the given article id
    idx = news_df[news_df['itemId'] == article_id].index[0]
    
    # Get similarity scores for the given article
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the articles by similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get indices and scores of the top similar articles
    similar_articles = [(i[0], i[1]) for i in sim_scores[1:num_recommendations+1]]
    
    # Create DataFrame with similar articles and their similarity scores
    result = pd.DataFrame({
        'itemId': [news_df.iloc[i[0]]['itemId'] for i in similar_articles],
        'title': [news_df.iloc[i[0]]['title'] for i in similar_articles],
        'abstract': [news_df.iloc[i[0]]['abstract'] for i in similar_articles],
        'category': [news_df.iloc[i[0]]['category'] for i in similar_articles],
        'subcategory': [news_df.iloc[i[0]]['subcategory'] for i in similar_articles],
        'similarity_score': [i[1] for i in similar_articles]
    })
    
    return result

# Function to get recommended articles for a user based on click history
def get_recommended_articles_for_user(user_id, num_recommendations=5):
    # Get click history for the user
    click_history = behaviors_df[behaviors_df['userId'] == user_id]['click_history'].values[0].split(' ')
    
    # Get indices of articles in the click history
    click_indices = [news_df[news_df['itemId'] == article_id].index[0] for article_id in click_history if article_id in news_df['itemId'].values]
    
    # Calculate average similarity scores for all articles
    avg_sim_scores = cosine_sim[click_indices].mean(axis=0)
    
    # Get indices and scores of the top recommended articles
    recommended_articles = [(i, avg_sim_scores[i]) for i in avg_sim_scores.argsort()[::-1][:num_recommendations]]
    
    # Create DataFrame with recommended articles and their similarity scores
    result = pd.DataFrame({
        'itemId': [news_df.iloc[i[0]]['itemId'] for i in recommended_articles],
        'title': [news_df.iloc[i[0]]['title'] for i in recommended_articles],
        'abstract': [news_df.iloc[i[0]]['abstract'] for i in recommended_articles],
        'category': [news_df.iloc[i[0]]['category'] for i in recommended_articles],
        'subcategory': [news_df.iloc[i[0]]['subcategory'] for i in recommended_articles],
        'similarity_score': [i[1] for i in recommended_articles]
    })
    
    return result

In [33]:
# Get similar articles for a given article ID
similar_articles_df = get_similar_articles('N55528')
print("Similar Articles for Article ID 'N55528':")
print(similar_articles_df)

Similar Articles for Article ID 'N55528':
   itemId                                              title  \
0   N9056  This Is What Queen Elizabeth Is Doing About th...   
1  N38133  The cutest photos of royal children and their ...   
2  N43522             Prince Charles is Getting Into Fashion   
3  N60671  Prince Charles Teared Up When Prince William T...   
4  N51725  Prince Charles Looks in Awe of Master Archie a...   

                                            abstract   category  \
0  According to royal insiders, Queen Elizabeth h...  lifestyle   
1  See all the cute photos of royal children with...  lifestyle   
2  Prince Charles is now getting into fashion wit...  lifestyle   
3          Frankly, it reduced me to tears, he said.  lifestyle   
4  The photo, posted by the Duke and Duchess of S...      video   

       subcategory  similarity_score  
0  lifestyleroyals          0.582324  
1  lifestyleroyals          0.517150  
2   lifestylevideo          0.481500  
3  lifestylero

In [34]:
# Get recommended articles for a given user ID
recommended_articles_df = get_recommended_articles_for_user('U80234')
print("\nRecommended Articles for User ID 'U80234':")
recommended_articles_df


Recommended Articles for User ID 'U80234':


Unnamed: 0,itemId,title,abstract,category,subcategory,similarity_score
0,N6616,Felicity Huffman Smiles as She Begins Communit...,The 56-year-old actress was released from pris...,news,newscrime,0.09953
1,N51741,Felicity Huffman begins prison sentence for co...,The actress will spend 14 days inside a federa...,tv,tv-celebrity,0.098364
2,N30924,The Rock's Gnarly Palm Is a Testament to Life ...,"Dwayne ""The Rock"" Johnson's gnarly palm was on...",health,fitness,0.0965
3,N46039,Hard Rock Hotel New Orleans collapse: Former s...,Structural engineer Walter Zehner worked on th...,news,newsus,0.090408
4,N28718,Felicity Huffman is released after serving 11 ...,Actress Felicity Huffman was released Friday f...,news,newsus,0.077547
