# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the shared-articles dataset and discard rows with no body text: the
# recommender is built from the 'text' column, so such rows cannot be
# vectorized.
df = pd.read_csv(r"C:\Users\emmaj\Downloads\shared_articles.csv").dropna(subset=['text'])

# Narrow the frame to the three columns the content-based recommender uses:
# the article identifier, its title and its text.
df_reduced = df.loc[:, ['contentId', 'title', 'text']]

# Fit a TF-IDF vectorizer (English stop words removed) on the article text
# and turn every article into one row of the resulting matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_reduced['text'])

# Pairwise cosine similarity of the TF-IDF rows against themselves; rows and
# columns follow the row order of df_reduced.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Recommendation function
def get_recommendations(item_id, sim_matrix, n=10):
    """Return the ``n`` items most similar to ``item_id``.

    Args:
        item_id: Row position of the query item in ``sim_matrix``. This is a
            positional index into the matrix, not an article identifier.
        sim_matrix: 2-D similarity matrix exposing ``.shape``; indexing it
            with ``sim_matrix[i]`` must yield the scores of item ``i``
            against every item (presumably square, one row per item).
        n: Maximum number of recommendations to return. Zero or a negative
            value yields an empty result.

    Returns:
        A dict mapping row position to similarity score, inserted in
        descending score order (ties keep ascending row order). The query
        item itself is never part of the result.

    Raises:
        ValueError: If ``item_id`` is outside the rows of ``sim_matrix``.
    """
    if item_id < 0 or item_id >= sim_matrix.shape[0]:
        raise ValueError(f"Item {item_id} is not in the similarity matrix.")
    if n <= 0:
        return {}

    # Exclude the query item by index instead of assuming it sorts first.
    # With tied scores (e.g. a duplicate article at a lower index, or an
    # all-zero row) the stable sort would put another item in front, and
    # blindly dropping position 0 would return the query item itself.
    candidates = [
        (idx, score)
        for idx, score in enumerate(sim_matrix[item_id])
        if idx != item_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return dict(candidates[:n])

# Bundle only what is needed to serve recommendations from the pickle alone.
content_model = {
    'tfidf_vectorizer': tfidf,
    'cosine_similarity': cosine_sim,
    # Map contentId to title for easy lookup.
    'contentId_to_title': dict(zip(df_reduced['contentId'], df_reduced['title'])),
    # The similarity matrix is indexed by row position, so keep the contentId
    # of every row in matrix order. Without this list the row positions
    # returned by get_recommendations cannot be translated back to articles
    # (the dict above collapses repeated contentIds, so its key order is not
    # a reliable substitute).
    'row_to_contentId': df_reduced['contentId'].tolist(),
}

# Save the model to contentmodel.sav
save_path = r"C:\Users\emmaj\Downloads\contentmodel.sav"
with open(save_path, 'wb') as f:
    pickle.dump(content_model, f)

print(f"Model has been saved to {save_path}")

