In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# --- Data preparation -------------------------------------------------------
# Read the shared-articles dataset and discard rows without body text, since
# the TF-IDF step below needs a string for every document.
df = pd.read_csv(r"C:\Users\emmaj\Downloads\shared_articles.csv").dropna(subset=['text'])

# Keep just the identifier, title and body. Resetting the index makes the
# row labels run 0..n-1 so they line up with the rows of the matrices below.
df_reduced = df.loc[:, ['contentId', 'title', 'text']].reset_index(drop=True)

# --- Feature extraction -----------------------------------------------------
# Fit a TF-IDF model (English stop words removed) on the article bodies.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_reduced['text'])

# Pairwise cosine similarity of every article against every other article;
# row i / column j both follow the row order of df_reduced.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# --- Lookup tables ----------------------------------------------------------
# Row position -> contentId, and the reverse. If a contentId appears on more
# than one row, the reverse map keeps the last row position seen.
index_to_content_id = dict(enumerate(df_reduced['contentId']))
content_id_to_index = {cid: idx for idx, cid in index_to_content_id.items()}

# Generate top 5 recommendations for each article.
#
# Each entry of `recommendations` is a dict with the source article's
# contentId, the recommended article's contentId, and their cosine similarity
# rounded to 4 decimal places.
recommendations = []

# Iterate the similarity matrix row by row; row position `idx` is the same
# position used as the key in index_to_content_id.
for idx, scores in enumerate(cosine_sim):
    # Exclude the article itself explicitly by position. Relying on it being
    # ranked first (and slicing it off) is wrong whenever its self-similarity
    # is not the strict maximum: e.g. an earlier row with identical text ties
    # at the same score and, because the sort is stable, is ranked ahead of
    # it -- the slice would then drop the duplicate and recommend the article
    # to itself.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(scores)
        if other_idx != idx
    ]
    # Highest similarity first; ties keep ascending row order (stable sort).
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Fewer than 5 other articles simply yields fewer recommendations.
    for rec_idx, score in candidates[:5]:
        recommendations.append({
            'contentId': index_to_content_id[idx],
            'recommendedContentId': index_to_content_id[rec_idx],
            'similarityScore': round(score, 4)
        })

# Persist the recommendations as a flat CSV file.
# One row per (article, recommended article) pair; the DataFrame's own index
# is not written out.
rec_df = pd.DataFrame(recommendations)
csv_path = r"C:\Users\emmaj\Downloads\content_recommendations.csv"
rec_df.to_csv(csv_path, index=False)

# Report where the file was written.
print(f"Recommendation CSV saved to: {csv_path}")

# Bundle the fitted artifacts and pickle them for the web app.
#
# The bundle holds the fitted TF-IDF vectorizer, the full article-by-article
# cosine similarity matrix, and a contentId -> title lookup.
# NOTE(review): the similarity matrix is addressed by row position, but no
# position -> contentId mapping is stored here -- confirm the consumer has
# another way to interpret its rows/columns.
content_model = dict(
    tfidf_vectorizer=tfidf,
    cosine_similarity=cosine_sim,
    contentId_to_title={
        cid: title
        for cid, title in zip(df_reduced['contentId'], df_reduced['title'])
    },
)

model_path = r"C:\Users\emmaj\Downloads\contentmodel.sav"
# Binary mode is required for pickle output.
with open(model_path, 'wb') as model_file:
    pickle.dump(content_model, model_file)

# Report where the model was written.
print(f"Model saved to: {model_path}")




Recommendation CSV saved to: C:\Users\emmaj\Downloads\content_recommendations.csv
Model saved to: C:\Users\emmaj\Downloads\contentmodel.sav
