In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

# step 1: Load the data
df = pd.read_csv('processed_news.csv')


In [22]:
# step 2 :  Extract text from the JSON-like columns
def extract_text(row):
    # Try to parse the JSON data
    try:
        entities = json.loads(row[0])  # First column appears to contain entities
        text = row[2]  # Third column appears to contain the article text
        return text
    except:
        return str(row)  # Fallback if parsing fails

# Create a clean text column
df['clean_text'] = df.apply(extract_text, axis=1)

  entities = json.loads(row[0])  # First column appears to contain entities
  text = row[2]  # Third column appears to contain the article text


In [23]:
# step 3: Define user interests
user_interests = "technology ai computers innovation"

In [24]:
# step 4 : Create TF-IDF vectors
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
article_vectors = tfidf.fit_transform(df['clean_text'].fillna(''))
user_vector = tfidf.transform([user_interests])

In [25]:
# step 5: Calculate similarities
df['similarity'] = cosine_similarity(user_vector, article_vectors)[0]

In [26]:
# step 6: Show top matching articles
# The third column appears to contain article text - we'll use that for display
if len(df.columns) > 2:
    display_column = df.columns[2]
else:
    display_column = 'clean_text'

top_articles = df.sort_values('similarity', ascending=False)[[display_column, 'similarity']].head(10)

print("\nTop recommended articles:")
for i, (text, score) in enumerate(zip(top_articles[display_column], top_articles['similarity']), 1):
    print(f"\n{i}. Score: {score:.3f}")
    print(text[:500] + "..." if len(text) > 500 else text)


Top recommended articles:

1. Score: 0.000
Here's Exactly What Kristen Bell Eats in a Day to Make 39 Look Like 29 Find out how the actress, entrepreneur, activist and mom of two ages with grace.

2. Score: 0.000
2019 Arizona high school girls volleyball state playoff preview: 4A, 5A and 6A conferences List of the top eight seeds in the 1A, 4A, 5A and 6A brackets, top storylines, best players, favored and sleeper team picks to win.

3. Score: 0.000
Local doctor says new migraine medicine brings hope The U.S. Food and Drug Administration has approved a new migraine medicine, expected to work within two hours. "Its exciting," said Dr. Andrea Synowiec, assistant Director for Allegheny Health Network's Headache Center at West Penn Hospital. "As we see patients come back and we know they don't have a good therapy, to be able to offer something that might work is really rewarding." The drug, Reyvow, was approved after two randomized,...

4. Score: 0.000
Mariota already benched with Winston l

In [27]:
# step 7: Save results
top_articles.to_csv('recommendations.csv', index=False)