In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the two NLTK resources this notebook requests by name; the imports
# above pull in `stopwords` and `WordNetLemmatizer`, which are used further down.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

In [None]:
# Column labels are supplied by hand because the file is read with header=None.
column_names = [
    'id', 'category', 'subcategory', 'title',
    'abstract', 'url', 'entities', 'events',
]

# Tab-separated input (read_table uses '\t' as its default separator).
# NOTE(review): default quote handling is in effect — confirm the file has no
# unbalanced double quotes that could merge rows.
df = pd.read_table(r'data/news.tsv', header=None, names=column_names)

# One text field per article: title and abstract joined by a single space,
# with missing values replaced by empty strings first.
df['content'] = df['title'].fillna('').str.cat(df['abstract'].fillna(''), sep=' ')

# Objects shared by preprocess_text: one lemmatizer instance and the English
# stop words held in a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text; replace every run of non-word
    characters with a single space; delete digit runs; split on whitespace;
    drop tokens present in the module-level ``stop_words``; lemmatize the
    surviving tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw document as a string. ``None``, float NaN and ``pd.NA``
            are treated as an empty document instead of raising.

    Returns:
        The cleaned tokens joined by single spaces ('' when no token survives).
    """
    # Missing values map to an empty document, mirroring the fillna('') used
    # when the content column is built. `text != text` is true only for NaN.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    # Punctuation/whitespace runs become one space so words stay separated.
    text = re.sub(r'\W+', ' ', text)
    # Digits are removed outright (no space inserted), so "covid19" -> "covid".
    text = re.sub(r'\d+', '', text)
    tokens = text.split()
    # The stop-word test runs on the raw token, before lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Run the text normalizer over every combined title+abstract string.
df['clean_content'] = df['content'].map(preprocess_text)



In [None]:

# Fit a TF-IDF model capped at 5000 features on the cleaned text and
# transform the same text in one step.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['clean_content'])

# Dense copy of the matrix, one column per feature name.
# NOTE(review): toarray() materializes every cell — confirm the full dataset
# fits in memory at rows x 5000 floats.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [None]:

# Row positions (not labels) of the articles used as this user's liked set.
liked_article_indices = [30, 44, 96]

# Preview the title and abstract of those rows; the bare expression is the cell output.
df[['title', 'abstract']].iloc[liked_article_indices]


Unnamed: 0,title,abstract
30,"Without help from US, UN climate fund struggle...",Rich countries gathered Thursday in France to ...
44,These Cranberry Sauce Recipes Are Perfect for ...,You'll never want the store-bought version eve...
96,Early symptoms of dementia: Be aware of subtle...,Would you be able to recognize the symptoms of...


In [None]:

# TF-IDF rows of the liked articles, picked by position.
user_profile_matrix = tfidf_df.take(liked_article_indices)

# Column-wise average over those rows: one weight per feature.
user_profile_vector = user_profile_matrix.mean()

In [None]:

# Order the profile weights from highest to lowest and keep the first 20.
top_terms = user_profile_vector.sort_values(ascending=False).iloc[:20]

# Heading and series are emitted on separate lines by a single print call.
print("Top interests of the user:", top_terms, sep="\n")


Top interests of the user:
symptom         0.221117
sauce           0.124755
fund            0.123864
dementia        0.120002
climate         0.119251
recognize       0.118269
bought          0.116473
aware           0.114535
version         0.113215
dinner          0.105256
recipe          0.098089
perfect         0.097908
thanksgiving    0.097162
store           0.090336
ever            0.088664
able            0.087674
help            0.086526
never           0.086171
want            0.077393
un              0.075004
dtype: float64
