<a href="https://colab.research.google.com/github/kishore07/AI_ML_Project/blob/main/News_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Load the dataset, which is stored as JSON Lines (one JSON object per line).
data = []
skipped_lines = 0  # malformed lines are counted so truncated/corrupt input is visible
with open('News_Category_Dataset_v3.json', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # blank lines carry no record
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1  # best effort: keep loading, but remember the loss

if skipped_lines:
    print(f"Warning: skipped {skipped_lines} malformed line(s) while loading the dataset")

# Show one parsed record as a sanity check of the schema.
data[1]

{'link': 'https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02',
 'headline': '23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23)',
 'category': 'COMEDY',
 'short_description': '"Until you have a dog you don\'t understand what could be eaten."',
 'authors': 'Elyse Wanshel',
 'date': '2022-09-23'}

In [28]:
# Turn the parsed JSON records into a table: one row per article.
df = pd.DataFrame(data)

# Quick overview of what was loaded: row count, distinct categories, columns.
print(f"Dataset loaded: {df.shape[0]} news articles")
print("Categories:", df['category'].unique())
print("Sample columns:", list(df.columns))

Dataset loaded: 12916 news articles
Categories: ['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT'
 'WEDDINGS']
Sample columns: ['link', 'headline', 'category', 'short_description', 'authors', 'date']


In [29]:
def recommend_news_by_category(category_name, top_n=5, articles=None):
    """Recommend articles from one category that resemble its first article.

    The headline and short description of every article in the category are
    vectorised with TF-IDF, and the articles are ranked by cosine similarity
    to the first article of the category (position 0 after filtering).

    Parameters
    ----------
    category_name : str
        Value matched exactly against the ``category`` column.
    top_n : int, default 5
        Maximum number of recommendations to return.
    articles : pandas.DataFrame, optional
        Frame to search; it must provide ``category``, ``headline`` and
        ``short_description`` columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    list of dict or str
        One dict per recommendation with the keys ``headline``, ``category``,
        ``authors``, ``date``, ``link`` and ``similarity_score`` (rounded to
        three decimals), ordered from most to least similar. A message string
        is returned when the category has no articles. An empty list is
        returned when there is nothing to compare against (a single article,
        ``top_n <= 0``, or no usable vocabulary).
    """
    source = df if articles is None else articles
    category_articles = source[source['category'] == category_name]

    if len(category_articles) == 0:
        return f"No articles found for category: {category_name}"

    # A lone article has no neighbours, and TfidfVectorizer(min_df=2) raises
    # on a single document, so stop before vectorising.
    if len(category_articles) < 2 or top_n <= 0:
        return []

    # fillna on both columns: a missing headline would otherwise turn the
    # whole concatenated text into NaN.
    texts = (category_articles['headline'].fillna('') + ' ' +
             category_articles['short_description'].fillna('')).tolist()

    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2
    )
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError:
        # Raised when stop-word removal / min_df pruning leaves no terms
        # (typical for very small categories): nothing can be compared.
        return []

    # Only the reference row is needed, so compare that single row against
    # all rows instead of building the full N x N similarity matrix.
    idx = 0
    scores = cosine_similarity(tfidf_matrix[idx:idx + 1], tfidf_matrix).ravel()

    # Stable descending order keeps ties in original row order. The reference
    # article is excluded by position rather than by assuming it sorts first.
    ranked = [i for i in np.argsort(-scores, kind='stable') if i != idx][:top_n]

    recommendations = []
    for i in ranked:
        article = category_articles.iloc[i]
        recommendations.append({
            'headline': article['headline'],
            'category': article['category'],
            'authors': article.get('authors', 'N/A'),
            'date': article.get('date', 'N/A'),
            'link': article.get('link', 'N/A'),
            'similarity_score': round(float(scores[i]), 3)
        })

    return recommendations


In [23]:
if __name__ == "__main__":
    # List the ten categories with the most articles, largest first.
    print("\nTop categories:")
    print(df['category'].value_counts().iloc[:10])


Top categories:
category
POLITICS         4455
ENTERTAINMENT    2198
WORLD NEWS       1382
U.S. NEWS        1376
COMEDY            486
BLACK VOICES      280
MEDIA             274
CRIME             269
SPORTS            257
QUEER VOICES      256
Name: count, dtype: int64


In [31]:
# Demo: print the ten recommendations for one category.
category = 'POLITICS'
# f-string so the header shows the chosen category instead of the literal "{category}".
print(f"\nTop 10 recommendations for '{category}':")
recs = recommend_news_by_category(category, top_n=10)
if isinstance(recs, str):
    # The recommender returns a plain message (not a list) when the category
    # has no articles; iterating over it would fail on rec['headline'].
    print(recs)
else:
    for i, rec in enumerate(recs, 1):
        print(f"{i}. {rec['headline']}")
        print(f"   Score: {rec['similarity_score']} | Authors: {rec['authors']}")
        print(f"   Date: {rec['date']} | Link: {rec['link']}\n")


Top 10 recommendations for '{category}':
1. Trump Hopes You Forget How He Praised China And The WHO Before Blaming Them
   Score: 0.518 | Authors: S.V. Date
   Date: 2020-05-19 | Link: https://www.huffpost.com/entry/trump-china-reversal-coronavirus-blame_n_5ec44c7bc5b61e42ad3d8876

2. Biden Says 'Substance Of Charge' Against Trump Was 'Not In Dispute'
   Score: 0.39 | Authors: Mary Papenfuss
   Date: 2021-02-14 | Link: https://www.huffpost.com/entry/biden-trump-impeachment-acquittal-statement_n_6028dcdcc5b680717ee860b6

3. American Expert Axed From CDC Post In China Months Before Coronavirus Outbreak
   Score: 0.39 | Authors: Marisa Taylor, Reuters
   Date: 2020-03-22 | Link: https://www.huffpost.com/entry/cdc-expert-axed-china_n_5e77dd7bc5b63c3b6492aba3

4. China Trolls Trump Over Report It's Hacked His iPhone
   Score: 0.371 | Authors: Ryan Grenoble
   Date: 2018-10-25 | Link: https://www.huffpost.com/entry/china-trump-hacked-iphone-huawei_n_5bd1e5b3e4b055bc9489a3ad

5. Kudlow Claim

In [21]:
# Display every article filed under POLITICS (pandas truncates the rendering).
df.loc[df['category'].eq('POLITICS')]

Unnamed: 0,link,headline,category,short_description,authors,date
20,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19
23,https://www.huffpost.com/entry/ukraine-festiva...,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,An annual celebration took on a different feel...,Jonathan Nicholson,2022-09-19
29,https://www.huffpost.com/entry/europe-britain-...,Biden Says Queen's Death Left 'Giant Hole' For...,POLITICS,"U.S. President Joe Biden, in London for the fu...","Darlene Superville, AP",2022-09-18
39,https://www.huffpost.com/entry/afghan-adjustme...,Bill To Help Afghans Who Escaped Taliban Faces...,POLITICS,Republican outrage over the shoddy U.S. withdr...,Hamed Ahmadi and Arthur Delaney,2022-09-16
43,https://www.huffpost.com/entry/capitol-riot-in...,Mark Meadows Complies With Justice Dept. Subpo...,POLITICS,The former White House chief of staff has turn...,"ERIC TUCKER, AP",2022-09-15
...,...,...,...,...,...,...
12896,https://www.huffingtonpost.com/entry/orange-co...,A Mass Shooting Tore Their Lives Apart. A Corr...,POLITICS,As prosecutors fixated on the death penalty fo...,Matt Ferner,2018-03-09
12900,https://www.huffingtonpost.com/entry/trump-mis...,Trump Allegedly Cut Women Of Color From Miss U...,POLITICS,A new book excerpt says Trump thought certain ...,Alanna Vagianos,2018-03-09
12901,https://www.huffingtonpost.com/entry/rob-porte...,White House Refuses House Investigation Reques...,POLITICS,"Rep. Elijah Cummings, the top Democrat on the ...",Marina Fang,2018-03-09
12907,https://www.huffingtonpost.com/entry/donald-tr...,Donald Trump Will Meet With Kim Jong Un By May...,POLITICS,"Kim reportedly has expressed his ""eagerness to...",Nick Visser,2018-03-09
