In [1]:
# step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
# Fetch the corpus behind nltk.corpus.stopwords (used for stop-word removal below).
nltk.download('stopwords')
# Fetch the WordNet data; presumably for the WordNetLemmatizer imported above.
# NOTE(review): no visible cell uses the lemmatizer — confirm this download is still needed.
# As the cell's last expression, this call's return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Step 2: Load the news table
# The file is read as having no header row, so the labels are supplied explicitly.
column_names = [
    'id',
    'category',
    'subcategory',
    'title',
    'abstract',
    'url',
    'entities',
    'events',
]
# Tab-separated input; header=None keeps the first record as data.
df = pd.read_csv("news.tsv", names=column_names, header=None, sep='\t')
# Echo the column labels, then preview the first ten rows as the cell output.
print("Loaded Done:", list(df.columns))
df.head(10)

Loaded Done: ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'entities', 'events']


Unnamed: 0,id,category,subcategory,title,abstract,url,entities,events
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."
5,N2073,sports,football_nfl,Should NFL be able to fine players for critici...,Several fines came down against NFL players fo...,https://assets.msn.com/labs/mind/AAJ4lap.html,"[{""Label"": ""National Football League"", ""Type"":...","[{""Label"": ""National Football League"", ""Type"":..."
6,N49186,weather,weathertopstories,It's been Orlando's hottest October ever so fa...,There won't be a chill down to your bones this...,https://assets.msn.com/labs/mind/AAJwoxD.html,"[{""Label"": ""Orlando, Florida"", ""Type"": ""G"", ""W...","[{""Label"": ""Orlando, Florida"", ""Type"": ""G"", ""W..."
7,N59295,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,https://assets.msn.com/labs/mind/AAJ43pw.html,"[{""Label"": ""Chile"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Santiago"", ""Type"": ""G"", ""WikidataI..."
8,N24510,entertainment,gaming,Best PS5 games: top PlayStation 5 titles to lo...,Every confirmed or expected PS5 game we can't ...,https://assets.msn.com/labs/mind/AACHUn8.html,"[{""Label"": ""PlayStation"", ""Type"": ""J"", ""Wikida...",[]
9,N39237,news,newsscienceandtechnology,"How to report weather-related closings, delays","When there are active closings, view them here...",https://assets.msn.com/labs/mind/AAlErhA.html,[],"[{""Label"": ""WXII-TV"", ""Type"": ""M"", ""WikidataId..."


In [3]:
# Step 3: Basic Data Cleaning
# Fill missing abstracts so the concatenation below never sees NaN.
df['abstract'] = df['abstract'].fillna('')
# Combine title and abstract into one text field. A missing title is treated
# as empty for the concatenation only (the 'title' column itself is left
# untouched); without this, str + NaN makes `content` NaN and the later
# clean_text step fails when it calls .lower() on a float.
df['content'] = df['title'].fillna('') + ' ' + df['abstract']



In [4]:
# Step 4: Text Preprocessing
# Build the English stop-word lookup once; a set keeps each per-token
# membership check in clean_text cheap.
stop_words = {*stopwords.words('english')}

def clean_text(text, stopword_set=None):
    """Normalize one document for bag-of-words / TF-IDF vectorization.

    Steps: lowercase, turn every character that is not an ASCII letter or
    whitespace into a space, then drop stop words.

    Args:
        text: Raw document. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is treated as empty.
        stopword_set: Optional collection of lowercase words to discard.
            When omitted, the module-level ``stop_words`` is used.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Missing cells arrive as float NaN; there is nothing to clean.
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    # Lowercase
    text = text.lower()
    # Replace (rather than delete) special characters and numbers with a space,
    # so "weather-related" stays two words and a contraction such as "don't"
    # splits into fragments a stop-word list can match, instead of fusing into
    # an unmatched token like "dont".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove stopwords; split() also collapses the runs of spaces created above.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run the cleaner over every combined title+abstract string.
df['clean_content'] = df['content'].map(clean_text)


In [5]:

# Step 5: Feature Extraction with TF-IDF
tfidf = TfidfVectorizer(
    max_features=1000,  # small, capped vocabulary keeps the example simple
)
# Learn the vocabulary and weights from the cleaned text and vectorize it in one pass.
tfidf_matrix = tfidf.fit_transform(df['clean_content'])



In [6]:
# Step 6: Save Processed Data
# Write the frame, without its index, to the working directory.
# Only the DataFrame is written here; tfidf_matrix is reported below but not saved.
df.to_csv("processed_news.csv", index=False)

# Report sizes. df.shape is read after the 'content' and 'clean_content'
# columns were added, so the "Original shape" line reflects the processed frame.
print(
    "Data preprocessing completed!",
    f"Original shape: {df.shape}",
    f"TF-IDF matrix shape: {tfidf_matrix.shape}",
    sep="\n",
)

Data preprocessing completed!
Original shape: (7374, 10)
TF-IDF matrix shape: (7374, 1000)
