In [1]:
import pandas as pd

# Load the cleaned data
df = pd.read_csv('../data/processed/cleaned_news.csv')

# Display basic information
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         44898 non-null  object
 1   text          44898 non-null  object
 2   subject       44898 non-null  object
 3   date          44898 non-null  object
 4   label         44898 non-null  int64 
 5   cleaned_text  44183 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.1+ MB
None
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chai

In [2]:
df['cleaned_text'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cleaned_text'].fillna('', inplace=True)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit to 5000 features to reduce dimensionality
    stop_words='english',  # Remove common words like "the", "is"
    ngram_range=(1, 2)  # Unigrams and bigrams for more context
)

# Apply TF-IDF transformation
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Save the vectorizer for future use
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.pkl')

# Convert TF-IDF matrix to array and store
import numpy as np
np.save('../data/processed/X_tfidf.npy', X_tfidf.toarray())

print("TF-IDF transformation completed. Shape:", X_tfidf.shape)


TF-IDF transformation completed. Shape: (44898, 5000)


In [4]:
# Save the processed data
np.save('../data/processed/X_tfidf.npy', X_tfidf.toarray())
df['label'].to_csv('../data/processed/labels.csv', index=False)

print("TF-IDF features and labels saved successfully.")


TF-IDF features and labels saved successfully.


In [5]:
# Load processed features to verify
X_loaded = np.load('../data/processed/X_tfidf.npy')
print("Loaded feature shape:", X_loaded.shape)


Loaded feature shape: (44898, 5000)
