In [8]:
# Importing necessary libraries
import os
import pickle
import re

import pandas as pd
from nltk import WordNetLemmatizer, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [9]:
# load the dataset
# The location can be overridden with the NEWS_DATASET_PATH environment variable so
# the notebook also runs on machines other than the author's; when the variable is
# not set, the original local path is used unchanged.
news_dataset_path = os.environ.get(
    'NEWS_DATASET_PATH',
    r'C:\Users\hrish\OneDrive\Desktop\News_dataset.pickle',
)
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# point this at a file that comes from a trusted source.
with open(news_dataset_path, 'rb') as f:
    news = pickle.load(f)

In [10]:
# Preview the first five rows of the loaded dataset.
news.head(n=5)

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575


In [11]:
# Build the working frame from only the article text and its label, then print a
# summary of its columns, non-null counts, and dtypes.
df = pd.DataFrame(
    data=news,
    columns=['Content', 'Category'],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Content   2225 non-null   object
 1   Category  2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


### Text cleaning and Tokenization

In [12]:
def clean_text(text):
    """Strip punctuation from ``text``, lowercase it, and split it into tokens.

    Every character that is neither a word character nor whitespace is deleted
    (not replaced by a space), the result is lowercased, and the string is then
    tokenized with NLTK's ``word_tokenize``.

    Args:
        text: The raw document string.

    Returns:
        The tokens produced by ``word_tokenize`` for the cleaned string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return word_tokenize(without_punctuation.lower())

In [13]:
# Clean and tokenize every article; each row of 'clean_text' holds that article's tokens.
df['clean_text'] = df['Content'].map(clean_text)

### Lemmatization

In [14]:
# One shared lemmatizer instance, reused for every token of every document.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(tokens):
    """Lemmatize each token with the shared ``WordNetLemmatizer``.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list with the lemmatized form of each token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, tokens))

In [16]:
# Lemmatize the token list of every article.
df['lemmatized_text'] = df['clean_text'].map(lemmatize_text)

### Remove Stop Words

In [17]:
# English stop words held in a set so the membership test per token is cheap.
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Drop every token that appears in ``stop_words``.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept_tokens = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

In [18]:
# Filter the stop words out of every article's lemmatized token list.
df['processed_text'] = df['lemmatized_text'].map(remove_stopwords)

### Label Encoding

In [20]:
# Learn the category labels first, then map every row's category to its integer code.
le = LabelEncoder()
le.fit(df['Category'])
df['Category_encoded'] = le.transform(df['Category'])

### TF-IDF

In [28]:
# Apply TF-IDF vectorization
# TfidfVectorizer takes one string per document, so join each token list back into a
# space-separated string. The column is built here because no other cell in the
# notebook creates it, which made this cell fail with a KeyError on a fresh kernel.
# NOTE(review): presumably this matches what the missing cell did — confirm.
df['processed_text_str'] = df['processed_text'].apply(' '.join)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text_str'])

# Example: feature names and their TF-IDF scores for the first document.
# Only row 0 is converted to a dense array; the full matrix stays sparse.
feature_names = tfidf_vectorizer.get_feature_names_out()
first_doc_features = tfidf_matrix[0].toarray().flatten()
tfidf_scores = pd.DataFrame(first_doc_features, index=feature_names, columns=["TF-IDF"])
tfidf_scores = tfidf_scores[tfidf_scores["TF-IDF"] > 0].sort_values(by="TF-IDF", ascending=False)

# Save the first document's TF-IDF representation to a DataFrame: one row per
# vocabulary term, highest score first. The dense first row computed above is reused
# instead of densifying the whole matrix again.
tfidf_representation_df = pd.DataFrame(
    {'Word': feature_names, 'TF-IDF Score': first_doc_features}
).sort_values('TF-IDF Score', ascending=False)
tfidf_representation_df.head()


Unnamed: 0,Word,TF-IDF Score
27401,timewarner,0.487146
21674,profit,0.344867
3442,aol,0.257683
29256,warner,0.210784
23199,revenue,0.141471


### Saving the Processed Data

In [29]:
# Persist the processed DataFrame and the fitted TF-IDF vectorizer as two separate
# pickle files in the current working directory.
with open('processed_data.pickle', 'wb') as data_file:
    pickle.dump(df, file=data_file)

with open('tfidf_vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, file=vectorizer_file)

## -----------------------------------

In [1]:
# Print the author attribution line (name and roll number).
print("Performed by: Hrishikesh Bari || Roll No: 68")

Performed by: Hrishikesh Bari || Roll No: 68
