# PACKAGE IMPORTS

<div class="alert alert-block alert-info">
The libraries imported below collectively provide a comprehensive set of capabilities for handling data (pandas, numpy), manipulating text (re, nltk), and performing advanced natural language processing tasks (nltk). They are widely used in data science, machine learning, and text analytics projects because of their efficiency and versatility.
</div>








In [None]:
import pandas as pd
import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer


# DATA LOADING

In [None]:
# Load each split from its matching file: the training frame from train.csv
# and the test frame from test.csv (the two paths were previously swapped).
train_data = pd.read_csv('Data/processed/train.csv')
test_data = pd.read_csv('Data/processed/test.csv')

In [None]:
# Preview the first rows of train_data as a quick sanity check of the load.
train_data.head()

In [None]:
# Preview the first rows of test_data as a quick sanity check of the load.
test_data.head()

# Data Cleaning and Preprocessing for Text Analysis

<div class="alert alert-block alert-info">
This section covers data cleaning: preparing the text data for analysis by removing errors and inconsistencies. It downloads the required NLTK packages, loads the datasets, and cleans the text by removing noise and punctuation and converting it to lowercase. The text is then tokenized, stop words are removed, and the remaining words are stemmed and lemmatized. Finally, the processed tokens are reassembled into strings, and the cleaned datasets can optionally be saved to CSV files.
</div>








In [None]:

# Download required NLTK data (resources for the tokenisation, stop-word
# removal and lemmatisation steps further down in this cell).
# NOTE(review): recent NLTK releases may look for the tokenizer under the
# resource name 'punkt_tab' instead of 'punkt' - confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the datasets. Each frame is read from its matching file: the training
# frame from train.csv and the test frame from test.csv (the two paths were
# previously swapped).
train_data = pd.read_csv('Data/processed/train.csv')
test_data = pd.read_csv('Data/processed/test.csv')

# Clean text: remove noise and punctuation, convert to lower case
def clean_text(text):
    """Normalise a raw text value before tokenisation.

    Removes every character that is not an ASCII letter, a digit or
    whitespace, collapses each run of whitespace into a single space, trims
    the ends and lower-cases the result.

    Args:
        text: The raw value. Missing values (``None`` or a float NaN) yield
            an empty string; any other non-string value is converted with
            ``str`` before cleaning.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Strip punctuation *before* collapsing whitespace: deleting a
    # free-standing symbol (as in "a - b") would otherwise leave a double
    # space behind in the result.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Text columns that pass through every preprocessing step, in processing order.
_TEXT_COLUMNS = ('headlines', 'description', 'content')


def _transform_text_columns(step):
    """Apply ``step`` element-wise to each text column of train_data, then test_data."""
    for frame in (train_data, test_data):
        for column in _TEXT_COLUMNS:
            frame[column] = frame[column].apply(step)


# Noise/punctuation removal and lower-casing
_transform_text_columns(clean_text)

# Tokenization: each cell becomes a list of tokens
_transform_text_columns(word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens that are not in the English stop-word set."""
    return [token for token in tokens if token not in stop_words]


_transform_text_columns(_drop_stop_words)

# Stemming
stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token."""
    return [stemmer.stem(token) for token in tokens]


_transform_text_columns(_stem_tokens)

# Lemmatization (runs on the already-stemmed tokens)
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token."""
    return [lemmatizer.lemmatize(token) for token in tokens]


_transform_text_columns(_lemmatize_tokens)


# Convert lists back to strings
def _join_tokens(tokens):
    """Join a token list into one space-separated string."""
    return ' '.join(tokens)


_transform_text_columns(_join_tokens)

# Save the cleaned datasets (optional)
train_data.to_csv('train_cleaned.csv', index=False)
test_data.to_csv('test_cleaned.csv', index=False)


In [None]:
# Preview the first rows of train_data after the cleaning steps above.
train_data.head()

In [None]:
# Preview the first rows of test_data after the cleaning steps above.
test_data.head()

### Removing noise

Removing unnecessary information gets the data into a usable format. The code below replaces every web URL in the `url` column with the placeholder `url-web`:
- Replace the web URLs in `train_data`.
- Replace the web URLs in `test_data`.


In [None]:

# Regular expression matching http:// and https:// URLs, and the placeholder
# text substituted for every match.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
url = r'url-web'

# Substitute the placeholder for URLs in the 'url' column of both datasets.
for _dataset in (train_data, test_data):
    _dataset['url'] = _dataset['url'].replace(pattern_url, url, regex=True)
    


In [None]:
# Preview the first rows of test_data after the URL substitution.
test_data.head()