# Data Loading & Preprocessing

## 0. Import the required libraries

In [1]:
import nltk
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the NLTK resources used by this notebook, each one exactly once.
# NOTE(review): the original list requested 'averaged_perceptron_tagger'
# twice; the second entry may have been meant to be a different resource
# (possibly 'averaged_perceptron_tagger_eng' on recent NLTK releases) —
# confirm against the installed NLTK version.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'punkt_tab',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jefit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jefit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jefit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jefit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jefit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jefit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_percep

True

# 1. Load the data

In [2]:
# Read the two lowercase CSV files: the training set and the testing set used
# for predictions. Neither file is read with a header row, and every line is
# stored in a single column named 'raw'.
# NOTE(review): read_csv uses its default comma separator here, while the
# label and headline inside each line are tab-separated and split later —
# confirm that no line contains a comma or a double quote.
df_training = pd.read_csv(
    r'C:\Users\jefit\OneDrive\Escritorio\Ironhack\Ironhack candela\Week 4\Project\Project-2-NLP-Challenge_Group3\data\training_data_lowercase.csv',
    header=None,
    names=['raw'],
)

df_testing = pd.read_csv(
    r'C:\Users\jefit\OneDrive\Escritorio\Ironhack\Ironhack candela\Week 4\Project\Project-2-NLP-Challenge_Group3\data\testing_data_lowercase_nolabels.csv',
    header=None,
    names=['raw'],
)

# 2. Visualize the imported dataset

In [3]:
# Show the first few rows
print("🔍 First 5 rows:")
print(df_training.head())

# Check basic info about datatypes and non-null values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("\n📋 Dataset Info:")
df_training.info()

# Summary statistics for numerical and object columns
print("\n📊 Summary Statistics:")
print(df_training.describe(include='all'))

# Check for missing values
print("\n🧩 Missing Values:")
print(df_training.isnull().sum())

# Last expression of the cell: the notebook echoes (rows, columns)
df_training.shape

🔍 First 5 rows:
                                                 raw
0  0\tdonald trump sends out embarrassing new yea...
1  0\tdrunk bragging trump staffer started russia...
2  0\tsheriff david clarke becomes an internet jo...
3  0\ttrump is so obsessed he even has obama‚s na...
4  0\tpope francis just called out donald trump d...

📋 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34152 entries, 0 to 34151
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   raw     34152 non-null  object
dtypes: object(1)
memory usage: 266.9+ KB
None

📊 Summary Statistics:
                                                      raw
count                                               34152
unique                                              32206
top     1\tfactbox: trump fills top jobs for his admin...
freq                                                   14

🧩 Missing Values:
raw    0
dtype: int64


(34152, 1)

As we can see, the imported dataset contains a single column and 34,152 rows. The label and headline columns don't exist yet: each row holds both values separated by a tab, so we'll have to split the existing column into two to tell them apart.

# 3. Data preprocessing

### 3.1 Split data into 'label' and 'headline' columns:

In [4]:
# Function to split the 'raw' column into 'label' and 'headline'. By defining a function, we can easily apply it to both training and testing datasets.

def split_label_headline(df, col='raw', drop_raw=True):
    """
    Splits a tab-separated text column into 'label' and 'headline' columns.

    Each value of df[col] is split at its first tab: the text before the tab
    becomes the label and the text after it becomes the headline. Rows without
    a tab get an empty headline. A BOM character and surrounding whitespace
    are removed from the label before it is converted to an integer.

    The DataFrame is modified in place and also returned.

    Parameters:
        df: DataFrame holding the raw text column.
        col: Name of the column to split (default 'raw').
        drop_raw: If True (default), the column `col` is dropped after the split.

    Returns:
        The same DataFrame object, now with an integer 'label' column and a
        'headline' column.

    Raises:
        ValueError: propagated from the integer conversion when a label is
        not a valid integer.
    """
    split_cols = df[col].str.split('\t', n=1, expand=True)
    # expand=True only creates as many columns as the data needs: none for an
    # empty frame and a single one when no row contains a tab. Add whichever
    # positional column is missing so the renaming below always sees two.
    for position in (0, 1):
        if position not in split_cols.columns:
            split_cols[position] = ''
    split_cols.columns = ['label', 'headline']
    # Rows without a tab have no second part; store them as empty headlines
    split_cols['headline'] = split_cols['headline'].fillna('')
    # Remove BOM and whitespace from label
    split_cols['label'] = split_cols['label'].str.replace('\ufeff', '', regex=False).str.strip()
    df[['label', 'headline']] = split_cols
    df['label'] = df['label'].astype(int)
    if drop_raw:
        df.drop(columns=col, inplace=True)
    return df

# Apply to your datasets
df_training = split_label_headline(df_training)
df_testing = split_label_headline(df_testing)

In [5]:
df_training.shape, df_testing.shape

((34152, 2), (9984, 2))

In [6]:
print(df_training.head())

print(df_testing.head())

   label                                           headline
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...
   label                                           headline
0      2  copycat muslim terrorist arrested with assault...
1      2  wow! chicago protester caught on camera admits...
2      2   germany's fdp look to fill schaeuble's big shoes
4      2  u.n. seeks 'massive' aid boost amid rohingya '...


### 3.2 Removing duplicate & missing values from the training dataset

In [7]:
def clean_dataframe(df, col='headline', label_col='label'):
    """
    Returns a filtered copy of the DataFrame without unusable or repeated rows.

    A row is discarded when its value in `col` is missing or contains nothing
    but whitespace. Among the remaining rows, those repeating an earlier
    (label_col, col) combination are discarded as well. The surviving rows
    keep their original index labels; the input DataFrame is not modified.
    """
    values = df[col]
    # A row is blank when the value is null or empty once stripped
    is_blank = values.isnull() | (values.str.strip() == '')
    usable_rows = df[~is_blank]
    return usable_rows.drop_duplicates(subset=[label_col, col])

# Only applied to the training dataset, because we don't want duplicates or missing values to affect our training process.
df_training = clean_dataframe(df_training)

In [8]:
df_training.shape, df_testing.shape

((32206, 2), (9984, 2))

### 3.3 Removing unwanted characters

In [9]:
# Any character that is not an ASCII letter, a digit, whitespace, an
# apostrophe or a hyphen.
_UNWANTED_CHARS = re.compile(r"[^a-zA-Z0-9\s'-]")
# Any run of whitespace characters (spaces, tabs, newlines).
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_headline(text):
    """
    Removes unwanted characters from text, keeping only ASCII letters, digits,
    whitespace, apostrophes, and hyphens. Every other character (punctuation,
    symbols, non-ASCII letters) is deleted, not replaced by a space.

    Runs of whitespace are then collapsed into a single space, and leading and
    trailing spaces are removed.

    Parameters:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Delete everything outside the allowed character set
    text = _UNWANTED_CHARS.sub('', text)
    # Replace multiple spaces/newlines/tabs with a single space
    text = _WHITESPACE_RUN.sub(' ', text)
    # Remove spaces at the beginning and end
    return text.strip()


df_training['headline'] = df_training['headline'].apply(clean_headline)
df_testing['headline'] = df_testing['headline'].apply(clean_headline)

In [10]:
print(df_training['headline'])

print(df_testing['headline'])

0        donald trump sends out embarrassing new years ...
1        drunk bragging trump staffer started russian c...
2        sheriff david clarke becomes an internet joke ...
3        trump is so obsessed he even has obamas name c...
4        pope francis just called out donald trump duri...
                               ...                        
34147    tears in rain as thais gather for late king's ...
34148    pyongyang university needs non-us teachers as ...
34149    philippine president duterte to visit japan ah...
34150    japan's abe may have won election but many don...
34151    demoralized and divided inside catalonia's pol...
Name: headline, Length: 32206, dtype: object
0       copycat muslim terrorist arrested with assault...
1       wow chicago protester caught on camera admits ...
2        germany's fdp look to fill schaeuble's big shoes
4       un seeks 'massive' aid boost amid rohingya 'em...
                              ...                        
9979    boom fox

### 3.3 Tokenize the data

In [11]:
def tokenize_headlines(df, col='headline', new_col='tokens'):
    """
    Adds a column holding the token list of every headline.

    Each value of `col` is passed to NLTK's word_tokenize and the result is
    written to `new_col`. The DataFrame is modified in place and returned.
    """
    token_lists = df[col].apply(word_tokenize)
    df[new_col] = token_lists
    return df

# Tokenize the headlines of both datasets:
df_training = tokenize_headlines(df_training)
df_testing = tokenize_headlines(df_testing)

In [12]:
df_training.shape, df_testing.shape

((32206, 3), (9984, 3))

In [13]:
df_training.head(), df_testing.head()

(   label                                           headline  \
 0      0  donald trump sends out embarrassing new years ...   
 1      0  drunk bragging trump staffer started russian c...   
 2      0  sheriff david clarke becomes an internet joke ...   
 3      0  trump is so obsessed he even has obamas name c...   
 4      0  pope francis just called out donald trump duri...   
 
                                               tokens  
 0  [donald, trump, sends, out, embarrassing, new,...  
 1  [drunk, bragging, trump, staffer, started, rus...  
 2  [sheriff, david, clarke, becomes, an, internet...  
 3  [trump, is, so, obsessed, he, even, has, obama...  
 4  [pope, francis, just, called, out, donald, tru...  ,
    label                                           headline  \
 0      2  copycat muslim terrorist arrested with assault...   
 1      2  wow chicago protester caught on camera admits ...   
 2      2   germany's fdp look to fill schaeuble's big shoes   
 4      2  un seeks '

### 3.4 Remove stopwords

In [14]:
def remove_stopwords(df, tokens_col='tokens', new_col='tokens_nostop', language='english'):
    """
    Adds a column with the token lists stripped of stopwords.

    The stopword list for `language` is loaded from the NLTK corpus once per
    call. Every token list in `tokens_col` is then filtered against it
    (exact, case-sensitive match) and the result is written to `new_col`.
    The DataFrame is modified in place and returned.
    """
    stop_words = set(stopwords.words(language))

    def keep_non_stopwords(tokens):
        # Preserve the original order of the remaining tokens
        return [word for word in tokens if word not in stop_words]

    df[new_col] = df[tokens_col].apply(keep_non_stopwords)
    return df

# Remove stopwords from both datasets:
df_training = remove_stopwords(df_training)
df_testing = remove_stopwords(df_testing)

In [15]:
df_training.shape, df_testing.shape

((32206, 4), (9984, 4))

In [16]:
df_training.head(), df_testing.head()

(   label                                           headline  \
 0      0  donald trump sends out embarrassing new years ...   
 1      0  drunk bragging trump staffer started russian c...   
 2      0  sheriff david clarke becomes an internet joke ...   
 3      0  trump is so obsessed he even has obamas name c...   
 4      0  pope francis just called out donald trump duri...   
 
                                               tokens  \
 0  [donald, trump, sends, out, embarrassing, new,...   
 1  [drunk, bragging, trump, staffer, started, rus...   
 2  [sheriff, david, clarke, becomes, an, internet...   
 3  [trump, is, so, obsessed, he, even, has, obama...   
 4  [pope, francis, just, called, out, donald, tru...   
 
                                        tokens_nostop  
 0  [donald, trump, sends, embarrassing, new, year...  
 1  [drunk, bragging, trump, staffer, started, rus...  
 2  [sheriff, david, clarke, becomes, internet, jo...  
 3  [trump, obsessed, even, obamas, name, code

### 3.5 Apply lemmatization

In [17]:
def lemmatize_tokens(df, tokens_col='tokens_nostop', new_col='tokens_lemmatized'):
    """
    Adds a column with the lemmatized version of every token list.

    Each token in `tokens_col` is passed to WordNetLemmatizer.lemmatize
    without a part-of-speech argument, so the lemmatizer's default applies.
    The resulting lists are written to `new_col`. The DataFrame is modified
    in place and returned.
    """
    # One lemmatizer instance is shared by all rows
    lemmatize = WordNetLemmatizer().lemmatize

    def lemmatize_all(tokens):
        return [lemmatize(token) for token in tokens]

    df[new_col] = df[tokens_col].apply(lemmatize_all)
    return df

# Lemmatize the tokens of both datasets:
df_training = lemmatize_tokens(df_training)
df_testing = lemmatize_tokens(df_testing)

In [18]:
df_training.shape, df_testing.shape

((32206, 5), (9984, 5))

In [19]:
df_training.head(), df_testing.head()

(   label                                           headline  \
 0      0  donald trump sends out embarrassing new years ...   
 1      0  drunk bragging trump staffer started russian c...   
 2      0  sheriff david clarke becomes an internet joke ...   
 3      0  trump is so obsessed he even has obamas name c...   
 4      0  pope francis just called out donald trump duri...   
 
                                               tokens  \
 0  [donald, trump, sends, out, embarrassing, new,...   
 1  [drunk, bragging, trump, staffer, started, rus...   
 2  [sheriff, david, clarke, becomes, an, internet...   
 3  [trump, is, so, obsessed, he, even, has, obama...   
 4  [pope, francis, just, called, out, donald, tru...   
 
                                        tokens_nostop  \
 0  [donald, trump, sends, embarrassing, new, year...   
 1  [drunk, bragging, trump, staffer, started, rus...   
 2  [sheriff, david, clarke, becomes, internet, jo...   
 3  [trump, obsessed, even, obamas, name, 