# Data Pre-Processing

In [9]:
# Imported libraries
import pandas as pd
import json
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Load the HDLU Headline Grouping Datasets - training, validation, testing
# Convert the labelled data from .json to .csv

# train.csv
with open('train.json','r') as train_file:
    train_data = json.load(train_file)

train_df = pd.DataFrame(train_data)

train_df.to_csv('train.csv', index=False)


def json_to_csv(json_file, csv_file):
    with open(json_file, 'r') as file:
        data = json.load(file)
    df = pd.DataFrame(data)
    df.to_csv(csv_file, index=False)
    return df

# test.csv
test_df = json_to_csv('test.json', 'test.csv')

# dev.csv
dev_df = json_to_csv('dev.json', 'dev.csv')

print(type(train_df),type(test_df),type(dev_df))


# Current df name = train_df #

JSONDecodeError: Unterminated string starting at: line 108027 column 19 (char 6183429)

# Inspection of Training Data

In [12]:
def inspect_data(df, name):
    print(f"DATA DISTRIBUTION FOR THE {name} DATASET\n")
    print(f"The shape of the {name} dataset is (rows,columns): {df.shape}\n")
    print("The first five rows of the dataset:")
    display(df.head())
    print(f"Information about the {name} dataset:\n")
    df.info()

    print(f"\n\nThe number of missing values in the {name} dataset:\n{df.isnull().sum()}\n")
    rounded_describe = df.describe().round(2)
    print(f"Description of {name}'s numerical features (rounded to 2 d.p.):\n")
    display(rounded_describe)

    sns.histplot(df['timeline_id'], binwidth=0.5, edgecolor ='black')
    plt.xlabel('Timeline ID')   # Fix alignment of plt.xticks() to the center of bar
    plt.ylabel('Count')
    plt.title('ID Frequency')
    plt.grid(False)
    plt.show()

    print()
    for column in df.columns:
        print(f"Value counts for column {column} in the {name} dataset:\n{df[column].value_counts()}\n\n")
    print()
    for column in df.columns:
        print(f"The number of duplicate entries in column {column} of {name} dataset: {df.duplicated().sum()}\n")
    print()
    for column in df.columns:
        print(f"The number of unique entries in column {column} of {name} dataset: {df[column].nunique()}\n")

    print(f"Are there any duplicated rows in the {name} dataset? (T/F)\n{test_df.duplicated().any()}\n")

    print(f"Correlation matrix for numerical features in {name} dataset:\n")
    corr_matrix = df.select_dtypes(include=['float64', 'int64']).corr()
    display(corr_matrix)

    headline_columns = ['headline_a', 'headline_b']
    for column in headline_columns:
        print(f"Character lengths in column {column} of {name} dataset:")
        length = train_df[column].str.len()
        print(f"Minimum length: {length.min()}")
        print(f"Maximum length: {length.max()}")
        print(f"Mode length: {length.mode().values}")
        print(f"Average length: {length.mean():.1f}")
        print(f"Median length: {length.median()}")
        print(f"Standard deviation: {length.std():.1f}\n\n")

    label_counts = df['label'].value_counts()
    majority_class_count = label_counts.iloc[0]
    minority_class_count = label_counts.iloc[1]
    imbalance_ratio = majority_class_count / minority_class_count
    total_count = majority_class_count + minority_class_count
    minority_class_proportion = minority_class_count / total_count
    majority_class_proportion = majority_class_count / total_count

    print(f"\n\nLabel Imbalance Ratio: {imbalance_ratio.round(2)}")
    print(f"\nThe proportion of the {name} dataset belonging to the minority class: {(100 * minority_class_proportion).round(2)}%")
    print(f"\nThe proportion of the {name} dataset belonging to the majority class: {(100 * majority_class_proportion).round(2)}%\n\n")

    label_counts.plot(kind='pie', autopct="%1.1f%%", colors=["lightcoral", "skyblue"])
    plt.title(f'Label Distribution in Training Dataset')
    plt.ylabel('Count')
    plt.yticks(rotation=45) # not working for pie chart 'Count
    plt.show()

# inspect_data(test_df, "Original Testing")
# print("\n\n")
# inspect_data(dev_df, "Original Validation")


# Current df name = train_df #

In [14]:
train_df = 'train.csv'
inspect_data(train_df, "'Original Training'")

# Current df name = train_df #

DATA DISTRIBUTION FOR THE 'Original Training' DATASET



AttributeError: 'str' object has no attribute 'shape'

# Duplicate Assessment

In [None]:
# Checking for Identical Rows

print(f"Are there any duplicated rows in the dataset? (T/F)\n{train_df.duplicated().any()}\n")

duplicate_rows = train_df[train_df.duplicated()]

print("Number of duplicate rows:", len(duplicate_rows))

display(duplicate_rows)

print("\nShape of dataframe containing duplicate entries:")
duplicate_rows.shape

# if needed, train_df = train_df.drop_duplicates()


    # Current df name = train_df #

Are there any duplicated rows in the dataset? (T/F)
False

Number of duplicate rows: 0


Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label



Shape of dataframe containing duplicate entries:


(0, 8)

In [None]:
# Checking for Identical Headlines - in both 'headline_a' and 'headline_b'

same_headlines = train_df[train_df['headline_a'] == train_df['headline_b']]

print(same_headlines.index)

print(f"\nRows with the same headlines`:\n")
{display(same_headlines)}

print(same_headlines.shape)
print()

train_df_no_identical_headlines = train_df.copy()

# Double-checking that the indexes align
display(train_df.iloc[1235])
print()
display(train_df_no_identical_headlines.iloc[1235])
print()
display(same_headlines.iloc[0])


train_df_no_identical_headlines = train_df_no_identical_headlines.drop(same_headlines.index)


# Check that identical headlines have been deleted
same_headlines_clean = train_df_no_identical_headlines[train_df_no_identical_headlines['headline_a'] == train_df_no_identical_headlines['headline_b']]

display(same_headlines_clean)
print(same_headlines_clean.shape)


    # Current df name = train_df_no_identical_headlines #

Index([ 1235,  1512,  1692,  1731,  2211,  2323,  2502,  2878,  2882,  4241,
        4277,  4636,  4707,  5342,  6091,  6827,  8578, 10728, 11388, 11695,
       12182, 12251, 13210],
      dtype='int64')

Rows with the same headlines`:



Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label
1235,2,Ireland to hold abortion referendum in May or ...,Ireland to hold abortion referendum in May or ...,2018-01-29,2018-01-29,http://www.cnn.com/2018/01/29/europe/ireland-a...,https://www.cnn.com/2018/01/29/europe/ireland-...,1
1512,3,USDA has $80 million-$90 million to fight bird...,USDA has $80 million-$90 million to fight bird...,2017-03-10,2017-03-10,http://www.reuters.com/article/us-health-birdf...,http://www.foxnews.com/health/2017/03/10/usda-...,1
1692,3,U.S. reports low pathogenic bird flu outbreak ...,U.S. reports low pathogenic bird flu outbreak ...,2017-03-07,2017-03-08,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/us-health-birdf...,1
1731,3,Bird flu hits another U.S. farm that supplies ...,Bird flu hits another U.S. farm that supplies ...,2017-03-17,2017-03-17,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,1
2211,3,"U.S. chicken, egg companies heighten security ...","U.S. chicken, egg companies heighten security ...",2017-03-07,2017-03-07,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,1
2323,3,Alabama reports three cases of bird flu in pou...,Alabama reports three cases of bird flu in pou...,2017-03-14,2017-03-15,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,1
2502,3,French foie gras bill will swell with bird flu...,French foie gras bill will swell with bird flu...,2017-03-10,2017-03-10,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,1
2878,2,Final campaign push before bitter Irish aborti...,Final campaign push before bitter Irish aborti...,2018-05-24,2018-05-25,http://www.france24.com/en/20180524-final-camp...,https://timesofindia.indiatimes.com/world/euro...,1
2882,2,Ireland votes in 'once-in-a-generation' aborti...,Ireland votes in 'once-in-a-generation' aborti...,2018-05-25,2018-05-25,https://www.reuters.com/article/us-ireland-abo...,https://timesofindia.indiatimes.com/world/euro...,1
4241,6,Tunisia arrests another 150 including oppositi...,Tunisia arrests another 150 including oppositi...,2018-01-12,2018-01-12,http://www.middleeasteye.net/news/tunisia-arre...,https://www.reuters.com/article/us-tunisia-pro...,1


(23, 8)



timeline_id                                                    2
headline_a     Ireland to hold abortion referendum in May or ...
headline_b     Ireland to hold abortion referendum in May or ...
date_a                                                2018-01-29
date_b                                                2018-01-29
url_a          http://www.cnn.com/2018/01/29/europe/ireland-a...
url_b          https://www.cnn.com/2018/01/29/europe/ireland-...
label                                                          1
Name: 1235, dtype: object




timeline_id                                                    2
headline_a     Ireland to hold abortion referendum in May or ...
headline_b     Ireland to hold abortion referendum in May or ...
date_a                                                2018-01-29
date_b                                                2018-01-29
url_a          http://www.cnn.com/2018/01/29/europe/ireland-a...
url_b          https://www.cnn.com/2018/01/29/europe/ireland-...
label                                                          1
Name: 1235, dtype: object




timeline_id                                                    2
headline_a     Ireland to hold abortion referendum in May or ...
headline_b     Ireland to hold abortion referendum in May or ...
date_a                                                2018-01-29
date_b                                                2018-01-29
url_a          http://www.cnn.com/2018/01/29/europe/ireland-a...
url_b          https://www.cnn.com/2018/01/29/europe/ireland-...
label                                                          1
Name: 1235, dtype: object

Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label


(0, 8)


In [None]:
# Checking for Swapped Headlines - in 'headline_a' and 'headline_b'

# Confirm new database shape after dropping rows with same headline in both columns
print(f"Shape after dropping rows with identical headlines: {train_df_no_identical_headlines.shape}")

# Create a copy of dataframe
train_df_unswapped = train_df_no_identical_headlines.copy()

# Create a column that combines 'headline_a' and 'headline_b' - irrespective of order
train_df_unswapped['combined_headline'] = train_df_unswapped[['headline_a', 'headline_b']].apply(lambda x: ' '.join(sorted(x)), axis=1)

# Count occurrences each combined headline pair - irrespective of order
swapped_counts = train_df_unswapped['combined_headline'].value_counts()

# Display filtered dataframe of rows where 'combined_headline' has been duplicated
print("\nDuplicated swapped headlines:\n")
swapped_duplicates = train_df_unswapped[train_df_unswapped['combined_headline'].isin(swapped_counts[swapped_counts > 1].index)]

display(swapped_duplicates)

# Save the original index before sorting
train_df_unswapped['original_index'] = train_df_unswapped.index

# Sort dataframe by 'label', with the value 1 coming first
train_df_unswapped = train_df_unswapped.sort_values(by='label', ascending=False)

# Remove duplicate rows based on 'combined_headline' column, keeping the first occurrence
train_df_unswapped = train_df_unswapped.drop_duplicates(subset='combined_headline', keep='first')

# Sort the dataframe back to its original order using 'original_index'
train_df_unswapped = train_df_unswapped.sort_values(by='original_index').drop(columns='original_index') # Ensure this has been done!!!

print(f"\n\nShape after dropping duplicated swapped headlines: {train_df_unswapped.shape}")

print("\nThe first five lines of the cleaned dataset:\n")
display(train_df_unswapped.head())


    # Current df name = train_unswapped #

Shape after dropping rows with identical headlines: (15469, 8)

Duplicated swapped headlines:



Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label,combined_headline
189,2,Ireland abortion referendum: Country votes in ...,Ireland votes in 'once-in-a-generation' aborti...,2018-05-25,2018-05-25,https://www.cnn.com/2018/05/25/europe/ireland-...,https://www.reuters.com/article/us-ireland-abo...,1,Ireland abortion referendum: Country votes in ...
222,1,Obama cuts short Chelsea Manning's prison sent...,"Mark Zuckerberg, in Suit, Testifies in Oculus ...",2017-01-17,2017-01-18,http://hosted2.ap.org/APDEFAULT/89ae8247abe849...,https://www.nytimes.com/2017/01/17/technology/...,0,"Mark Zuckerberg, in Suit, Testifies in Oculus ..."
307,1,Obama cuts short Chelsea Manning's prison sent...,Obama commutes Chelsea Manning's prison sentence,2017-01-17,2017-01-17,http://hosted2.ap.org/APDEFAULT/89ae8247abe849...,http://www.latimes.com/la-na-obama-manning-201...,1,Obama commutes Chelsea Manning's prison senten...
367,3,Low-pathogenic bird flu strikes Tennessee chic...,"Bird flu strikes Tennessee chickens again, in ...",2017-03-09,2017-03-10,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/us-health-birdf...,1,"Bird flu strikes Tennessee chickens again, in ..."
373,3,U.S. reports low pathogenic bird flu outbreak ...,Low-pathogenic bird flu strikes Tennessee chic...,2017-03-07,2017-03-09,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/us-health-birdf...,0,Low-pathogenic bird flu strikes Tennessee chic...
...,...,...,...,...,...,...,...,...,...
15097,6,"200 arrested, dozens hurt in fresh Tunisia unrest","Dozens hurt, 200 arrested in new Tunisia unrest",2018-01-10,2018-01-10,http://www.france24.com/en/20180110-200-arrest...,http://www.france24.com/en/20180110-tunisia-do...,1,"200 arrested, dozens hurt in fresh Tunisia unr..."
15151,2,Ireland votes in 'once-in-a-generation' aborti...,Voting brisk as Ireland embarks on abortion re...,2018-05-25,2018-05-25,https://www.reuters.com/article/us-ireland-abo...,https://www.reuters.com/article/us-ireland-abo...,1,Ireland votes in 'once-in-a-generation' aborti...
15316,3,US reported low pathogenic H5N2 bird flu in Wi...,U.S. reports low pathogenic bird flu outbreak ...,2017-03-07,2017-03-08,http://www.foxnews.com/health/2017/03/07/us-re...,http://www.reuters.com/article/us-health-birdf...,1,U.S. reports low pathogenic bird flu outbreak ...
15318,2,Ireland heads to polls in landmark abortion vote,Ireland votes in 'once-in-a-generation' aborti...,2018-05-25,2018-05-25,http://www.france24.com/en/20180525-ireland-he...,https://www.reuters.com/article/us-ireland-abo...,1,Ireland heads to polls in landmark abortion vo...




Shape after dropping duplicated swapped headlines: (15286, 9)

The first five lines of the cleaned dataset:



Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label,combined_headline
0,9,Seven bodies found after dam burst at Brazil m...,Fears rise for 300 missing in Brazil dam disas...,2019-01-25,2019-01-26,https://www.reuters.com/article/us-brazil-vale...,https://timesofindia.indiatimes.com/world/rest...,0,Fears rise for 300 missing in Brazil dam disas...
1,9,Dam collapses in Brazil: mud sludge leaves ove...,Brazil dam collapse: At least seven dead and h...,2019-01-26,2019-01-26,http://en.mercopress.com/2019/01/26/dam-collap...,https://www.independent.co.uk/news/world/ameri...,1,Brazil dam collapse: At least seven dead and h...
2,2,Remember Savita': father's plea for voters to ...,Ireland referendum could lift strict ban on ab...,2018-05-23,2018-05-25,https://www.theguardian.com/world/2018/may/23/...,http://www.foxnews.com/world/2018/05/25/irelan...,0,Ireland referendum could lift strict ban on ab...
3,1,Whistleblower Snowden can apply for Russian pa...,Barack Obama commutes sentence of Puerto Rican...,2017-01-18,2017-01-18,http://www.reuters.com/article/us-usa-snowden-...,http://www.independent.co.uk/news/us-president...,0,Barack Obama commutes sentence of Puerto Rican...
4,9,Brazilian despair turns to anger as toll from ...,Brazil dam: Startling pictures of Brumadinho c...,2019-01-28,2019-02-01,https://www.reuters.com/article/us-vale-sa-dis...,https://www.bbc.co.uk/news/world-latin-america...,0,Brazil dam: Startling pictures of Brumadinho c...


In [None]:
# Unique headline combinations

# Count occurrence of all combined headlines
headline_count = train_df_unswapped.groupby(['combined_headline']).size().reset_index(name='count')

# Identify non-unique combined headlines
non_unique_headlines = headline_count[headline_count['count'] > 1]['combined_headline']

print(f"\nShape of filtered DataFrame with non-unique headlines:{non_unique_headlines.shape}\n")

print("The filtered dataframe containing only non-unique headlines:")
display(non_unique_headlines)

# Filter out rows with non-unique combinations
train_df_unique = train_df_unswapped[~train_df_unswapped['combined_headline'].isin(non_unique_headlines)]

# Drop the temporary combined_headline column
train_df_unique = train_df_unique.drop(columns='combined_headline')

# Display the shape of the filtered DataFrame
print(f"\nShape of DataFrame with unique headlines: {train_df_unique.shape}\n\n")

print("The first five lines of the Unique dataset:\n")
display(train_df_unique.head())

    # Current df name = train_df_unique


Shape of filtered DataFrame with non-unique headlines:(0,)

The filtered dataframe containing only non-unique headlines:


Series([], Name: combined_headline, dtype: object)


Shape of DataFrame with unique headlines: (15286, 8)


The first five lines of the Unique dataset:



Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label
0,9,Seven bodies found after dam burst at Brazil m...,Fears rise for 300 missing in Brazil dam disas...,2019-01-25,2019-01-26,https://www.reuters.com/article/us-brazil-vale...,https://timesofindia.indiatimes.com/world/rest...,0
1,9,Dam collapses in Brazil: mud sludge leaves ove...,Brazil dam collapse: At least seven dead and h...,2019-01-26,2019-01-26,http://en.mercopress.com/2019/01/26/dam-collap...,https://www.independent.co.uk/news/world/ameri...,1
2,2,Remember Savita': father's plea for voters to ...,Ireland referendum could lift strict ban on ab...,2018-05-23,2018-05-25,https://www.theguardian.com/world/2018/may/23/...,http://www.foxnews.com/world/2018/05/25/irelan...,0
3,1,Whistleblower Snowden can apply for Russian pa...,Barack Obama commutes sentence of Puerto Rican...,2017-01-18,2017-01-18,http://www.reuters.com/article/us-usa-snowden-...,http://www.independent.co.uk/news/us-president...,0
4,9,Brazilian despair turns to anger as toll from ...,Brazil dam: Startling pictures of Brumadinho c...,2019-01-28,2019-02-01,https://www.reuters.com/article/us-vale-sa-dis...,https://www.bbc.co.uk/news/world-latin-america...,0


In [None]:
# Search for specific entries in DataFrame

filtered_train_df = train_df_unique[(train_df_unique['headline_a'].str.contains('Alabama reports three'))& (train_df_unique['headline_b'].str.contains(''))]
filtered_train_df

Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label
3761,3,Alabama reports three cases of bird flu in pou...,3 cases of bird flu popped up on an Alabama fa...,2017-03-14,2017-03-14,http://www.reuters.com/article/us-health-birdf...,http://www.businessinsider.in/3-cases-of-bird-...,1
7496,3,Alabama reports three cases of bird flu in pou...,Bird flu hits another U.S. farm that supplies ...,2017-03-15,2017-03-17,http://www.reuters.com/article/health-birdflu-...,http://www.reuters.com/article/us-health-birdf...,0
7774,3,Alabama reports three cases of bird flu in pou...,Bird Flu Hits Another Tennessee Chicken Farm L...,2017-03-15,2017-03-16,http://www.reuters.com/article/health-birdflu-...,https://www.bloomberg.com/news/articles/2017-0...,0
10662,3,Alabama reports three cases of bird flu in pou...,Alabama waits for U.S. verdict on bird flu; im...,2017-03-14,2017-03-18,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,0
12352,3,Alabama reports three cases of bird flu in pou...,Step up surveillance to stop bird flu spread f...,2017-03-14,2017-03-18,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/asia-birdflu-id...,0
13427,3,Alabama reports three cases of bird flu in pou...,Tennessee finds 2nd case of highly pathogenic ...,2017-03-14,2017-03-17,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,0
13721,3,Alabama reports three cases of bird flu in pou...,Poultry breeder Aviagen culls U.S. flock over ...,2017-03-14,2017-03-15,http://www.reuters.com/article/us-health-birdf...,http://www.reuters.com/article/health-birdflu-...,0


# Stop Word Removal, Lemmatization, and Tokenisation

In [None]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Obtaining dependency information for nltk from https://files.pythonhosted.org/packages/a6/0a/0d20d2c0f16be91b9fa32a77b76c60f9baf6eba419e5ef5deca17af9c582/nltk-3.8.1-py3-none-any.whl.metadata
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Obtaining dependency information for click from https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl.metadata
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Obtaining dependency information for regex>=2021.8.3 from https://files.pythonhosted.org/packages/c2/c6/023e5b634e5b72034f9e0c36396648e1481f3482c739d1b456b3e5061243/regex-2024.7.24-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading regex-2024.7.24-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
import pandas as pd
# 'pip install nltk' on terminal
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt',quiet=True)
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)

print(train_df_unique.columns)

print(f"The dataframe's confirmed shape: {train_df_unique.shape}")

    # Current df name = train_df_unique

Index(['timeline_id', 'headline_a', 'headline_b', 'date_a', 'date_b', 'url_a',
       'url_b', 'label'],
      dtype='object')
The dataframe's confirmed shape: (15286, 8)


In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenise the text
    words = word_tokenize(text)
    # Remove stopwords
    filtered_words = [word for word in words if word.isalnum() and word.lower() not in stop_words]
    # Lemmatize the filtered words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

# Apply data processing in new DataFrame
train_df_preprocessed = train_df_unique.copy()

# Apply preprocessing to both headline_a and headline_b
train_df_preprocessed['preprocessed_headline_a'] = train_df_preprocessed['headline_a'].apply(preprocess_text)
train_df_preprocessed['preprocessed_headline_b'] = train_df_preprocessed['headline_b'].apply(preprocess_text)

# Display the first few rows to confirm preprocessing
display(train_df_preprocessed.head())

    # Current df name = train_df_preprocessed
    # Updated columns = 'preprocessed_headline_a', 'preprocessed_headline_b'

Unnamed: 0,timeline_id,headline_a,headline_b,date_a,date_b,url_a,url_b,label,preprocessed_headline_a,preprocessed_headline_b
0,9,Seven bodies found after dam burst at Brazil m...,Fears rise for 300 missing in Brazil dam disas...,2019-01-25,2019-01-26,https://www.reuters.com/article/us-brazil-vale...,https://timesofindia.indiatimes.com/world/rest...,0,Seven body found dam burst Brazil mine hundred...,Fears rise 300 missing Brazil dam disaster 9 b...
1,9,Dam collapses in Brazil: mud sludge leaves ove...,Brazil dam collapse: At least seven dead and h...,2019-01-26,2019-01-26,http://en.mercopress.com/2019/01/26/dam-collap...,https://www.independent.co.uk/news/world/ameri...,1,Dam collapse Brazil mud sludge leaf 200 miner ...,Brazil dam collapse least seven dead hundred m...
2,2,Remember Savita': father's plea for voters to ...,Ireland referendum could lift strict ban on ab...,2018-05-23,2018-05-25,https://www.theguardian.com/world/2018/may/23/...,http://www.foxnews.com/world/2018/05/25/irelan...,0,Remember Savita father plea voter end Ireland ...,Ireland referendum could lift strict ban abortion
3,1,Whistleblower Snowden can apply for Russian pa...,Barack Obama commutes sentence of Puerto Rican...,2017-01-18,2017-01-18,http://www.reuters.com/article/us-usa-snowden-...,http://www.independent.co.uk/news/us-president...,0,Whistleblower Snowden apply Russian passport n...,Barack Obama commute sentence Puerto Rican ind...
4,9,Brazilian despair turns to anger as toll from ...,Brazil dam: Startling pictures of Brumadinho c...,2019-01-28,2019-02-01,https://www.reuters.com/article/us-vale-sa-dis...,https://www.bbc.co.uk/news/world-latin-america...,0,Brazilian despair turn anger toll Vale dam dis...,Brazil dam Startling picture Brumadinho collapse


# Testing Vectorisation Techniques

In [None]:
# Use this function to review the data imbalance after each relevant step

def assess_balance(df, name):
    " This function produces information about the current imbalance of the dataset "

    label_counts = df['label'].value_counts()
    majority_class_count = label_counts.iloc[0]
    minority_class_count = label_counts.iloc[1]
    imbalance_ratio = majority_class_count / minority_class_count
    total_count = majority_class_count + minority_class_count
    minority_class_proportion = minority_class_count / total_count
    majority_class_proportion = majority_class_count / total_count

    print(f"Value counts for 'label' column:\n{df['label'].value_counts()}\n\n")

    print(f"\n\nLabel Imbalance Ratio: {imbalance_ratio.round(2)}")
    print(f"\nThe proportion of the {name} dataset belonging to the minority class: {(100 * minority_class_proportion).round(2)}%")
    print(f"\nThe proportion of the {name} dataset belonging to the majority class: {(100 * majority_class_proportion).round(2)}%\n\n")

    label_counts.plot(kind='pie', autopct="%1.1f%%", colors=["lightcoral", "skyblue"])
    plt.title(f'Label Distribution in {name} Dataset')
    plt.ylabel('Count')
    plt.yticks(rotation=45) # not working for pie chart 'Count
    plt.show()

# assess_balance(df, name)

In [None]:
# Confirm number of 'features' in DataFrame

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from itertools import chain

# Combine preprocessed text from both columns
train_df_preprocessed['combined_text'] = train_df_preprocessed['preprocessed_headline_a'] + ' ' + train_df_preprocessed['preprocessed_headline_b']

# Flatten the preprocessed text into a single list
all_words = list(chain.from_iterable(train_df_preprocessed['combined_text'].str.split()))

# Count unique words
unique_words = len(set(all_words))

print("Total number of unique words / Total number of features:", unique_words)

Total number of unique words / Total number of features: 2076


# 1. TF-IDF Vectorisation

Possible hyperparameter tuning: [Hyperparameters tuning using GridSearchCV](https://https://stackoverflow.com/questions/61075012/hyperparameters-tuning-using-gridsearchcv
)

# Applied to RandomForest

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline

In [3]:
# Define the pipeline
pipeline = ImbPipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42))
])

# Define the parameters for GridSearch
param_grid = {
    # TF-IDF Vectorizer hyperparameters
    'preprocessor__tfidf_a__max_features': [None, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000],
    'preprocessor__tfidf_a__min_df': [1, 2, 5, 10],
    'preprocessor__tfidf_a__max_df': [0.5, 0.6, 0.7, 0.75, 0.8, 0.9],
    'preprocessor__tfidf_a__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preprocessor__tfidf_a__norm': ['l1', 'l2'],
    'preprocessor__tfidf_a__use_idf': [True, False],
    'preprocessor__tfidf_a__smooth_idf': [True, False],
    'preprocessor__tfidf_a__sublinear_tf': [True, False],

    'preprocessor__tfidf_b__max_features': [None, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000],
    'preprocessor__tfidf_b__min_df': [1, 2, 5, 10],
    'preprocessor__tfidf_b__max_df': [0.5, 0.6, 0.7, 0.75, 0.8, 0.9],
    'preprocessor__tfidf_b__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preprocessor__tfidf_b__norm': ['l1', 'l2'],
    'preprocessor__tfidf_b__use_idf': [True, False],
    'preprocessor__tfidf_b__smooth_idf': [True, False],
    'preprocessor__tfidf_b__sublinear_tf': [True, False],

    # RandomForestClassifier hyperparameters
    'clf__n_estimators': [50, 100, 200, 300, 400],
    'clf__max_depth': [None, 10, 20, 30, 50],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__bootstrap': [True, False],
    'clf__max_features': [None, 'sqrt', 'log2'],
}

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)

X_train = train_df_preprocessed[['preprocessed_headline_a', 'preprocessed_headline_b']]
y_train = train_df_preprocessed['label']

# Fit grid search
print("Starting Grid Search...")
grid_search.fit(X_train, y_train)
print("Completed Grid Search")

# Output the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)
print(f"Class imbalance after SMOTE: {Counter(y_train)}")

NameError: name 'train_df_preprocessed' is not defined

# Applied to SVM

In [None]:
from sklearn.svm import SVC

# Define the pipeline - replace RandomForest with SVM
pipeline = ImbPipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', SVC(random_state=42))
])

param_grid = {
    # TF-IDF Vectorizer hyperparameters
    'preprocessor__tfidf_a__max_features': [None, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000],
    'preprocessor__tfidf_a__min_df': [1, 2, 5, 10],
    'preprocessor__tfidf_a__max_df': [0.5, 0.6, 0.7, 0.75, 0.8, 0.9],
    'preprocessor__tfidf_a__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preprocessor__tfidf_a__norm': ['l1', 'l2'],
    'preprocessor__tfidf_a__use_idf': [True, False],
    'preprocessor__tfidf_a__smooth_idf': [True, False],
    'preprocessor__tfidf_a__sublinear_tf': [True, False],

    'preprocessor__tfidf_b__max_features': [None, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000],
    'preprocessor__tfidf_b__min_df': [1, 2, 5, 10],
    'preprocessor__tfidf_b__max_df': [0.5, 0.6, 0.7, 0.75, 0.8, 0.9],
    'preprocessor__tfidf_b__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preprocessor__tfidf_b__norm': ['l1', 'l2'],
    'preprocessor__tfidf_b__use_idf': [True, False],
    'preprocessor__tfidf_b__smooth_idf': [True, False],
    'preprocessor__tfidf_b__sublinear_tf': [True, False],

    # SVM hyperparameters
    'clf__C': [0.1, 1, 10, 100],
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)

# Perform grid search
X_train = train_df_preprocessed[['preprocessed_headline_a', 'preprocessed_headline_b']]
y_train = train_df_preprocessed['label']

print("Starting Grid Search...")
grid_search.fit(X_train, y_train)
print("Completed Grid Search")

print(grid_search.best_params_)
print(grid_search.best_score_)
print(f"Class imbalance after SMOTE: {Counter(y_train)}")

# Applied to a Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Replace SVM with MLPClassifier
pipeline = ImbPipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', MLPClassifier(random_state=42, max_iter=1000))
])

parameters = {
    # TF-IDF Vectorisation
    'preprocessor__tfidf_a__max_features': [None, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000],
    'preprocessor__tfidf_a__min_df': [1, 2, 5, 10],
    'preprocessor__tfidf_a__max_df': [0.5, 0.6, 0.7, 0.75, 0.8, 0.9],
    'preprocessor__tfidf_a__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preprocessor__tfidf_a__norm': ['l1', 'l2'],
    'preprocessor__tfidf_a__use_idf': [True, False],
    'preprocessor__tfidf_a__smooth_idf': [True, False],
    'preprocessor__tfidf_a__sublinear_tf': [True, False],

    # Neural Network hyperparameters
    'clf__hidden_layer_sizes': [(50, 50), (100,), (100,50 )],
    'clf__activation': ['relu', 'tanh', 'logistic'],
    'clf__solver': ['adam', 'sgd', 'lbfgs'],
    'clf__alpha': [0.0001, 0.001, 0.01],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)

# Perform grid search
X_train = train_df_preprocessed[['preprocessed_headline_a', 'preprocessed_headline_b']]
y_train = train_df_preprocessed['label']

print("Starting Grid Search...")
grid_search.fit(X_train, y_train)
print("Completed Grid Search")

print(grid_search.best_params_)
print(grid_search.best_score_)
print(f"Class imbalance after SMOTE: {Counter(y_train)}")

# 2. BERT Vectorisation

Possible ranges for Hyperparameter Tuning: [How to Generate Word Embedding using BERT?](https://https://www.geeksforgeeks.org/how-to-generate-word-embedding-using-bert/)

# Applied to RandomForest

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

In [None]:
# Define BERT function
def bert_embedding(texts):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

# Prepare data for BERT embedding
X_headline_a = bert_embedding(train_df_preprocessed['preprocessed_headline_a'].tolist())
X_headline_b = bert_embedding(train_df_preprocessed['preprocessed_headline_b'].tolist())
X_train = np.concatenate([X_headline_a, X_headline_b], axis=1)

y_train = train_df_preprocessed['label']

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)


# Define and fit Random Forest model
rf_classifier = RandomForestClassifier(random_state=42)

# Set up GridSearchCV
parameters = {
    'clf__n_estimators': [50, 100, 200, 300, 400],
    'clf__max_depth': [None, 10, 20, 30, 50],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__bootstrap': [True, False],
    'clf__max_features': [None, 'sqrt', 'log2']
}

grid_search = GridSearchCV(rf_classifier, param_grid=parameters, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(grid_search.best_params_)
print(grid_search.best_score_)

# Applied to SVM

In [None]:
# Refer to prior BERT function - def bert_embedding(texts):

# Prepare data for BERT embedding
X_headline_a = bert_embedding(train_df_preprocessed['preprocessed_headline_a'].tolist())
X_headline_b = bert_embedding(train_df_preprocessed['preprocessed_headline_b'].tolist())
X_train = np.concatenate([X_headline_a, X_headline_b], axis=1)

y_train = train_df_preprocessed['label']

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)


# Reintroduce SVM model
svm_classifier = SVC(random_state=42)

parameters = {
    # SVM hyperparameters
    'clf__C': [0.1, 1, 10, 100],
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(svm_classifier, param_grid=parameters, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(grid_search.best_params_)
print(grid_search.best_score_)

# Applied to a Neural Network

In [None]:
# Refer to prior BERT function - def bert_embedding(texts):

X_headline_a = bert_embedding(train_df_preprocessed['preprocessed_headline_a'].tolist())
X_headline_b = bert_embedding(train_df_preprocessed['preprocessed_headline_b'].tolist())
X_train = np.concatenate([X_headline_a, X_headline_b], axis=1)

y_train = train_df_preprocessed['label']

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Reintroduce Neural Network
mlp_classifier = MLPClassifier(random_state=42, max_iter=1000)

parameters = {
    # Neural Network hyperparameters
    'clf__hidden_layer_sizes': [(50, 50), (100,), (100,50 )],
    'clf__activation': ['relu', 'tanh', 'logistic'],
    'clf__solver': ['adam', 'sgd', 'lbfgs'],
    'clf__alpha': [0.0001, 0.001, 0.01],
}

grid_search = GridSearchCV(mlp_classifier, param_grid=parameters, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(grid_search.best_params_)
print(grid_search.best_score_)

# 3. Word2Vec Vectorisation

Possible Hyperparameter tuning: [Optimal hyperparameters and their impact on natural language processing downstream tasks](https://https://www.degruyter.com/document/doi/10.1515/comp-2022-0236/html?lang=en)

# Applied to RandomForest

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train Word2Vec model
headlines = [text.split() for text in train_df_preprocessed['preprocessed_headline_a']] + \
            [text.split() for text in train_df_preprocessed['preprocessed_headline_b']]


w2v_model = Word2Vec(sentences=headlines, vector_size=100, window=5, min_count=1, workers=4)

def word2vec_embedding(texts):
    return np.array([np.mean([w2v_model.wv[word] for word in text.split() if word in w2v_model.wv]
                             or [np.zeros(100)], axis=0) for text in texts])

X_headline_a = word2vec_embedding(train_df_preprocessed['preprocessed_headline_a'])
X_headline_b = word2vec_embedding(train_df_preprocessed['preprocessed_headline_b'])
X_train = np.concatenate([X_headline_a, X_headline_b], axis=1)

y_train = train_df_preprocessed['label']

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Define RandomForest
rf_classifier = RandomForestClassifier(random_state=42)

parameters = {
    'clf__n_estimators': [50, 100, 200, 300, 400],
    'clf__max_depth': [None, 10, 20, 30, 50],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__bootstrap': [True, False],
    'clf__max_features': [None, 'sqrt', 'log2']
}

grid_search = GridSearchCV(rf_classifier, param_grid=parameters, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)


# Applied to SVM

In [None]:
# Refer to function - def word2vec_embedding(texts):

X_headline_a = word2vec_embedding(train_df_preprocessed['preprocessed_headline_a'])
X_headline_b = word2vec_embedding(train_df_preprocessed['preprocessed_headline_b'])
X_train = np.concatenate([X_headline_a, X_headline_b], axis=1)

y_train = train_df_preprocessed['label']

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

svm_classifier = SVC(random_state=42)

parameters = {
    # SVM hyperparameters
    'clf__C': [0.1, 1, 10, 100],
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(svm_classifier, param_grid=parameters, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(grid_search.best_params_)
print(grid_search.best_score_)

# Applied to a Neural Network

In [None]:
X_headline_a = word2vec_embedding(train_df_preprocessed['preprocessed_headline_a'])
X_headline_b = word2vec_embedding(train_df_preprocessed['preprocessed_headline_b'])
X_train = np.concatenate([X_headline_a, X_headline_b], axis=1)

y_train = train_df_preprocessed['label']

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

mlp_classifier = MLPClassifier(random_state=42, max_iter=1000)

parameters = {
    # Neural Network hyperparameters
    'clf__hidden_layer_sizes': [(50, 50), (100,), (100,50 )],
    'clf__activation': ['relu', 'tanh', 'logistic'],
    'clf__solver': ['adam', 'sgd', 'lbfgs'],
    'clf__alpha': [0.0001, 0.001, 0.01],
}

grid_search = GridSearchCV(mlp_classifier, param_grid=parameters, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

print(grid_search.best_params_)
print(grid_search.best_score_)