In [1]:
import importlib
import utilityFunctions
importlib.reload(utilityFunctions)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'utilityFunctions' from '/workspaces/mlops-fake-news-prediction/utilityFunctions.py'>

In [2]:
import pandas as pd
import boto3
import io

from sklearn.model_selection import train_test_split

from utilityFunctions import clean_data, prepare_features, apply_text_cleaner, remove_outliers, load_file_s3, upload_to_s3

Load raw dataset

In [3]:
df = load_file_s3('fake-news-prediction', 'datasets/WELFake_Dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


Remove duplicates so that they do not end up in the validation and test data set

In [4]:
df = df.drop_duplicates(subset='text').reset_index().drop(columns='index')
df = df.drop_duplicates(subset='title').reset_index().drop(columns='index')

Perform train-, val- and test-split

In [5]:
X = df.drop(columns='label')
y = df.loc[:, 'label']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                    y_train, 
                                                    test_size=0.25, 
                                                    random_state=42)

clean data: drop unused column 'Unnamed: 0', reverse labels into: fake=1, real=0, fill Nan-values with an empty string

prepare features: Create new features
- title_text: concatenate columns title and text
- text_word_count: # of words in text column
- title_word_count: # of words in title column
- text_unique_words: # of uniqe words in text column
- text_char_count: # of characters in text column
- title_char_count: # of characters in title column
- avg_word_length: average word length in text column
- sentence_count: # of sentences in text column
- special_char_count: # of special characters in text column
- language: estimated language for text column

In [6]:
X_train, y_train = clean_data(X_train, y_train)
X_val, y_val = clean_data(X_val, y_val)
X_test, y_test = clean_data(X_test, y_test)

X_train = prepare_features(X_train)
X_val = prepare_features(X_val)
X_test = prepare_features(X_test)

Apply text cleaner: Using NLP-Steps include Lemmatization, removing stop words, removing punctuations and substitute multiple spaces or dots to single space. Returns new column 'title_text_clean' to DataFrame with the cleaned text.

In [7]:
X_train = apply_text_cleaner(X_train, column='title_text')
X_val = apply_text_cleaner(X_val, column='title_text')
X_test = apply_text_cleaner(X_test, column='title_text')

Progress: 100%|██████████| 36840/36840 [46:29<00:00, 13.21it/s]  
Progress: 100%|██████████| 12280/12280 [15:21<00:00, 13.33it/s]
Progress: 100%|██████████| 12281/12281 [16:04<00:00, 12.73it/s]


In [8]:
# path = 'X_train.parquet'
# with open(path, 'wb',) as file:
    # df.to_parquet(file, index=False)

# print(f"CSV file was successfully saved under: {path}")

Upload to s3

In [9]:
upload_to_s3(X_train, 'fake-news-prediction', "datasets/X_train.parquet")
upload_to_s3(X_val, 'fake-news-prediction', "datasets/X_val.parquet")
upload_to_s3(X_test, 'fake-news-prediction', "datasets/X_test.parquet")

File saved as datasets/X_train.parquet in fake-news-prediction.
File saved as datasets/X_val.parquet in fake-news-prediction.
File saved as datasets/X_test.parquet in fake-news-prediction.


In [10]:
upload_to_s3(y_train, 'fake-news-prediction', "datasets/y_train.csv")
upload_to_s3(y_val, 'fake-news-prediction', "datasets/y_val.csv")
upload_to_s3(y_test, 'fake-news-prediction', "datasets/y_test.csv")

File saved as datasets/y_train.csv in fake-news-prediction.
File saved as datasets/y_val.csv in fake-news-prediction.
File saved as datasets/y_test.csv in fake-news-prediction.


Removes outliers from a DataFrame using the Median Absolute Deviation (MAD).

The threshold value for MAD (default: 3).

iterations : Number of iterations to remove outliers step by step (default: 1).

In [11]:
# X_train_without_outliers, y_train_without_outliers = remove_outliers(X_train, y_train, num_cols, threshold=3, iterations=1)