In [None]:
import glob
import os
import re
import shutil
import string
import tempfile
import zipfile

import gdown
import numpy as np
import pandas as pd

In [2]:
def download_and_process_zip_files(file_id_dict):
    """
    Download .zip files from Google Drive, extract them, combine CSV data.

    Every row is tagged with the name of the archive it came from in a
    'label' column (a 'label' column already present in a CSV is
    overwritten). An 'Unnamed: 0' column, if present, is dropped.

    Each archive is downloaded and extracted inside its own temporary
    directory, so nothing is left behind in (or removed from) the working
    directory, even when a download or an extraction fails.

    Params:
        file_id_dict: dict
            Dictionary in the form { 'filename.zip': 'google_drive_file_id' }

    Returns:
        combined_df: pandas.DataFrame
            Rows of every CSV found in every archive, re-indexed from 0.

    Raises:
        FileNotFoundError: an archive was not present after the download step.
        ValueError: no CSV file was found in any archive (this includes an
            empty file_id_dict).
    """
    print("Starting download and extraction...\n")
    all_dfs = []

    for zip_filename, file_id in file_id_dict.items():
        # A private scratch directory avoids reading from, or deleting, a
        # pre-existing folder that happens to share the archive's name.
        with tempfile.TemporaryDirectory() as work_dir:
            zip_path = os.path.join(work_dir, 'archive.zip')
            extract_dir = os.path.join(work_dir, 'extracted')

            # Download
            url = f'https://drive.google.com/uc?id={file_id}'
            gdown.download(url, zip_path, quiet=False)
            if not os.path.isfile(zip_path):
                raise FileNotFoundError(
                    f"Download of {zip_filename!r} (file id {file_id!r}) failed"
                )

            # Extract
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)

            # Read CSV(s); sorted so row order does not depend on the filesystem
            pattern = os.path.join(glob.escape(extract_dir), '**', '*.csv')
            for csv_file in sorted(glob.glob(pattern, recursive=True)):
                df = pd.read_csv(csv_file)
                df = df.drop(columns=['Unnamed: 0'], errors='ignore')
                # The archive name is what assign_label() later inspects
                df['label'] = zip_filename
                all_dfs.append(df)

    if not all_dfs:
        raise ValueError("No CSV files found in the downloaded archives")

    # Combine all data
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print("Combine completed!\n")

    return combined_df


In [3]:
def assign_label(df, fake_keyword="FAKE"):
    """
    Convert the 'label' column from names into binary class labels.

    A value becomes 0 when it contains fake_keyword (compared
    case-insensitively) and 1 otherwise. The column is overwritten on the
    DataFrame that was passed in, and that same object is returned.

    Params:
        df: pandas.DataFrame - must have a 'label' column holding strings
        fake_keyword: str - keyword to detect fake samples (default: "FAKE")

    Returns:
        The same df, with 'label' replaced by 0/1 values
    """
    def to_binary(name):
        # 0 = fake, 1 = everything else
        if fake_keyword.upper() in name.upper():
            return 0
        return 1

    df['label'] = df['label'].apply(to_binary)
    return df

In [None]:
def preprocess_text(df: pd.DataFrame, column_name: str) -> pd.Series:
    """
    Clean a text column by removing URLs, HTML, emojis, punctuation, and digits.

    Each non-null value is converted to str and then, in order:
      1. URLs (matched case-insensitively) and HTML tags are replaced by a space
      2. ASCII punctuation and digits are replaced by a space
      3. emojis in the ranges listed below are replaced by a space
      4. runs of whitespace are collapsed, the text is stripped and lowercased
    Null values are returned as NaN. The input DataFrame is not modified.

    Params:
        df: pandas.DataFrame - the input DataFrame
        column_name: str - name of the column to clean

    Returns:
        A new Series of cleaned text
    """
    # Case-insensitive so that "HTTP://..." and "WWW...." are removed too,
    # instead of surviving as stray "http" / "www" tokens after step 2.
    url_pattern = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
    html_pattern = re.compile(r'<[^>]+>')
    # Only these four ranges are treated as emoji; other symbols pass through.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols and pictographs
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "]+"
    )
    whitespace_pattern = re.compile(r"\s+")
    punct_digit_table = str.maketrans(
        {ch: " " for ch in string.punctuation + string.digits}
    )

    def clean(text: str) -> str:
        text = url_pattern.sub(" ", str(text))
        text = html_pattern.sub(" ", text)
        text = text.translate(punct_digit_table)
        text = emoji_pattern.sub(" ", text)
        text = whitespace_pattern.sub(" ", text).strip()
        return text.lower()

    return df[column_name].apply(lambda x: clean(x) if pd.notnull(x) else np.nan)


In [5]:
# Archive name -> Google Drive file id. The archive name ends up in the
# 'label' column and is what assign_label() checks for the "FAKE" keyword.
file_ids = {
    'misinfo_FAKE.zip': '1aA9o8PJ-9gYAcLaaxLmXUrxlrHtArlrh',
    'misinfo_TRUE.zip': '1BhaOQU5wYDL8IxOzgvZM-IlgYfJf3HFD',
}

# Fetch both archives and stack their CSV rows into a single frame
df = download_and_process_zip_files(file_ids)

# Turn archive names into 0/1 labels. assign_label() writes into `df` itself,
# so `df` and `df_labeled` are the same DataFrame from here on.
df_labeled = df.pipe(assign_label)

df_labeled.head(5)


Starting download and extraction...



Downloading...
From (original): https://drive.google.com/uc?id=1aA9o8PJ-9gYAcLaaxLmXUrxlrHtArlrh
From (redirected): https://drive.google.com/uc?id=1aA9o8PJ-9gYAcLaaxLmXUrxlrHtArlrh&confirm=t&uuid=217b06e2-ec64-4cfd-92a1-f4d872162513
To: /Users/hoaho/Study/HCMUS/HCMUS-projects/NLP-final-project/misinfo_FAKE.zip
100%|██████████| 45.1M/45.1M [00:01<00:00, 27.2MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1BhaOQU5wYDL8IxOzgvZM-IlgYfJf3HFD
From (redirected): https://drive.google.com/uc?id=1BhaOQU5wYDL8IxOzgvZM-IlgYfJf3HFD&confirm=t&uuid=790eb748-7fd6-409c-8fce-0ab7d7a1c083
To: /Users/hoaho/Study/HCMUS/HCMUS-projects/NLP-final-project/misinfo_TRUE.zip
100%|██████████| 43.0M/43.0M [00:01<00:00, 28.5MB/s]


Combine completed!



Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [7]:
# Build the processed frame: cleaned text plus label, nothing else.
# assign() works on a copy, so df_labeled itself is left unchanged.
df_processed = (
    df_labeled
    .assign(text_cleaned=preprocess_text(df_labeled, 'text'))
    .loc[:, ['text_cleaned', 'label']]
)
df_processed.head()

Unnamed: 0,text_cleaned,label
0,donald trump just couldn t wish all americans ...,0
1,house intelligence committee chairman devin nu...,0
2,on friday it was revealed that former milwauke...,0
3,on christmas day donald trump announced that h...,0
4,pope francis used his annual christmas day mes...,0


In [8]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78617 entries, 0 to 78616
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text_cleaned  78617 non-null  object
 1   label         78617 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.2+ MB
None


In [9]:
# Summary statistics of the numeric columns; left as the cell's last
# expression so Jupyter renders it as a table instead of plain text.
df_processed.describe()

              label
count  78617.000000
mean       0.444878
std        0.496955
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000


In [14]:
# Null count for every column of the processed frame
print("Missing values per column:", df_processed.isnull().sum(), sep="\n")

Missing values per column:
text_cleaned    0
label           0
dtype: int64


In [11]:
# Class balance: number of rows for each label value
print(f"Label distribution:\n{df_processed['label'].value_counts()}")

Label distribution:
label
0    43642
1    34975
Name: count, dtype: int64


In [12]:
csv_filename = "misinfo_processed.csv"
zip_filename = "misinfo_processed.zip"

# Save the processed frame (cleaned text + label) rather than `df`, which
# still holds the uncleaned 'text' column.
# pandas writes the CSV straight into the zip archive under `csv_filename`,
# so no temporary CSV is created on disk and none has to be removed afterwards.
df_processed.to_csv(
    zip_filename,
    index=False,
    compression={"method": "zip", "archive_name": csv_filename},
)

In [None]:
# Number of rows labelled 1 whose raw 'text' value is missing
print(df_labeled.loc[df_labeled['label'] == 1, 'text'].isna().sum())

29
