In [1]:
import os
import gzip
import subprocess
import io
import shutil
import threading
from queue import Queue
from pathlib import Path
import re

In [2]:
# Function for parsing and reading chunks of the .WET data
def read_warc_record(reader):
    """Read a single WARC/WET record from a binary stream.

    The stream must be positioned at the start of a record, i.e. on the
    ``WARC/1.0`` version line. After the call the stream is positioned
    right after the record body; the blank lines that separate records are
    NOT consumed and must be skipped by the caller.

    Args:
        reader: Binary file-like object providing ``readline()`` and
            ``read(n)``.

    Returns:
        dict: ``{"header": str, "body": str}`` where ``header`` is the
        concatenation of the decoded header lines (version line excluded)
        and ``body`` is the UTF-8 decoded payload. A record whose
        Content-Length is 0 has ``body == b""``. When the stream is already
        exhausted, the empty record ``{"header": "", "body": b""}`` is
        returned so callers can use it as an end-of-stream sentinel.
        A body cut short by the end of the stream is returned as-is.

    Raises:
        ValueError: If the version line is not ``WARC/1.0``, if the header
            is cut off by the end of the stream, or if Content-Length is
            missing, duplicated, negative or not an integer.
            (``UnicodeDecodeError`` from invalid UTF-8 is a ``ValueError``
            subclass as well.)
    """
    version_line = reader.readline()
    if not version_line:
        # Clean end of stream: report it with the empty sentinel record
        # instead of a misleading "wrong version" error.
        return {"header": "", "body": b""}

    first_line = version_line.decode("utf-8")
    if first_line != "WARC/1.0\r\n":
        raise ValueError(f"warc version expected 'WARC/1.0' found {first_line}")

    warc_header_builder = []
    content_length = None  # None means "no Content-Length seen yet"

    while True:
        raw_line = reader.readline()
        if not raw_line:
            # readline() returns b"" forever at EOF; without this check a
            # truncated header would loop indefinitely.
            raise ValueError("unexpected end of file inside a WARC header")
        if raw_line == b"\r\n":
            # A bare CRLF terminates the header block.
            break

        line = raw_line.decode("utf-8")

        if line.lower().startswith("content-length:"):
            if content_length is not None:
                raise ValueError("exactly one content-length should be present in a WARC header")
            _, value = line.split(":", 1)
            content_length = int(value.strip())
            if content_length < 0:
                raise ValueError(f"content-length must not be negative, found {content_length}")

        warc_header_builder.append(line)

    if content_length is None:
        # Without a length, read(-1) would swallow the rest of the file.
        raise ValueError("no content-length found in WARC header")

    header = "".join(warc_header_builder)
    if content_length == 0:
        return {"header": header, "body": b""}

    # Content-Length counts bytes, so read bytes first and decode afterwards.
    body = reader.read(content_length).decode("utf-8")
    return {"header": header, "body": body}

In [3]:
# use read_warc_record to iterate through file and write files based on language
def classify_files(path, output_dir="./clean_data"):
    """Split the records of one WET file into per-language text files.

    Each record is read with ``read_warc_record``. When its header carries a
    ``WARC-Identified-Content-Language`` field, the record body is appended
    to ``<output_dir>/<language>.txt``, where ``<language>`` is the first
    code listed in that field. Records without the field, or with an empty
    body, are skipped.

    Args:
        path: Path of the uncompressed WET file to read.
        output_dir: Directory receiving the per-language files; created if
            it does not exist. Defaults to ``./clean_data``.

    Returns:
        None. Prints ``End of file`` once the whole file has been read. If a
        record is malformed, a message naming the file and the error is
        printed instead and the rest of that file is skipped.
    """
    language_pattern = re.compile(r'WARC-Identified-Content-Language:\s*(\w+)')
    end_of_stream = {"header": "", "body": b""}

    os.makedirs(output_dir, exist_ok=True)

    # open(..., "rb") already returns a buffered reader, so no extra
    # io.BufferedReader wrapper is needed.
    with open(path, "rb") as infile:
        try:
            while True:
                # Detect end of file explicitly so that a ValueError always
                # means "malformed record" and never "finished".
                if not infile.peek(1):
                    break

                record = read_warc_record(infile)
                if record == end_of_stream:
                    break

                match = language_pattern.search(record['header'])
                body = record['body']

                # Empty bodies are skipped: a zero-length body is b"" and
                # would otherwise be written as the literal text "b''".
                if match and body:
                    identified_language = match.group(1)
                    out_path = os.path.join(output_dir, f'{identified_language}.txt')
                    # Bodies were decoded as UTF-8, so write them as UTF-8
                    # regardless of the platform's default encoding.
                    with open(out_path, 'a', encoding='utf-8') as file:
                        file.write(body)

                # Skip the two blank lines that separate consecutive records.
                infile.readline()
                infile.readline()
        except ValueError as e:
            print(f"Stopped reading {path}: {e}")
            return

    print("End of file")

In [4]:
# Remove duplicate lines within the new text files we grouped by languages
def remove_duplicates_inplace(file_path):
    """Rewrite ``file_path`` so that every line appears only once.

    Lines are compared after stripping leading and trailing whitespace, and
    the stripped form is what gets written back. The first occurrence of
    each line keeps its position, so the result is deterministic. Every
    written line ends with a newline, so text appended to the file later
    starts on a fresh line instead of being glued to the last one.

    Args:
        file_path: Path of the UTF-8 text file to de-duplicate in place.

    Returns:
        None. The file is overwritten.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order; iterating
    # the file directly avoids building an intermediate list of all lines.
    with open(file_path, 'r', encoding='utf-8') as file:
        unique_lines = dict.fromkeys(line.strip() for line in file)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f'{line}\n' for line in unique_lines)

In [5]:
import os

def get_filenames(folder_path):
    """Return the names of the regular files located directly in ``folder_path``.

    Only bare names are returned (not full paths); sub-directories are left
    out and the order is whatever ``os.listdir`` yields.
    """
    return [
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    ]

In [6]:
import os

def get_folder_size(folder_path):
    """Return the combined size in bytes of every file below ``folder_path``.

    The directory tree is walked recursively with ``os.walk`` and the size
    of each file found is added up; an empty tree gives 0.
    """
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(folder_path)
        for name in names
    )

def format_size(size):
    """Format a byte count as a human-readable string such as ``"1.50 KB"``.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit is reached, then rendered with two decimals.

    Args:
        size: Number of bytes (int or float).

    Returns:
        str: ``"<value> <unit>"`` with unit one of B, KB, MB, GB, TB, PB.
        Values of 1024 PB or more are still expressed in PB, so a string is
        always returned.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    for unit in units[:-1]:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # Largest unit: never fall off the end and return None.
    return f"{size:.2f} {units[-1]}"

In [16]:
# Classify every regular file found in ./data/ into per-language text files.
filenames = get_filenames('./data/')

for i in filenames: 
    classify_files(f'./data/{i}')
    # After each input file, de-duplicate every file currently in ./clean_data/.
    clean_data_dir = get_filenames('./clean_data/')
    for filename in clean_data_dir:
        remove_duplicates_inplace(f'./clean_data/{filename}')

End of file


In [13]:
# Report the combined on-disk size of all files under ./clean_data/.
folder_path = './clean_data/'  # Replace with your folder path
folder_size = get_folder_size(folder_path)
print(f'The total size of the folder "{folder_path}" is: {format_size(folder_size)}')

The total size of the folder "./clean_data/" is: 528.38 MB


In [14]:
# De-duplicate the lines of every file in ./clean_data/, rewriting each file in place.
filenames = get_filenames('./clean_data/')
for filename in filenames:
    remove_duplicates_inplace(f'./clean_data/{filename}')

In [15]:
# Measure ./clean_data/ again; this cell repeats the earlier size report so the two figures can be compared.
folder_path = './clean_data/'  # Replace with your folder path
folder_size = get_folder_size(folder_path)
print(f'The total size of the folder "{folder_path}" is: {format_size(folder_size)}')

The total size of the folder "./clean_data/" is: 228.15 MB
