In [17]:
import pandas as pd
from tqdm import tqdm
from time import sleep
import wget
import os
import gc
import gzip

In [18]:
PATH = 'https://data.commoncrawl.org/'
LANG = "hu"
CC_LANG = "hun"
INDEX_FILENAME = "hungarian.csv"

In [19]:
urls = pd.read_csv('cc-index-table.paths',header=None)

In [20]:
def wget_file(path, current_file=None):
    try:
        if current_file is not None:
            wget.download(path, out=current_file)
        else:
            wget.download(path)
        return True
    except:
        print('Error downloading', path)
        return False

In [21]:
for i in tqdm(range(2)):
    current_file = 'current_index.gz.parquet'
    filename = urls[0][i]
    url = PATH + filename
    if wget_file(url, current_file):
        print(f"Downloaded {filename}")
        df = []
        gc.collect()
        df = pd.read_parquet(current_file)  
        df = df[df['content_languages'].str.startswith('hu', na=False)]
        if (i == 0):
            df.to_csv('hungarian.csv', header=True, index=False)
        else:
            df.to_csv('hungarian.csv', mode='a', header=False, index=False)
        print(f"Saved {len(df)} rows to file")
        os.remove(current_file)
        print("Removed current file")

  0%|          | 0/2 [00:00<?, ?it/s]

Downloaded cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-49/subset=warc/part-00000-a0906200-461b-4808-9b94-6c53daf73f61.c000.gz.parquet


 50%|█████     | 1/2 [01:59<01:59, 119.03s/it]

Saved 77895 rows to file
Removed current file
Downloaded cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-49/subset=warc/part-00001-a0906200-461b-4808-9b94-6c53daf73f61.c000.gz.parquet


100%|██████████| 2/2 [04:12<00:00, 126.03s/it]

Saved 77895 rows to file
Removed current file





In [22]:
hungarian_index = pd.read_csv('hungarian.csv', low_memory=False)
hungarian_sorted = hungarian_index.groupby(['warc_filename'])['url_host_name'].count().reset_index(name='count').sort_values(['count'], ascending=False)
hungarian_sorted.head(5)

Unnamed: 0,warc_filename,count
27004,crawl-data/CC-MAIN-2022-49/segments/1669446710...,14
17787,crawl-data/CC-MAIN-2022-49/segments/1669446710...,14
27487,crawl-data/CC-MAIN-2022-49/segments/1669446710...,14
8759,crawl-data/CC-MAIN-2022-49/segments/1669446710...,14
9302,crawl-data/CC-MAIN-2022-49/segments/1669446710...,14


In [40]:
def warc_2_wet(base, filename):
    filename = filename.replace('/warc/', '/wet/')
    filename = filename.replace('.warc.gz', '.warc.wet.gz')
    return base + filename

def get_hungarian_from_wet(filecontent, db, target_lang=CC_LANG):
    idx = 0
    contents = []
    can_save = True
    while idx < len(filecontent):
        if filecontent[idx].startswith("WARC-Target-URI:"):
            current_url = filecontent[idx].split(" ")[-1]
            if current_url in db:
                can_save = False
            else:
                can_save = True
                db.add(current_url)
        if can_save and filecontent[idx].startswith("WARC-Identified-Content-Language:") and target_lang in filecontent[idx]:
            # print(filecontent[idx+1])
            if "Content-Type: text/plain" in filecontent[idx+1]:
                # print(filecontent[idx])
                idx = idx + 3
                while not (filecontent[idx] == "" and filecontent[idx+1] == "" ) :
                    if len(filecontent[idx]) > 60:
                        contents.append(filecontent[idx])
                    idx +=1
        idx += 1

    return contents, db

temp_file_name = "current_wet_tmp.warc.wet.gz"
text_buffer = []
current_count = 0
total_count = 0
file_index = 0
db = set()
        
for i in tqdm(range(2)):
    filename = hungarian_sorted["warc_filename"][i]
    url = warc_2_wet(PATH, filename)
    print(f"Downloading {url}")
    if wget_file(url, temp_file_name):
        print(f"Downloaded {filename}")
        with gzip.open(temp_file_name, 'rb') as f:
            current_file_content = f.read().decode('utf-8').splitlines()
        print(f"Read {len(current_file_content)} lines")
        hungarian_text, db = get_hungarian_from_wet(current_file_content, db,)
        print(f"Found {len(hungarian_text)} lines")

        text_buffer.extend(hungarian_text)
        total_count += len(hungarian_text)
        current_count += len(hungarian_text)
        if current_count > 10000:
            text_buffer = list(set(text_buffer))
            # write to file
            with open(f"hungarian_text_{file_index}.txt", "w", encoding="utf-8") as f:
                f.write("\n".join(text_buffer))
            file_index += 1
            current_count = 0
            text_buffer = []
            print(f"Saved in total: {total_count} lines to files")
        
        gc.collect()
        # os.remove(temp_file_name)
        print(f"Removed {temp_file_name}")

  0%|          | 0/2 [00:00<?, ?it/s]

Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-49/segments/1669446706285.92/wet/CC-MAIN-20221126080725-20221126110725-00000.warc.wet.gz
Downloaded crawl-data/CC-MAIN-2022-49/segments/1669446706285.92/warc/CC-MAIN-20221126080725-20221126110725-00000.warc.gz
Read 6977626 lines
Found 15584 lines
Saved in total: 15584 lines to files


 50%|█████     | 1/2 [00:05<00:05,  5.53s/it]

Removed current_wet_tmp.warc.wet.gz
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-49/segments/1669446706285.92/wet/CC-MAIN-20221126080725-20221126110725-00001.warc.wet.gz
Downloaded crawl-data/CC-MAIN-2022-49/segments/1669446706285.92/warc/CC-MAIN-20221126080725-20221126110725-00001.warc.gz
Read 6977626 lines


100%|██████████| 2/2 [00:10<00:00,  5.25s/it]

Found 1484 lines
Removed current_wet_tmp.warc.wet.gz





In [41]:
len(hungarian_sorted)

50872