# Processing Files as Parquet Files

In [None]:
import os
import pandas as pd
import gzip

# Stream a gzip-compressed JSON Lines file as a sequence of DataFrames
def read_large_json(file_path, chunksize=10000):
    """Yield DataFrames parsed from a gzip-compressed JSON Lines file.

    Args:
        file_path: Path to a gzip-compressed file holding one JSON record
            per line.
        chunksize: Maximum number of lines parsed into each DataFrame.

    Yields:
        pandas.DataFrame: Successive chunks of the file, in file order.
    """
    with gzip.open(file_path, "rb") as compressed:
        # Delegate inside the ``with`` block: the handle has to stay open
        # for as long as the caller keeps pulling chunks.
        yield from pd.read_json(compressed, lines=True, chunksize=chunksize)

# Convert every gzip-compressed JSON Lines archive in ``folder_path`` into a
# Parquet file in ``parquet_folder``, skipping archives already converted.
folder_path = "D:/GhArchive/January_2023/files"
parquet_folder = "D:/GhArchive/January_2023/parquet_files"

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(parquet_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that the processing
# order (and therefore the printed log) is deterministic.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".json.gz"):
        continue

    file_path = os.path.join(folder_path, file_name)

    # NOTE: splitext strips only ".gz", so outputs are named
    # "<name>.json.parquet". Kept as-is so that files written by earlier runs
    # are still recognised by the skip check below.
    parquet_file_path = os.path.join(
        parquet_folder, f"{os.path.splitext(file_name)[0]}.parquet"
    )

    # Check if the corresponding parquet file already exists
    if os.path.exists(parquet_file_path):
        print(f"Skipping file {file_name} as it has already been processed.")
        continue

    print(f"Processing file: {file_name}")

    # Collect the chunks and concatenate once: calling pd.concat inside the
    # loop re-copies every accumulated row for each new chunk.
    chunks = list(read_large_json(file_path))
    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when no chunks were produced.
    data = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

    # Write to a temporary name and rename into place, so an interrupted run
    # never leaves a partial file that the skip check would treat as done.
    temp_parquet_path = f"{parquet_file_path}.tmp"
    data.to_parquet(temp_parquet_path)
    os.replace(temp_parquet_path, parquet_file_path)

Skipping file gharchive-2023-01-01-00.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-01.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-02.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-03.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-04.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-05.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-06.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-07.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-08.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-09.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-10.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-11.json.gz as it has already been processed.
Skipping file gharchive-2023-01-01-12.js

#### To save disk space, we convert each .gz file directly into a Parquet file, without first extracting it to a .json file