In [7]:
import json
import csv
import os
import pandas as pd

### JSON file to CSV (Interactions)

In [3]:
# Define file paths
json_file_path = "goodreads_interactions_children.json"
output_folder = "Chunks"
final_csv_path = "interactions_data_file.csv"

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# The input is read as JSON Lines (one JSON object per line). Records are
# buffered and flushed to numbered part files so the whole file never has to
# sit in memory, then the part files are merged into a single CSV.
chunk_size = 50000  # Number of records per chunk
chunk_count = 0
part_files = []  # Part files in the order they were written
fieldnames = None  # Column order, fixed by the first valid record
data_buffer = []

# NOTE(review): the csv module stringifies non-string values with str(), so
# any list/dict values end up as Python reprs rather than JSON - confirm that
# downstream readers expect this.
with open(json_file_path, "r", encoding="utf-8") as json_file:
    for line in json_file:
        line = line.strip()
        if not line:
            continue  # Blank lines carry no record

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            print(f"Skipping invalid line: {line}")
            continue

        if fieldnames is None:
            fieldnames = list(record.keys())
        data_buffer.append(record)

        # When buffer reaches chunk size, write to CSV
        if len(data_buffer) >= chunk_size:
            chunk_count += 1
            part_file = os.path.join(output_folder, f"part_{chunk_count}.csv")

            with open(part_file, "w", newline="", encoding="utf-8") as csv_file:
                # DictWriter places every value under its own column name.
                # Missing keys become empty cells; a key that is not in
                # `fieldnames` raises ValueError instead of silently shifting
                # columns.
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval="")
                writer.writeheader()
                writer.writerows(data_buffer)

            part_files.append(part_file)
            print(f"Saved chunk {chunk_count} to {part_file}")

            # Reached only when the write succeeded; on failure the exception
            # propagates and stops the cell.
            data_buffer.clear()

# Save remaining data in the buffer (last chunk)
if data_buffer:
    chunk_count += 1
    part_file = os.path.join(output_folder, f"part_{chunk_count}.csv")

    with open(part_file, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data_buffer)

    part_files.append(part_file)
    print(f"Saved final chunk {chunk_count} to {part_file}")

# Merge all CSV chunks into one final CSV file
with open(final_csv_path, "w", newline="", encoding="utf-8") as final_csv:
    writer = csv.writer(final_csv)
    header_written = False

    for part_file in part_files:
        # newline="" is required when handing a file to csv.reader; without it
        # universal-newline translation rewrites "\r\n" / "\r" that occur
        # inside quoted fields.
        with open(part_file, "r", newline="", encoding="utf-8") as part_csv:
            reader = csv.reader(part_csv)
            header = next(reader, None)
            if header is None:
                continue  # Empty part file: nothing to merge
            if not header_written:
                writer.writerow(header)  # Write header only once
                header_written = True
            writer.writerows(reader)

print(f"Final merged CSV saved to {final_csv_path}")

Saved chunk 1 to Chunks\part_1.csv
Saved chunk 2 to Chunks\part_2.csv
Saved chunk 3 to Chunks\part_3.csv
Saved chunk 4 to Chunks\part_4.csv
Saved chunk 5 to Chunks\part_5.csv
Saved chunk 6 to Chunks\part_6.csv
Saved chunk 7 to Chunks\part_7.csv
Saved chunk 8 to Chunks\part_8.csv
Saved chunk 9 to Chunks\part_9.csv
Saved chunk 10 to Chunks\part_10.csv
Saved chunk 11 to Chunks\part_11.csv
Saved chunk 12 to Chunks\part_12.csv
Saved chunk 13 to Chunks\part_13.csv
Saved chunk 14 to Chunks\part_14.csv
Saved chunk 15 to Chunks\part_15.csv
Saved chunk 16 to Chunks\part_16.csv
Saved chunk 17 to Chunks\part_17.csv
Saved chunk 18 to Chunks\part_18.csv
Saved chunk 19 to Chunks\part_19.csv
Saved chunk 20 to Chunks\part_20.csv
Saved chunk 21 to Chunks\part_21.csv
Saved chunk 22 to Chunks\part_22.csv
Saved chunk 23 to Chunks\part_23.csv
Saved chunk 24 to Chunks\part_24.csv
Saved chunk 25 to Chunks\part_25.csv
Saved chunk 26 to Chunks\part_26.csv
Saved chunk 27 to Chunks\part_27.csv
Saved chunk 28 to C

In [5]:
# Location of the merged interactions CSV; adjust if the file lives elsewhere
csv_file_path = "interactions_data_file.csv"

# Load the whole file into a DataFrame
df = pd.read_csv(csv_file_path)

# The first line is consumed as column names, so only data rows are counted
total_rows = df.shape[0]

print(f"Total number of rows: {total_rows}")

Total number of rows: 10059349


### JSON file to CSV (Reviews)

In [9]:
# Define file paths
json_file_path = "goodreads_reviews_children.json"
output_folder = "Chunks_Reviews"
final_csv_path = "reviews_data_file.csv"

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# The input is read as JSON Lines (one JSON object per line). Records are
# buffered and flushed to numbered part files so the whole file never has to
# sit in memory, then the part files are merged into a single CSV.
chunk_size = 50000  # Number of records per chunk
chunk_count = 0
part_files = []  # Part files in the order they were written
fieldnames = None  # Column order, fixed by the first valid record
data_buffer = []

# NOTE(review): the csv module stringifies non-string values with str(), so
# any list/dict values end up as Python reprs rather than JSON - confirm that
# downstream readers expect this.
with open(json_file_path, "r", encoding="utf-8") as json_file:
    for line in json_file:
        line = line.strip()
        if not line:
            continue  # Blank lines carry no record

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            print(f"Skipping invalid line: {line}")
            continue

        if fieldnames is None:
            fieldnames = list(record.keys())
        data_buffer.append(record)

        # When buffer reaches chunk size, write to CSV
        if len(data_buffer) >= chunk_size:
            chunk_count += 1
            part_file = os.path.join(output_folder, f"part_{chunk_count}.csv")

            with open(part_file, "w", newline="", encoding="utf-8") as csv_file:
                # DictWriter places every value under its own column name.
                # Missing keys become empty cells; a key that is not in
                # `fieldnames` raises ValueError instead of silently shifting
                # columns.
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval="")
                writer.writeheader()
                writer.writerows(data_buffer)

            part_files.append(part_file)
            print(f"Saved chunk {chunk_count} to {part_file}")

            # Reached only when the write succeeded; on failure the exception
            # propagates and stops the cell.
            data_buffer.clear()

# Save remaining data in the buffer (last chunk)
if data_buffer:
    chunk_count += 1
    part_file = os.path.join(output_folder, f"part_{chunk_count}.csv")

    with open(part_file, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data_buffer)

    part_files.append(part_file)
    print(f"Saved final chunk {chunk_count} to {part_file}")

# Merge all CSV chunks into one final CSV file
with open(final_csv_path, "w", newline="", encoding="utf-8") as final_csv:
    writer = csv.writer(final_csv)
    header_written = False

    for part_file in part_files:
        # newline="" is required when handing a file to csv.reader; without it
        # universal-newline translation rewrites "\r\n" / "\r" that occur
        # inside quoted fields.
        with open(part_file, "r", newline="", encoding="utf-8") as part_csv:
            reader = csv.reader(part_csv)
            header = next(reader, None)
            if header is None:
                continue  # Empty part file: nothing to merge
            if not header_written:
                writer.writerow(header)  # Write header only once
                header_written = True
            writer.writerows(reader)

print(f"Final merged CSV saved to {final_csv_path}")

Saved chunk 1 to Chunks_Reviews\part_1.csv
Saved chunk 2 to Chunks_Reviews\part_2.csv
Saved chunk 3 to Chunks_Reviews\part_3.csv
Saved chunk 4 to Chunks_Reviews\part_4.csv
Saved chunk 5 to Chunks_Reviews\part_5.csv
Saved chunk 6 to Chunks_Reviews\part_6.csv
Saved chunk 7 to Chunks_Reviews\part_7.csv
Saved chunk 8 to Chunks_Reviews\part_8.csv
Saved chunk 9 to Chunks_Reviews\part_9.csv
Saved chunk 10 to Chunks_Reviews\part_10.csv
Saved chunk 11 to Chunks_Reviews\part_11.csv
Saved chunk 12 to Chunks_Reviews\part_12.csv
Saved chunk 13 to Chunks_Reviews\part_13.csv
Saved chunk 14 to Chunks_Reviews\part_14.csv
Saved final chunk 15 to Chunks_Reviews\part_15.csv
Final merged CSV saved to reviews_data_file.csv


In [11]:
# Location of the merged reviews CSV; adjust if the file lives elsewhere
csv_file_path = "reviews_data_file.csv"

# Load the whole file into a DataFrame
df = pd.read_csv(csv_file_path)

# The first line is consumed as column names, so only data rows are counted
total_rows = df.shape[0]

print(f"Total number of rows: {total_rows}")

Total number of rows: 734640


### JSON file to CSV (Books)

In [15]:
# Define file paths
json_file_path = "goodreads_books_children.json"
output_folder = "Chunks_Books"
final_csv_path = "books_data_file.csv"

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# The input is read as JSON Lines (one JSON object per line). Records are
# buffered and flushed to numbered part files so the whole file never has to
# sit in memory, then the part files are merged into a single CSV.
chunk_size = 50000  # Number of records per chunk
chunk_count = 0
part_files = []  # Part files in the order they were written
fieldnames = None  # Column order, fixed by the first valid record
data_buffer = []

# NOTE(review): the csv module stringifies non-string values with str(), so
# any list/dict values end up as Python reprs rather than JSON - confirm that
# downstream readers expect this.
with open(json_file_path, "r", encoding="utf-8") as json_file:
    for line in json_file:
        line = line.strip()
        if not line:
            continue  # Blank lines carry no record

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            print(f"Skipping invalid line: {line}")
            continue

        if fieldnames is None:
            fieldnames = list(record.keys())
        data_buffer.append(record)

        # When buffer reaches chunk size, write to CSV
        if len(data_buffer) >= chunk_size:
            chunk_count += 1
            part_file = os.path.join(output_folder, f"part_{chunk_count}.csv")

            with open(part_file, "w", newline="", encoding="utf-8") as csv_file:
                # DictWriter places every value under its own column name.
                # Missing keys become empty cells; a key that is not in
                # `fieldnames` raises ValueError instead of silently shifting
                # columns.
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval="")
                writer.writeheader()
                writer.writerows(data_buffer)

            part_files.append(part_file)
            print(f"Saved chunk {chunk_count} to {part_file}")

            # Reached only when the write succeeded; on failure the exception
            # propagates and stops the cell.
            data_buffer.clear()

# Save remaining data in the buffer (last chunk)
if data_buffer:
    chunk_count += 1
    part_file = os.path.join(output_folder, f"part_{chunk_count}.csv")

    with open(part_file, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data_buffer)

    part_files.append(part_file)
    print(f"Saved final chunk {chunk_count} to {part_file}")

# Merge all CSV chunks into one final CSV file
with open(final_csv_path, "w", newline="", encoding="utf-8") as final_csv:
    writer = csv.writer(final_csv)
    header_written = False

    for part_file in part_files:
        # newline="" is required when handing a file to csv.reader; without it
        # universal-newline translation rewrites "\r\n" / "\r" that occur
        # inside quoted fields.
        with open(part_file, "r", newline="", encoding="utf-8") as part_csv:
            reader = csv.reader(part_csv)
            header = next(reader, None)
            if header is None:
                continue  # Empty part file: nothing to merge
            if not header_written:
                writer.writerow(header)  # Write header only once
                header_written = True
            writer.writerows(reader)

print(f"Final merged CSV saved to {final_csv_path}")

Saved chunk 1 to Chunks_Books\part_1.csv
Saved chunk 2 to Chunks_Books\part_2.csv
Saved final chunk 3 to Chunks_Books\part_3.csv
Final merged CSV saved to books_data_file.csv


In [17]:
# Location of the merged books CSV; adjust if the file lives elsewhere
csv_file_path = "books_data_file.csv"

# Load the whole file into a DataFrame
df = pd.read_csv(csv_file_path)

# The first line is consumed as column names, so only data rows are counted
total_rows = df.shape[0]

print(f"Total number of rows: {total_rows}")

Total number of rows: 124082
