In [55]:
!pip install pandas
!pip install tiktoken

In [None]:
# Filter out the number of papers

import json

input_file_path = 'peersum_all.json'
output_file_path = 'output_file.json'

def extract_and_write_first_5001_rows(input_file_path, output_file_path):
    extracted_data = []
    with open(input_file_path, 'r') as input_file:
        for line in input_file:
            try:
                json_data = json.loads(line)
                extracted_data.append(json_data)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON: {line.strip()}")
    
    first_5001_rows = extracted_data[:1700]
    
    with open(output_file_path, 'w') as output_file:
        json.dump(first_5001_rows, output_file, indent=4)

extract_and_write_first_5001_rows(input_file_path, output_file_path)

print(f"The first 5001 rows have been extracted and written to '{output_file_path}'.")

In [35]:
# Converting to CSV for better readability
import json
import csv

with open('output_file.json', 'r') as json_file:
    data = json.load(json_file)

filtered_reviews = []

for item in data:
    paper_id = item['paper_id']
    paper_title = item['paper_title']
    paper_acceptance = item['paper_acceptance']
    reviews = item['reviews']
    
    
    for review in reviews:
        if review['writer'] == 'official_reviewer':
            review_id = review['review_id']
            comment = review['comment'].replace('\n', ' ')
            rating = review['rating']
            confidence = review['confidence']
            writer = review['writer']
            
            
            
            filtered_reviews.append([paper_id, paper_title, paper_acceptance, review_id, comment, rating, confidence, writer])

with open('filtered_reviews.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file, escapechar='\\')
    
    csv_writer.writerow(['paper_id', 'paper_title', 'paper_acceptance', 'review_id', 'comment', 'rating', 'confidence', 'writer'])
    
    csv_writer.writerows(filtered_reviews)

print("CSV file has been created successfully.")

CSV file has been created successfully.


In [37]:
# Filtering out the non official comments

import pandas as pd

df = pd.read_csv('filtered_reviews.csv')

filtered_df = df[~((df['confidence'] == 1) & (df['rating'] == 1)) &
                 ~((df['confidence'] == 1) & (df['rating'] == -1)) &
                 ~((df['confidence'] == -1) & (df['rating'] == 1)) &
                 ~((df['confidence'] == -1) & (df['rating'] == -1))]

filtered_df.to_csv('final_reviews.csv', index=False)

print("Filtered data has been saved to 'final_reviews.csv'")

Filtered data has been saved to 'final_reviews.csv'


In [38]:
#meta reviews generation

import json
import csv

with open('output_file.json', 'r') as json_file:
    data = json.load(json_file)

csv_file_name = 'meta_reviews.csv'

with open(csv_file_name, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    csv_writer.writerow(['paper_id', 'meta_review'])

    for row in data:
        paper_id = row.get('paper_id', '')
        meta_review = row.get('meta_review', '')
        csv_writer.writerow([paper_id, meta_review])

print(f'CSV file "{csv_file_name}" created successfully.')

CSV file "meta_reviews.csv" created successfully.


In [39]:
#converting it back to Json
import pandas as pd
import json

filtered_reviews = pd.read_csv('filtered_reviews.csv')
meta_reviews = pd.read_csv('meta_reviews.csv')

merged_data = pd.merge(filtered_reviews, meta_reviews, on='paper_id')

json_data = {}

for index, row in merged_data.iterrows():
    paper_id = row['paper_id']
    if paper_id not in json_data:
        json_data[paper_id] = {
            'paper_title': row['paper_title'],
            'paper_acceptance': row['paper_acceptance'],
            'meta_review': row['meta_review'],
            'reviews': []
        }
    
    if row['rating'] != -1 and row['confidence'] != -1:
        review = {
            'review_id': row['review_id'],
            'comment': row['comment'],
            'rating': row['rating'],
            'confidence': row['confidence'],
            'writer': row['writer']
        }
        json_data[paper_id]['reviews'].append(review)

json_output = json.dumps(list(json_data.values()), indent=2)

with open('output.json', 'w') as json_file:
    json_file.write(json_output)

In [51]:
#token count
import tiktoken
import pandas as pd
import json

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string using GPT-4 tokenization."""
    num_tokens = len(enc.encode(string))
    return num_tokens

def num_tokens_from_json(json_file_path: str) -> int:
    """Returns the total number of tokens in 'meta_review' and 'comment' fields of a JSON file."""
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    total_tokens = 0

    for entry in data:
        meta_review_tokens = num_tokens_from_string(entry['meta_review'])
        total_tokens += meta_review_tokens

        for review in entry['reviews']:
            comment_tokens = num_tokens_from_string(review['comment'])
            total_tokens += comment_tokens

    return total_tokens

json_file_path = 'output.json'

total_tokens_in_json = num_tokens_from_json(json_file_path)
print(f"Total tokens in the JSON file: {total_tokens_in_json}")

Total tokens in the 'comment' column: 2872196


In [53]:
import json

with open('output.json', 'r') as json_file:
    data = json.load(json_file)

total_reviews = 0

for paper in data:
    total_reviews += len(paper['reviews'])

print(f'Total number of reviews based on review_id: {total_reviews}')

Total number of reviews based on review_id: 5166
