# Imports

In [9]:
import json
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import csv

# Load the JSON files

In [10]:
# Load the paper metadata and the paper -> repository links.
# JSON files are UTF-8 by specification; passing the encoding explicitly keeps the
# load independent of the platform's default locale encoding (the output file
# written at the end of this notebook is also opened as UTF-8).
with open('data/filtered-papers-with-abstracts.json', 'r', encoding='utf-8') as f:
    papers_data = json.load(f)

with open('data/pwc/links-between-papers-and-code.json', 'r', encoding='utf-8') as f:
    links_data = json.load(f)

In [11]:
# Display the first loaded record to inspect the fields of a paper entry
papers_data[0]

{'paper_url': 'https://paperswithcode.com/paper/dynamic-network-model-from-partial',
 'arxiv_id': '1805.10616',
 'title': 'Dynamic Network Model from Partial Observations',
 'abstract': 'Can evolving networks be inferred and modeled without directly observing\ntheir nodes and edges? In many applications, the edges of a dynamic network\nmight not be observed, but one can observe the dynamics of stochastic cascading\nprocesses (e.g., information diffusion, virus propagation) occurring over the\nunobserved network. While there have been efforts to infer networks based on\nsuch data, providing a generative probabilistic model that is able to identify\nthe underlying time-varying network remains an open question. Here we consider\nthe problem of inferring generative dynamic network models based on network\ncascade diffusion data. We propose a novel framework for providing a\nnon-parametric dynamic network model--based on a mixture of coupled\nhierarchical Dirichlet processes-- based on data

# Function to get README from GitHub

In [12]:
def fetch_raw_readme(repo_url, timeout=10):
    """Fetch the raw README text of a GitHub repository.

    For each known README filename spelling, the "main" branch is tried first and
    then the "master" branch; the first file served with HTTP 200 is returned.

    Args:
        repo_url: Repository location such as "https://github.com/owner/repo" or
            plain "owner/repo". Surrounding whitespace, a trailing slash, a
            ".git" suffix, extra path segments after the repository name and
            the "http://" / "www." URL forms are tolerated.
        timeout: Seconds to wait on each HTTP request before it is abandoned.

    Returns:
        The README contents as a string, or None when no README could be found
        or an error occurred (a message is printed in both cases).
    """
    # Possible README filenames with different capitalizations
    possible_readme_files = [
        "README.md", "Readme.md", "readme.md",
        "README.MD", "ReadMe.md", "readMe.md",
        "README", "Readme", "readme"
    ]
    # Branch names to probe, in order of preference
    branches = ("main", "master")
    github_prefixes = (
        "https://github.com/", "http://github.com/",
        "https://www.github.com/", "http://www.github.com/",
    )

    try:
        # Reduce the URL to "owner/repo", which is what the raw-file host expects
        repo_path = repo_url.strip()
        for prefix in github_prefixes:
            if repo_path.startswith(prefix):
                repo_path = repo_path[len(prefix):]
                break

        segments = [part for part in repo_path.split("/") if part]
        # A leftover scheme means the URL is not a GitHub one; fewer than two
        # segments means there is no owner/repo pair. Neither can resolve, so
        # skip the network round-trips entirely.
        if "://" in repo_path or len(segments) < 2:
            print(f"README not found for {repo_url}")
            return None

        owner, repo = segments[0], segments[1]
        if repo.endswith(".git"):
            repo = repo[:-len(".git")]
        repo_name = f"{owner}/{repo}"

        for readme_file in possible_readme_files:
            for branch in branches:
                raw_readme_url = (
                    f"https://raw.githubusercontent.com/{repo_name}/{branch}/{readme_file}"
                )
                # Without a timeout a single stalled connection blocks forever
                response = requests.get(raw_readme_url, timeout=timeout)

                # Check for successful response
                if response.status_code == 200:
                    return response.text

        print(f"README not found for {repo_url}")
        return None

    except Exception as e:
        # Deliberately best-effort: one failing repository (network error,
        # malformed input) must not abort a batch run over many papers.
        print(f"Error fetching README from {repo_url}: {e}")
        return None

# Merge data

In [13]:
# Accumulator for the merged per-paper records
merged_data = []

# Lookup table from paper URL to repository URL, built from the links file.
# If the same paper URL appears more than once, the last repo_url seen is kept.
github_links_dict = dict(
    (entry['paper_url'], entry['repo_url'])
    for entry in links_data
)

In [15]:
# Process each paper in the papers_with_abstracts.json file

# Placeholder stored in 'github_repo' when a paper has no entry in the links file
NO_GITHUB_LINK = 'No GitHub link available'


def _first_main_collection_area(methods):
    """Return the 'area' of the first method whose main_collection has one, else None."""
    for method in methods or []:
        if not isinstance(method, dict):
            continue
        main_collection = method.get('main_collection')
        if main_collection and 'area' in main_collection:
            return main_collection['area']
    return None


# Start from an empty list so that re-running this cell rebuilds the records
# instead of appending a second copy of every paper.
merged_data = []

for paper in papers_data:
    # Look up the repository by paper URL; papers without a link get the placeholder
    github_link = github_links_dict.get(paper.get('paper_url'), NO_GITHUB_LINK)

    # README scraping is currently disabled. To add a 'github_readme_content'
    # field, call fetch_raw_readme(github_link) for papers that have a link and
    # store 'README not available' when it returns None. Note that
    # fetch_raw_readme can issue up to 18 HTTP requests per repository.

    # Add the merged data to the list
    merged_data.append({
        'paper_title': paper.get('title', ''),
        'abstract': paper.get('abstract', ''),
        # Tolerates a missing or null 'methods' field (area is then None)
        'main_collection_area': _first_main_collection_area(paper.get('methods')),
        'github_repo': github_link,
    })

# Save the data

In [18]:
# Destination for the merged records
json_filename = 'paper_title_abstract.json'

# Tabular view of the merged records, then back to a list of per-row dicts
df = pd.DataFrame(merged_data)
data_dict = df.to_dict(orient='records')

# Serialize first, then write the text out as UTF-8 in a single call
serialized_records = json.dumps(data_dict, indent=4)
with open(json_filename, mode='w', encoding='utf-8') as out_file:
    out_file.write(serialized_records)