In [2]:
# read and organize data
import pandas as pd
import pickle

# Define a function to read and process the file
def read_data(file_path):
    """
    Parse a tagged citation dump into a DataFrame with one row per paper.

    A record is a run of tagged lines ended by a blank line. Recognised tags:
    '#*' title, '#@' comma-separated authors, '#t' year, '#c' venue,
    '#index' numeric paper id, '#%' one cited paper id per line, '#!' abstract.

    file_path: Path to the UTF-8 encoded text file to parse.
    return: DataFrame with one column per tag seen plus a 'references' column
            holding the list of cited ids (empty list when nothing is cited).
            A tag absent from a record is left missing for that row; an empty
            '#t' value is stored as None.
    raises: ValueError if a non-empty '#t', '#index' or '#%' value is not an
            integer.
    """
    # Accumulators: finished records, the record being built, and its citations
    papers = []
    current_paper = {}
    references = []

    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the handle directly so the whole dump is never held in memory
        for line in file:
            if line.startswith('#*'):
                current_paper['title'] = line[2:].strip()
            elif line.startswith('#@'):
                # Trim every name and drop blanks so "A, B" and "A,B" produce
                # the same names and an empty '#@' line yields [] (not [''])
                names = (name.strip() for name in line[2:].split(','))
                current_paper['authors'] = [name for name in names if name]
            elif line.startswith('#t'):
                year_text = line[2:].strip()
                # An empty year tag is recorded as missing instead of raising
                current_paper['year'] = int(year_text) if year_text else None
            elif line.startswith('#c'):
                current_paper['venue'] = line[2:].strip()
            elif line.startswith('#index'):
                current_paper['index_id'] = int(line[6:].strip())
            elif line.startswith('#%'):
                ref_text = line[2:].strip()
                # A bare '#%' line carries no id; skip it rather than int('')
                if ref_text:
                    references.append(int(ref_text))
            elif line.startswith('#!'):
                current_paper['abstract'] = line[2:].strip()
            elif line.strip() == '' and current_paper:
                # Blank line closes the record currently being built
                current_paper['references'] = references
                papers.append(current_paper)
                current_paper = {}
                references = []

    # Handle the last paper when the file does not end with a blank line
    if current_paper:
        current_paper['references'] = references
        papers.append(current_paper)

    return pd.DataFrame(papers)

def save_to_file(df, file_path):
    """
    Write a DataFrame to disk as CSV without the row index.

    df: DataFrame to export.
    file_path: Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=file_path, index=False)


    
# Location of the raw dump and of the CSV export derived from it
input_file_path = 'raw_data.txt'
output_file_path = 'data.csv'

# Parse the dump once, then persist the frame both as CSV and as a pickle
df = read_data(input_file_path)
save_to_file(df, output_file_path)
with open('data.pkl', 'wb') as pickle_handle:
    pickle.dump(df, pickle_handle)


## Construct Network

In [4]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import ast


# Create a directed graph: one node per paper, one edge per citation
G = nx.DiGraph()

# Walk the three needed columns in lockstep. This avoids building a row Series
# per paper with df.loc[row] and does not require df to carry 0..n-1 labels.
for paper_id, title, cited_ids in zip(df['index_id'], df['title'], df['references']):
    G.add_node(paper_id, title=title)  # Add paper as node
    # If the column holds the list's string form (e.g. after a CSV round
    # trip), convert it back to a list before iterating over it
    if isinstance(cited_ids, str):
        cited_ids = ast.literal_eval(cited_ids)
    for cited in cited_ids:
        G.add_edge(paper_id, cited)  # Add citation edge: citing -> cited



In [6]:
import os

# Make sure the output folder exists; without it both writers below fail
# when they try to open their target file
os.makedirs('Cleaned Data and Graph', exist_ok=True)

# Save the graph in GraphML format
nx.write_graphml(G, 'Cleaned Data and Graph/citation_network.graphml')

# Save the graph in GEXF format
nx.write_gexf(G, 'Cleaned Data and Graph/citation_network.gexf')


In [7]:
import itertools
import pandas as pd
import ast


# Turn string-encoded author lists (e.g. "['A', 'B']") back into real lists.
# Each value is checked on its own: an empty frame no longer trips .iloc[0],
# and non-string entries (already lists, or missing) are left untouched.
df['authors'] = df['authors'].apply(
    lambda names: ast.literal_eval(names) if isinstance(names, str) else names
)


def build_author_network(df):
    """
    Build a network of authors, where two authors are connected if they co-authored a paper.

    Each edge carries a 'weight' attribute counting the papers the two authors
    share. Authors are only added through edges, so an author with no
    co-author on any paper does not appear in the graph.

    df: DataFrame with an 'authors' column; each entry is an iterable of
        author names. Missing entries (None / NaN) are skipped.
    return: A NetworkX graph representing the author network.
    """
    G = nx.Graph()

    for authors in df['authors']:
        # A paper without an author list shows up as None/NaN; there is
        # nothing to pair, and combinations() would raise on a float
        if authors is None or isinstance(authors, float):
            continue

        # Drop blank names and repeats (keeping first-seen order) so one paper
        # cannot create a self-loop or count the same pair more than once
        unique_authors = list(dict.fromkeys(name for name in authors if name))

        # Create edges between all pairs of co-authors
        for author1, author2 in itertools.combinations(unique_authors, 2):
            if G.has_edge(author1, author2):
                # Increase weight if authors have co-authored multiple papers
                G[author1][author2]['weight'] += 1
            else:
                # Add new edge with weight 1
                G.add_edge(author1, author2, weight=1)

    return G

import os

# Build the co-authorship graph from the parsed papers
author_network = build_author_network(df)

# Make sure the output folder exists; without it both writers below fail
# when they try to open their target file
os.makedirs('Cleaned Data and Graph', exist_ok=True)

# Save the graph in GraphML format
nx.write_graphml(author_network, 'Cleaned Data and Graph/author_network.graphml')

# Save the graph in GEXF format
nx.write_gexf(author_network, 'Cleaned Data and Graph/author_network.gexf')
