
# Cora citation dataset exploratory data analysis
In this document, we explore the structure and content of the dataset and attempt to formulate a way to best exploit the data for a classification task.

## Download and extract Cora dataset

In [None]:
import requests
import tarfile
import os
import pandas as pd

# Step 1: Download the .tgz file
url = 'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz'  # Source archive for the Cora dataset
output_path = 'cora.tgz'

# Download the file. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() prevents an HTTP error page from
# being saved under the archive name (which would only fail later in tarfile).
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(output_path, 'wb') as f:
    f.write(response.content)
print(f"File downloaded: {output_path}")

# Step 2: Extract the .tgz file
extract_folder = 'data'
os.makedirs(extract_folder, exist_ok=True)

with tarfile.open(output_path, 'r:gz') as tar_ref:
    # The archive was fetched over the network, so use the 'data' extraction
    # filter (rejects members that would land outside extract_folder) when the
    # running Python provides it; older interpreters fall back to plain
    # extraction.
    if hasattr(tarfile, 'data_filter'):
        tar_ref.extractall(extract_folder, filter='data')
    else:
        tar_ref.extractall(extract_folder)
print(f"File extracted to: {extract_folder}")
# The downloaded archive is no longer needed once its contents are on disk.
os.remove(output_path)

## Loading and sampling the citation CSV

In [None]:
import networkx as nx
# Directory holding the extracted Cora files, relative to the current working directory.
data_dir = f"{os.getcwd()}/{extract_folder}/cora"

# Read the tab-separated citation pairs (no header row in the file) and tag
# every row with a constant edge label in the same expression.
cites_path = os.path.join(data_dir, "cora.cites")
edgelist = pd.read_csv(
    cites_path,
    sep='\t',
    header=None,
    names=["target", "source"],
).assign(label="cites")

print("Citations between source and target articles")
# Shuffle all rows, then show the first five as the cell output.
shuffled_edges = edgelist.sample(frac=1)
shuffled_edges.head(5)


## Loading and sampling the word content CSV 

In [None]:
# Read the tab-separated word-content table; the file has no header row.
node_data = pd.read_csv(os.path.join(data_dir, "cora.content"), sep='\t', header=None)

# Name the columns: the first is "index", the last is "subject", and every
# column in between is a word feature. The feature names are derived from the
# file's actual width (instead of a hard-coded count) and reused for the column
# labels, so feature_names can never disagree with the DataFrame.
feature_names = ["w_{}".format(ii) for ii in range(node_data.shape[1] - 2)]
node_data.columns = ["index"] + feature_names + ["subject"]
node_data = node_data.set_index("index")

print("""Table showing presence/absence(1/0) of a given word in the article.
For example, w_0 corresponds to the index of a specific technical term.
Finally, the subject of the paper represented by the row is available in the final column""")
node_data.head(5)