In [55]:
import pandas as pd
import networkx as nx
import numpy as np
from collections import Counter
from datetime import datetime, timedelta
import ast

In [37]:
def load_data(file_path):
    return pd.read_csv("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv", low_memory=False)

In [68]:
def build_temporal_graph(data):
    G = nx.DiGraph()
    for _, row in data.iterrows():
        citing_doi = row['URL']
        references = row['reference'].split(';') if pd.notna(row['reference']) else []     
        year = int(row['earliest_pub_year']) 
  
        if pd.notna(year):
            G.add_node(citing_doi, time=datetime(year,1,1))
        
        for ref in references:
            G.add_edge(citing_doi, ref)

    return G

In [58]:
def validate_temporal_graph(graph):
    invalid_nodes = []

    for node, data in graph.nodes(data=True):
        if 'time' not in data or not isinstance(data['time'], datetime):
            invalid_nodes.append(node)

    if invalid_nodes:
        print(f"Invalid nodes detected: {len(invalid_nodes)}")
        for node in invalid_nodes[:5]:  # Print a sample
            print(f"Node {node} is missing a valid time attribute.")
        return False

    print("All nodes have valid time attributes.")
    return True

In [59]:
def calculate_cd_index(graph):
    cd_index_per_node = {}
    delta = timedelta(days=365 * 25)

    for node in graph.nodes:
            try:
                    cd_index = nx.cd_index(graph, node, time_delta=delta, time="earliest_pub_year")
                    cd_index_per_node[node] = cd_index
            except Exception as e:
                cd_index_per_node[node] = None

    return cd_index_per_node 

In [65]:
def remove_invalid_nodes(graph):
    invalid_nodes = [node for node, attrs in graph.nodes(data=True) if 'time' not in attrs or attrs['time'] is None]
    graph.remove_nodes_from(invalid_nodes)
    print(f"Removed {len(invalid_nodes)} invalid nodes.")
    return graph

In [69]:
def main(file_path):
    data = load_data(file_path)

    graph = build_temporal_graph(data)

    cd_indices = calculate_cd_index(graph)

    cd_index_df = pd.DataFrame(list(cd_indices.items()), columns=['DOI', 'CD_Index'])

    graph = remove_invalid_nodes(graph)

    
    return cd_index_df

In [7]:
def part(file_path):
    data = pd.read_csv(file_path)
    
    # Display available columns
    print("Columns in the dataset:", data.columns)
    
    # Expected column name for references
    expected_col = "reference"  
    
    # Validate the column name
    if expected_col not in data.columns:
        raise KeyError(f"Column '{expected_col}' not found. Available columns: {list(data.columns)}")
    
    # Proceed with sampling
    sampled_data = data.sample(n=1000, random_state=42)
    print(sampled_data.head())
part("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv")

  data = pd.read_csv(file_path)


Columns in the dataset: Index(['created', 'license', 'publisher', 'published-online', 'author', 'page',
       'indexed', 'special_numbering', 'assertion', 'editor', 'relation',
       'score', 'issued', 'volume', 'source', 'update-policy', 'updated-by',
       'deposited', 'archive', 'is-referenced-by-count', 'alternative-id',
       'prefix', 'DOI', 'language', 'abstract', 'resource', 'member', 'issue',
       'link', 'ISSN', 'content-domain', 'published-print', 'type',
       'short-container-title', 'journal-issue', 'references-count',
       'subtitle', 'published-other', 'original-title', 'reference-count',
       'published', 'title', 'container-title', 'reference', 'funder',
       'issn-type', 'article-number', 'URL', 'cleaned_abstract',
       'earliest_pub_year'],
      dtype='object')
                                                  created  \
27710   {'date-parts': [[2006, 3, 3]], 'date-time': '2...   
317879  {'date-parts': [[2022, 12, 6]], 'date-time': '...   
182709  {

In [70]:
type = main("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv")


Removed 914149 invalid nodes.


In [71]:
print(type)

                                                       DOI CD_Index
0                         https://doi.org/10.1038/35023137     None
1        [{'key': 'BF35023137_CR1', 'doi-asserted-by': ...     None
2        2', 'volume': '80', 'author': 'GL Vourlitis', ...     None
3        2', 'volume': '78', 'author': 'RB McKane', 'ye...     None
4                         https://doi.org/10.1038/35015767     None
...                                                    ...      ...
1242159  [{'doi-asserted-by': 'publisher', 'key': 'e_1_...     None
1242160       https://doi.org/10.1016/j.lanepe.2023.100701     None
1242161  [{'key': '10.1016/j.lanepe.2023.100701_bib1', ...     None
1242162  https://doi.org/10.24272/j.issn.2095-8137.2022...     None
1242163                 https://doi.org/10.1503/cjs.014223     None

[1242164 rows x 2 columns]


In [None]:

type.to_csv("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\cd_index.csv", index=False) 