In [2]:
import pandas as pd
import networkx as nx
import numpy as np
from collections import Counter

In [3]:
def load_data(file_path):
    return pd.read_csv("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv", low_memory=False)

In [4]:
def build_temporal_graph(data):
    G = nx.DiGraph()

    for _, row in data.iterrows():
        citing_doi = row['URL']
        references = row['reference'].split(';') if pd.notna(row['reference']) else []
        year = row['earliest_pub_year']

        G.add_node(citing_doi, year=year)

        for ref in references:
             G.add_node(ref)
             G.add_edge(citing_doi, ref)

    print("Number of nodes:", G.number_of_nodes())
    print("Number of edges:", G.number_of_edges())

    return G

In [5]:
def calculate_shannon_diversity(neighbors, graph):
    years = [graph.nodes[neighbor]['year'] for neighbor in neighbors if 'year' in graph.nodes[neighbor]]
    counts = Counter(years)
    total = sum(counts.values())
    if total == 0:
        return 0
    diversity = -sum((count / total) * np.log(count / total) for count in counts.values() if count > 0)
    return diversity

In [6]:
def calculate_cd_index(graph):
    cd_index_per_node = {}
    for node in graph.nodes:
        neighbors = list(graph.successors(node))  # Directed graph: successors are the neighbors
        if neighbors:  # Skip nodes with no outgoing edges
            diversity = calculate_shannon_diversity(neighbors, graph)
            cd_index_per_node[node] = diversity
        else:
            cd_index_per_node[node] = 0  # No diversity for isolated nodes
    return cd_index_per_node

In [7]:
def main(file_path):
    data = load_data(file_path)

    graph = build_temporal_graph(data)

    cd_indices = calculate_cd_index(graph)

    cd_index_df = pd.DataFrame(cd_indices.items(), columns=['DOI', 'CD_Index'])
    
    return cd_index_df

In [10]:
df = pd.read_csv("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv", low_memory=False)
filtered_df = df[df['reference'].notnull()]
filtered_df = filtered_df[df['URL'].notnull()]
sample = filtered_df.sample(n=100, random_state=1)
sample

  filtered_df = filtered_df[df['URL'].notnull()]


Unnamed: 0,created,license,publisher,published-online,author,page,indexed,special_numbering,assertion,editor,...,published,title,container-title,reference,funder,issn-type,article-number,URL,cleaned_abstract,earliest_pub_year
136749,"{'date-parts': [[2015, 2, 10]], 'date-time': '...","[{'start': {'date-parts': [[2015, 2, 10]], 'da...",MDPI AG,"{'date-parts': [[2015, 2, 10]]}","[{'given': 'Chang', 'family': 'Cheong', 'seque...",1336-1352,"{'date-parts': [[2024, 6, 3]], 'date-time': '2...",,,,...,"{'date-parts': [[2015, 2, 10]]}",['Lifecycle CO2 Reduction by Implementing Doub...,['Energies'],"[{'key': 'ref_1', 'unstructured': 'Laustsen, J...",,"[{'value': '1996-1073', 'type': 'electronic'}]",,https://doi.org/10.3390/en8021336,This study investigated lifecycle CO2 (LCCO2) ...,2015
197333,"{'date-parts': [[2018, 9, 25]], 'date-time': '...","[{'start': {'date-parts': [[2018, 9, 25]], 'da...",Springer Science and Business Media LLC,"{'date-parts': [[2018, 9, 25]]}","[{'given': 'Nicola C.', 'family': 'Newton', 's...",,"{'date-parts': [[2024, 7, 10]], 'date-time': '...",,"[{'value': '15 May 2018', 'order': 1, 'name': ...",,...,"{'date-parts': [[2018, 9, 25]]}",['Universal cannabis outcomes from the Climate...,"['Substance Abuse Treatment, Prevention, and P...","[{'key': '171_CR1', 'unstructured': 'European ...","[{'DOI': '10.13039/501100000925', 'name': 'Nat...","[{'value': '1747-597X', 'type': 'electronic'}]",34,https://doi.org/10.1186/s13011-018-0171-4,,2018
253153,"{'date-parts': [[2021, 12, 4]], 'date-time': '...","[{'start': {'date-parts': [[2021, 12, 4]], 'da...",Springer Science and Business Media LLC,"{'date-parts': [[2021, 12, 4]]}","[{'given': 'Yang', 'family': 'Zhang', 'sequenc...",321-331,"{'date-parts': [[2024, 7, 30]], 'date-time': '...",,"[{'value': '28 August 2021', 'order': 1, 'name...",,...,"{'date-parts': [[2021, 12, 4]]}",['Comparison of Long-Term Effects After Modifi...,['Ophthalmology and Therapy'],"[{'issue': '4', 'key': '413_CR1', 'doi-asserte...",,"[{'value': '2193-8245', 'type': 'print'}, {'va...",,https://doi.org/10.1007/s40123-021-00413-7,,2021
188113,"{'date-parts': [[2018, 10, 17]], 'date-time': ...","[{'start': {'date-parts': [[2018, 10, 17]], 'd...",MDPI AG,"{'date-parts': [[2018, 10, 17]]}",[{'ORCID': 'http://orcid.org/0000-0002-6511-28...,2790,"{'date-parts': [[2024, 7, 9]], 'date-time': '2...",,,,...,"{'date-parts': [[2018, 10, 17]]}",['Thermal and Lighting Consumption Savings in ...,['Energies'],"[{'key': 'ref_1', 'unstructured': '(2018, July...",,"[{'value': '1996-1073', 'type': 'electronic'}]",,https://doi.org/10.3390/en11102790,Most educational buildings in southern Spain d...,2018
13830,"{'date-parts': [[2016, 3, 22]], 'date-time': '...","[{'start': {'date-parts': [[2004, 9, 1]], 'dat...",SAGE Publications,"{'date-parts': [[2004, 9, 1]]}","[{'given': 'Andrew T.', 'family': 'Roach', 'se...",10-17,"{'date-parts': [[2024, 9, 23]], 'date-time': '...",,,,...,"{'date-parts': [[2004, 9]]}",['Evaluating School Climate and School Culture'],['TEACHING Exceptional Children'],[{'volume-title': 'The eleventh mental measure...,,"[{'type': 'print', 'value': '0040-0599'}, {'ty...",,https://doi.org/10.1177/004005990403700101,,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320952,"{'date-parts': [[2023, 12, 19]], 'date-time': ...","[{'start': {'date-parts': [[2023, 12, 19]], 'd...",MDPI AG,"{'date-parts': [[2023, 12, 19]]}","[{'given': 'Joanna', 'family': 'Struzewska', '...",9,"{'date-parts': [[2024, 5, 23]], 'date-time': '...",,,,...,"{'date-parts': [[2023, 12, 19]]}",['Changes in Temperature and Precipitation Tre...,['Applied Sciences'],"[{'key': 'ref_1', 'doi-asserted-by': 'crossref...","[{'name': 'Cohesion Fund, Operational Programm...","[{'value': '2076-3417', 'type': 'electronic'}]",,https://doi.org/10.3390/app14010009,This study presents the potential impacts of c...,2023
191299,"{'date-parts': [[2018, 9, 27]], 'date-time': '...","[{'start': {'date-parts': [[2018, 9, 27]], 'da...",Springer Science and Business Media LLC,"{'date-parts': [[2018, 9, 27]]}","[{'given': 'Abdelghani', 'family': 'Bekhira', ...",,"{'date-parts': [[2024, 6, 10]], 'date-time': '...",,"[{'value': '17 January 2018', 'order': 1, 'nam...",,...,"{'date-parts': [[2018, 9, 27]]}",['Hydrological modeling of floods in the Wadi ...,['Applied Water Science'],"[{'key': '834_CR1', 'unstructured': 'ANRH (Nat...",,"[{'value': '2190-5487', 'type': 'print'}, {'va...",185,https://doi.org/10.1007/s13201-018-0834-3,,2018
239793,"{'date-parts': [[2019, 12, 26]], 'date-time': ...","[{'start': {'date-parts': [[2020, 3, 1]], 'dat...",Elsevier BV,,"[{'given': 'Gil', 'family': 'Lemos', 'sequence...",103109,"{'date-parts': [[2024, 10, 6]], 'date-time': '...",C,"[{'value': 'Elsevier', 'name': 'publisher', 'l...",,...,"{'date-parts': [[2020, 3]]}",['On the need of bias correction methods for w...,['Global and Planetary Change'],[{'key': '10.1016/j.gloplacha.2019.103109_bb00...,"[{'name': 'EarthSystems Doctoral School'}, {'D...","[{'value': '0921-8181', 'type': 'print'}]",103109,https://doi.org/10.1016/j.gloplacha.2019.103109,,2020
136043,"{'date-parts': [[2015, 5, 20]], 'date-time': '...",,Informa UK Limited,"{'date-parts': [[2015, 5, 20]]}","[{'given': 'Jeryl L.', 'family': 'Mumpower', '...",798-809,"{'date-parts': [[2024, 9, 13]], 'date-time': '...",,[{'value': 'The publishing and review policy f...,,...,"{'date-parts': [[2015, 5, 20]]}",['Predictors of the perceived risk of climate ...,['Journal of Risk Research'],"[{'key': 'CIT0001', 'doi-asserted-by': 'publis...",,"[{'value': '1366-9877', 'type': 'print'}, {'va...",,https://doi.org/10.1080/13669877.2015.1043567,,2015


In [8]:
def part(file_path):
    data = pd.read_csv(file_path)
    
    # Display available columns
    print("Columns in the dataset:", data.columns)
    
    # Expected column name for references
    expected_col = "reference"  
    
    # Validate the column name
    if expected_col not in data.columns:
        raise KeyError(f"Column '{expected_col}' not found. Available columns: {list(data.columns)}")
    
    # Proceed with sampling
    sampled_data = data.sample(n=1000, random_state=42)
    print(sampled_data.head())
part("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv")

  data = pd.read_csv(file_path)


Columns in the dataset: Index(['created', 'license', 'publisher', 'published-online', 'author', 'page',
       'indexed', 'special_numbering', 'assertion', 'editor', 'relation',
       'score', 'issued', 'volume', 'source', 'update-policy', 'updated-by',
       'deposited', 'archive', 'is-referenced-by-count', 'alternative-id',
       'prefix', 'DOI', 'language', 'abstract', 'resource', 'member', 'issue',
       'link', 'ISSN', 'content-domain', 'published-print', 'type',
       'short-container-title', 'journal-issue', 'references-count',
       'subtitle', 'published-other', 'original-title', 'reference-count',
       'published', 'title', 'container-title', 'reference', 'funder',
       'issn-type', 'article-number', 'URL', 'cleaned_abstract',
       'earliest_pub_year'],
      dtype='object')
                                                  created  \
27710   {'date-parts': [[2006, 3, 3]], 'date-time': '2...   
317879  {'date-parts': [[2022, 12, 6]], 'date-time': '...   
182709  {

In [9]:
type = main("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\climate_articles_unique_english.csv")
type.to_csv("\\Users\\joshu\\Downloads\\UROP_Code\\Data\\cd_index.csv", index=False) 

Number of nodes: 1242164
Number of edges: 976327
