In [1]:
import csv
from collections import defaultdict
from itertools import combinations
import ast
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import coo_matrix, csr_matrix

def build_entity_cooccurrence_matrix(input_file, stopwords_file, output_file, abbreviations_file):
    with open(stopwords_file, 'r') as f:
        custom_stopwords = f.read().splitlines()
    stop_words = set(stopwords.words('english') + custom_stopwords)
    lemmatizer = WordNetLemmatizer()
    # Read abbreviations file
    with open(abbreviations_file, 'r') as f:
        abbreviations = dict(line.strip().split(',') for line in f)
    # Read csv file
    df = pd.read_csv(input_file)
    # Check if named_entities column exists
    if 'named_entities' not in df.columns:
        raise ValueError("named_entities column not found in input file")
    # Build co-occurrence matrix for each file
    file_cooccurrence_matrices = defaultdict(lambda: defaultdict(int))
    total_files = len(df['filename'].unique())
    for i, (filename, group) in enumerate(df.groupby('filename')):
        # Process each entity in each row
        for row in group.itertuples():
            entities = []
            for entity in ast.literal_eval(row.named_entities):
                # Remove stopwords, generic entities, and special characters
                if entity['entity_group'] != 'Generic' and entity['word'] not in stop_words:
                    # Lemmatize entity and tokenize multiple words
                    word = lemmatizer.lemmatize(entity['word'], pos='n')
                    words = word_tokenize(word.lower())
                    words = [w for w in words if w.isalnum()]
                    entity_str = ' '.join(words)
                    # Expand abbreviation if present
                    if entity_str in abbreviations:
                        entity_str = abbreviations[entity_str]
                    # Add entity type to the entity string
                    entity_str += f" ({entity['entity_group']})"
                    entities.append(entity_str)
            # Update co-occurrence count for each pair of entities
            for entity1, entity2 in combinations(entities, 2):
                if entity1 == entity2:
                    continue
                sorted_entities = tuple(sorted([entity1, entity2]))
                file_cooccurrence_matrices[filename][sorted_entities] += 1
        # Print progress every 10% completion
        percent_complete = (i + 1) / total_files * 100
        if percent_complete % 10 == 0:
            print(f"Processing {filename} ({percent_complete:.0f}% complete)...")
    # Write entity co-occurrence count to csv file
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'entity1', 'entity2', 'cooccurrence', 'entity1_type', 'entity2_type'])
        for filename, entity_cooccurrence_matrix in file_cooccurrence_matrices.items():
            for entities, cooccurrence in entity_cooccurrence_matrix.items():
                entity1, entity2 = entities
                entity1_type = entity1.split(' ')[-1][1:-1]
                entity2_type = entity2.split(' ')[-1][1:-1]
                entity1 = ' '.join(entity1.split(' ')[:-1])
                entity2 = ' '.join(entity2.split(' ')[:-1])
                writer.writerow([filename, entity1, entity2, cooccurrence, entity1_type, entity2_type])
    # Read csv file and output top 10 co-occurring entities per file
    df = pd.read_csv(output_file)
    df_sorted = df.sort_values(['filename', 'cooccurrence'], ascending=[True, False])
    print(df_sorted.groupby('filename').head(10))

In [2]:
#pip install nltk

In [3]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/featurize/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/featurize/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/featurize/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
input_file = 'iclr_output.csv'
stopwords_file = 'stopwords.txt'
output_file = 'co_ent_iclr.csv'
abbreviations_file = 'abbreviations_file.txt'
build_entity_cooccurrence_matrix(input_file, stopwords_file, output_file, abbreviations_file)

Processing Uuf2q9TfXGA.xml (50% complete)...
Processing zzqBoIFOQ1.xml (100% complete)...
               filename                 entity1            entity2  \
782      -0tPmzgXS5.xml            ghost motion                 gm   
947      -0tPmzgXS5.xml                      gm        randaugment   
865      -0tPmzgXS5.xml                      gm  video recognition   
41       -0tPmzgXS5.xml             overfitting              video   
832      -0tPmzgXS5.xml  generalization ability                 gm   
...                 ...                     ...                ...   
6749757  zzqBoIFOQ1.xml                csc mbpo              spice   
6749535  zzqBoIFOQ1.xml       environment model              spice   
6749815  zzqBoIFOQ1.xml       environment model             policy   
6750258  zzqBoIFOQ1.xml          safety horizon  safety violations   
6748876  zzqBoIFOQ1.xml         safety analysis              spice   

         cooccurrence         entity1_type         entity2_type  
782

In [5]:
import csv

with open('co_ent_arxiv.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # 跳过第一行表头
    filtered_rows = [row for row in reader if int(row[2]) >= 10]
    row_count = len(filtered_rows)

    print(f'The CSV file has {row_count} rows after filtering.')



The CSV file has 22537 rows after filtering.


In [6]:
import pandas as pd

# Read entity co-occurrence count from csv file
df = pd.read_csv(output_file)

# Sort by cooccurrence in descending order and output top 20 rows
df_sorted = df.sort_values('cooccurrence', ascending=False)
print(df_sorted.head(20))

                filename               entity1                      entity2  \
1944352  HcUf-QwZeFh.xml               iboetvq                        sfbdi   
1077709  8aHzds2uUyB.xml                   col                       header   
5942909  rnFOPhTMB0Y.xml                 adamw  stochastic gradient descent   
1944351  HcUf-QwZeFh.xml                 sfbdi                        upvdi   
3092561  SNONkz5zEUF.xml    federated learning                           sl   
3194165   TMYzh1hsHd.xml                   iql                        ma2ql   
3458964  VWm4o4l3V9e.xml                  bsfp                         msfp   
2741805  P8YIphWNEGO.xml  graph neural network                      peermlp   
5943194  rnFOPhTMB0Y.xml          freeze embed  stochastic gradient descent   
1944355  HcUf-QwZeFh.xml               iboetvq                        upvdi   
3555684  WZH7099tgfM.xml                  jump                   turn right   
4105745   azCKuYyS74.xml                    cl      

In [None]:
import pandas as pd

# Read entity co-occurrence count from csv file
df = pd.read_csv('co_ent_arxiv.csv')

# Extract entity types from entity1_type and entity2_type columns
entity_types = set(df['entity1_group']).union(set(df['entity2_group']))

# Create a dictionary to store the count of each entity type
entity_type_counts = {}

# Count number of entities for each type
for entity_type in entity_types:
    entity_type_counts[entity_type] = df[(df['entity1_group'] == entity_type) | (df['entity2_group'] == entity_type)]['entity1'].nunique() + df[(df['entity1_group'] == entity_type) | (df['entity2_group'] == entity_type)]['entity2'].nunique()

# Print entity type counts
for entity_type, count in entity_type_counts.items():
    print(f"{entity_type}: {count}")