In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the read-only input
# directory, then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/make-data-count-finding-data-references/sample_submission.csv
/kaggle/input/make-data-count-finding-data-references/train_labels.csv
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_ece3.5260.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_chem.201902131.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_ece3.3985.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_ejoc.202000916.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_ece3.6144.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_2017jc013030.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_anie.201916483.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_chem.202001412.xml
/kaggle/input/make-data-count-finding-data-references/test/XML/10.1002_chem.202003167.xml
/kaggle/input/make-data-count-finding-data-references/test/X

In [2]:
import pandas as pd
import re
from pathlib import Path
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle

# --- Paths and Constants ---
# Root of the competition input data (read-only on Kaggle).
DATA_DIR = Path('/kaggle/input/make-data-count-finding-data-references')
# Training articles; files are looked up under its XML/ and PDF/ subfolders.
TRAIN_DIR = DATA_DIR.joinpath('train')
# Test articles, laid out the same way as the training folder.
TEST_DIR = DATA_DIR.joinpath('test')

# --- 1. Data Extraction and Preprocessing ---

def get_xml_text(xml_path):
    """Extract the readable text of an article XML file.

    Text is collected from the ``p``, ``abstract``, ``sec``, ``title`` and
    ``ref-list`` elements. Only the outermost matching elements are used:
    a ``sec`` already contains the text of its own ``title`` and ``p``
    children, so also emitting those children would repeat the same text
    several times in the result.

    Args:
        xml_path: Path (str or path-like) of the XML file to read.

    Returns:
        The extracted text with element texts joined by single spaces, or an
        empty string if the file cannot be read or parsed (best-effort: any
        exception is swallowed so one bad file does not stop the pipeline).
    """
    text_tags = ['p', 'abstract', 'sec', 'title', 'ref-list']
    try:
        with open(xml_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'lxml-xml')

        # Keep a matched element only if none of its ancestors is itself a
        # matched element; the ancestor's get_text() already covers it.
        outermost_parts = [
            tag for tag in soup.find_all(text_tags)
            if tag.find_parent(text_tags) is None
        ]
        return ' '.join(tag.get_text() for tag in outermost_parts)
    except Exception:
        # Deliberate best-effort: callers treat "" as "no parsable text".
        return ""

def get_article_doi(xml_path):
    """Read the article's own DOI out of an XML file.

    Looks for the first ``<article-id pub-id-type="doi">`` element and
    returns its text. Returns None when the element is absent or when the
    file cannot be read or parsed (all exceptions are suppressed).
    """
    try:
        with open(xml_path, 'r', encoding='utf-8') as handle:
            parsed = BeautifulSoup(handle.read(), 'lxml-xml')
            doi_node = parsed.find('article-id', {'pub-id-type': 'doi'})
            return doi_node.get_text() if doi_node else None
    except Exception:
        return None

def extract_text_from_article(article_id, data_dir):
    """
    Load the text of one article, using its XML file when available.

    Returns a ``(text, article_doi)`` tuple. When the XML file exists, the
    DOI is taken from the XML and falls back to ``article_id`` if none is
    found. When there is no XML file, the result is ``("", None)``; a PDF,
    if present, only triggers a warning because PDFs are not parsed here.
    """
    xml_file = data_dir / 'XML' / f'{article_id}.xml'
    if xml_file.exists():
        body_text = get_xml_text(xml_file)
        # Fallback to article_id if DOI not found in XML
        doi = get_article_doi(xml_file) or article_id
        return body_text, doi

    pdf_file = data_dir / 'PDF' / f'{article_id}.pdf'
    if pdf_file.exists():
        # NOTE: pdfplumber is not a standard Kaggle library.
        # It would have to be installed (if allowed), or another parser used.
        # In this example PDFs are treated as non-parsable, simulating a
        # real-world scenario where XML is prioritized.
        print(f"Warning: PDF file for {article_id} is not supported in this simplified script.")

    return "", None

# --- 2. Identifier Detection ---

DOI_REGEX = r'(?:doi:|https?://(?:dx\.)?doi\.org/)(10\.\d{4,9}/[^\s"\'<>()]+)'
ACCESSION_ID_REGEX = r'\b(GSE\d+|PDB\s?\d[a-zA-Z\d]{3}|E-MEXP-\d+|E-MTAB-\d+|PRJ[AEBD]\d+)\b'

def find_all_identifiers(text):
    """Finds all DOIs and Accession IDs in a given text."""
    found_dois = set(re.findall(DOI_REGEX, text, re.IGNORECASE))
    found_accessions = set(re.findall(ACCESSION_ID_REGEX, text, re.IGNORECASE))
    
    # Add full doi url for consistency
    full_dois = {f"https://doi.org/{d}" for d in found_dois}
    
    return list(full_dois.union(found_accessions))

def get_context(text, identifier, window_size=50):
    """Return the text surrounding the first occurrence of an identifier.

    The identifier is matched literally and case-insensitively. If it is a
    doi.org URL that does not occur verbatim, the bare DOI after the URL
    prefix is searched as well: find_all_identifiers() rewrites every DOI to
    ``https://doi.org/<doi>`` even when the article wrote ``doi:<doi>``, and
    such identifiers would otherwise never be found in the text.

    Args:
        text: The text to search in.
        identifier: The identifier to locate.
        window_size: Number of characters to keep on each side of the match.

    Returns:
        The matched span plus up to ``window_size`` characters before and
        after it, or "" if the identifier is missing, empty, not a string,
        or ``text`` is empty.
    """
    # An empty pattern would match at position 0 and return unrelated text;
    # non-string identifiers (e.g. NaN from a CSV) cannot be searched at all.
    if not text or not isinstance(identifier, str) or not identifier:
        return ""

    candidates = [identifier]
    lowered = identifier.lower()
    for prefix in ('https://doi.org/', 'http://doi.org/',
                   'https://dx.doi.org/', 'http://dx.doi.org/'):
        bare_doi = identifier[len(prefix):]
        if lowered.startswith(prefix) and bare_doi:
            candidates.append(bare_doi)
            break

    for candidate in candidates:
        # Escape special characters so the identifier is matched literally
        match = re.search(re.escape(candidate), text, re.IGNORECASE)
        if match:
            start_pos = max(0, match.start() - window_size)
            end_pos = min(len(text), match.end() + window_size)
            return text[start_pos:end_pos]

    return ""

# --- 3. Citation Type Classification ---

def train_model():
    """Train a citation-type classifier from the labelled training articles.

    For every row of ``train_labels.csv`` the text window around the labelled
    ``dataset_id`` is extracted from the article, and the windows are used as
    TF-IDF features for a random forest that predicts the ``type`` column.

    Returns:
        A ``(model, vectorizer)`` tuple of the fitted RandomForestClassifier
        and TfidfVectorizer, or ``(None, None)`` when no labelled identifier
        could be located in any training article.
    """
    # The labels file sits directly in DATA_DIR. (Joining an absolute path
    # onto TRAIN_DIR only worked because pathlib discards the left operand.)
    train_labels_df = pd.read_csv(DATA_DIR / 'train_labels.csv')

    # Feature engineering for training data
    contexts = []
    labels = []
    # An article can have several label rows; parse each article only once.
    text_by_article = {}

    label_rows = zip(
        train_labels_df['article_id'],
        train_labels_df['dataset_id'],
        train_labels_df['type'],
    )
    for article_id, dataset_id, citation_type in label_rows:
        if article_id not in text_by_article:
            text_by_article[article_id], _ = extract_text_from_article(article_id, TRAIN_DIR)
        text = text_by_article[article_id]

        # Missing CSV values arrive as NaN (a truthy float), which cannot be
        # searched for in the text, so require a non-empty string.
        # NOTE(review): if the labels file uses placeholder dataset_id values
        # for articles without datasets, they are searched as literal text
        # here - confirm and filter them if so.
        if not text or not isinstance(dataset_id, str) or not dataset_id:
            continue

        context = get_context(text, dataset_id)
        if context:
            contexts.append(context)
            labels.append(citation_type)

    if not contexts:
        print("Warning: No training data with valid contexts found.")
        return None, None

    # Vectorize the text data
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = vectorizer.fit_transform(contexts)
    y_train = labels

    # Train the model (fixed seed for reproducible forests)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    return model, vectorizer

# --- Main Logic ---

if __name__ == "__main__":

    # Train the model and vectorizer on the training data
    model, vectorizer = train_model()

    # train_model() signals failure with (None, None); test for that sentinel
    # explicitly instead of relying on the truthiness of the fitted objects.
    if model is None or vectorizer is None:
        print("Model training failed. Exiting.")
    else:
        submission_columns = ['row_id', 'article_id', 'dataset_id', 'type']
        submission_rows = []
        row_id = 0

        # Collect the test article ids from the XML folder. Articles that only
        # have a PDF are not listed here (PDF parsing is not supported).
        test_article_files = {p.stem for p in (TEST_DIR / 'XML').glob('*.xml')}

        # Process each test article in a stable (sorted) order
        for article_id in sorted(test_article_files):
            print(f"Processing {article_id}...")

            # Step 1: Extract text and article DOI
            text, article_doi = extract_text_from_article(article_id, TEST_DIR)

            if not text:
                print("  - No parsable text found. Skipping.")
                continue

            # Step 2: Find all dataset identifiers
            found_identifiers = find_all_identifiers(text)

            if not found_identifiers:
                print("  - No identifiers found. Skipping.")
                continue

            # Step 3: Classify each identifier from the text around it
            for dataset_id in found_identifiers:
                context = get_context(text, dataset_id)

                if context:
                    context_vector = vectorizer.transform([context])
                    citation_type = model.predict(context_vector)[0]
                else:
                    # Default to 'Secondary' if context is not found
                    citation_type = 'Secondary'

                submission_rows.append({
                    'row_id': row_id,
                    'article_id': article_doi,
                    'dataset_id': dataset_id,
                    'type': citation_type
                })
                row_id += 1

        # Step 4: Create and save the submission file. Passing the columns
        # explicitly keeps the CSV header even when no identifier was found.
        submission_df = pd.DataFrame(submission_rows, columns=submission_columns)
        submission_df.to_csv('submission.csv', index=False)

        print("\nSubmission file 'submission.csv' created successfully.")
        print(submission_df.head())

Processing 10.1002_2017jc013030...
Processing 10.1002_anie.201916483...
  - No identifiers found. Skipping.
Processing 10.1002_anie.202005531...
  - No parsable text found. Skipping.
Processing 10.1002_anie.202007717...
  - No identifiers found. Skipping.
Processing 10.1002_chem.201902131...
  - No identifiers found. Skipping.
Processing 10.1002_chem.201903120...
Processing 10.1002_chem.202000235...
  - No identifiers found. Skipping.
Processing 10.1002_chem.202001412...
  - No identifiers found. Skipping.
Processing 10.1002_chem.202001668...
  - No identifiers found. Skipping.
Processing 10.1002_chem.202003167...
Processing 10.1002_cssc.202201821...
Processing 10.1002_ece3.3985...
Processing 10.1002_ece3.4466...
Processing 10.1002_ece3.5260...
Processing 10.1002_ece3.5395...
Processing 10.1002_ece3.6144...
Processing 10.1002_ece3.6303...
Processing 10.1002_ece3.6784...
  - No identifiers found. Skipping.
Processing 10.1002_ece3.961...
Processing 10.1002_ece3.9627...
Processing 10.1002