<img align="left" src="All-sample-files/CC_BY.png"><br />

Created by Amy Kirchhoff under [Creative Commons CC BY License](https://creativecommons.org/licenses/by/4.0/)<br />

On June 15, 2025, JSTOR launched the [new text analysis support service](https://www.jstor.org/ta-support/) to replace the old Data for Research service provided by [Constellate](https://labs.jstor.org/projects/text-mining/), which was sunset on June 30, 2025. For Constellate users who prefer to work with the old Constellate dataset format, this notebook helps you convert the new JSTOR text analysis format to the Constellate format. This way, your existing Constellate notebooks can continue to work with your data files.

Please read the instructions in the code cells closely. 

In [None]:
#!/usr/bin/env python3
"""
JSTOR Text Analysis to Constellate Format Converter

This notebook converts the new JSTOR text analysis format back to the Constellate 
format so existing Constellate notebooks can continue to work.

Input files:
- JSTOR metadata JSONL file (compressed with gzip)
- JSTOR full-text dataset JSONL file (compressed with gzip)

Output:
- Constellate-format JSONL file with metadata, ngrams, and full text combined
"""

import json
import gzip
import re
from collections import Counter, defaultdict
from pathlib import Path
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download required NLTK data (only the pieces that are not already installed).
# Recent NLTK releases load the Punkt tokenizer used by word_tokenize() from
# 'punkt_tab', while older releases use 'punkt', so both are ensured here.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

class JSTORToConstellateConverter:
    """
    Converts JSTOR text analysis format to Constellate format

    Metadata records are cached in memory keyed by ``item_id``. Each full-text
    record is then joined with its cached metadata, tokenized into n-grams and
    written out as one Constellate-style JSON line.
    """

    # JSTOR content types that map directly onto a Constellate docType
    _CONTENT_TYPE_MAP = {
        'article': 'article',
        'book': 'book',
        'chapter': 'chapter',
        'report': 'report',
    }

    def __init__(self):
        # item_id -> metadata dict, filled by load_metadata()
        self.metadata_cache = {}
        # English stop words removed from the token stream before n-gram counting
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _open_text(file_path, mode):
        """
        Open a UTF-8 text file, using gzip when the name ends with '.gz'

        Args:
            file_path (str or Path): File to open
            mode (str): 'r' to read or 'w' to write

        Returns:
            Text-mode file object
        """
        if str(file_path).endswith('.gz'):
            return gzip.open(file_path, mode + 't', encoding='utf-8')
        return open(file_path, mode, encoding='utf-8')

    def load_metadata(self, metadata_file_path):
        """
        Load JSTOR metadata file and create a lookup dictionary

        Blank lines are ignored. Lines that are not valid JSON, or that are not
        JSON objects, are logged as warnings and skipped. Records without an
        ``item_id`` are not cached.

        Args:
            metadata_file_path (str or Path): Path to the JSTOR metadata JSONL file
                (gzip-compressed when the name ends with '.gz')
        """
        logger.info(f"Starting metadata load from {metadata_file_path}")
        logger.info("This may take several minutes for large files...")

        is_compressed = str(metadata_file_path).endswith('.gz')
        logger.info(f"Opening {'compressed' if is_compressed else 'uncompressed'} metadata file...")

        with self._open_text(metadata_file_path, 'r') as f:
            for line_num, line in enumerate(f, 1):
                # Update progress on the same line every 100,000 records
                if line_num % 100000 == 0:
                    print(f"\rProcessing metadata record {line_num:,}...", end="", flush=True)

                line = line.strip()
                if not line:
                    continue  # a blank line carries no record

                try:
                    metadata = json.loads(line)
                except json.JSONDecodeError as e:
                    print()  # Move to new line for warning
                    logger.warning(f"Failed to parse metadata line {line_num}: {e}")
                    continue

                if not isinstance(metadata, dict):
                    print()  # Move to new line for warning
                    logger.warning(f"Skipping metadata line {line_num}: expected a JSON object")
                    continue

                item_id = metadata.get('item_id')
                if item_id:
                    self.metadata_cache[item_id] = metadata

        print()  # Move to new line when done
        logger.info(f"✓ Metadata loading complete! Loaded {len(self.metadata_cache):,} metadata records into cache")

    def clean_text_for_ngrams(self, text):
        """
        Clean and tokenize text for n-gram generation

        The text is lower-cased and tokenized with NLTK. Only purely alphabetic
        tokens of at least four characters that are not stop words are kept.

        Args:
            text (str): Raw text to clean

        Returns:
            list: List of cleaned tokens (empty when ``text`` is empty)
        """
        if not text:
            return []

        return [
            token
            for token in word_tokenize(text.lower())
            if token.isalpha() and len(token) >= 4 and token not in self.stop_words
        ]

    def generate_ngrams(self, tokens, n):
        """
        Generate n-grams from tokens

        Args:
            tokens (list): List of tokens
            n (int): N-gram size (1, 2, or 3)

        Returns:
            Counter: Counts of each space-joined n-gram; empty when ``n`` is
            smaller than 1 or there are fewer than ``n`` tokens
        """
        if n < 1 or len(tokens) < n:
            return Counter()

        return Counter(
            ' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )

    def convert_jstor_to_constellate_record(self, jstor_full_text_record):
        """
        Convert a single JSTOR full-text record to Constellate format

        Args:
            jstor_full_text_record (dict): JSTOR full-text record

        Returns:
            dict: Constellate-format record, or None if the record has no
            ``item_id`` or no metadata is cached for it
        """
        item_id = jstor_full_text_record.get('item_id')
        if not item_id:
            logger.warning("Full-text record missing item_id")
            return None

        # Get metadata for this item
        metadata = self.metadata_cache.get(item_id)
        if not metadata:
            logger.warning(f"No metadata found for item_id: {item_id}")
            return None

        # Full text is expected to be a list of strings (one per page).
        # A bare string is wrapped so it is not joined character by character.
        full_text_pages = jstor_full_text_record.get('full_text') or []
        if isinstance(full_text_pages, str):
            full_text_pages = [full_text_pages]
        full_text = ' '.join(page for page in full_text_pages if isinstance(page, str))

        # Generate tokens and n-grams
        tokens = self.clean_text_for_ngrams(full_text)
        unigrams = self.generate_ngrams(tokens, 1)
        bigrams = self.generate_ngrams(tokens, 2)
        trigrams = self.generate_ngrams(tokens, 3)

        # Build Constellate-format record
        return {
            # Basic identifiers - map from JSTOR format
            'id': metadata.get('url', f"jstor:{item_id}"),  # Use JSTOR URL as ID
            'item_id': item_id,

            # Metadata fields
            'title': metadata.get('title', ''),
            'creator': self._extract_creators(metadata),
            'isPartOf': metadata.get('isPartOf', ''),
            'publisher': self._extract_publishers(metadata),
            'publicationYear': self._extract_year(metadata.get('published_date')),
            'datePublished': metadata.get('published_date', ''),
            'language': self._extract_primary_language(metadata.get('languages', [])),
            'docType': self._map_content_type(metadata.get('content_type')),
            'docSubType': metadata.get('content_subtype', ''),

            # DOI and identifiers
            'doi': metadata.get('ithaka_doi', ''),
            'identifier': metadata.get('identifiers', {}),

            # Volume/issue info
            'volumeNumber': metadata.get('issue_volume', ''),
            'issueNumber': metadata.get('issue_number', ''),

            # Categories and subjects
            'tdmCategory': metadata.get('discipline_names', []),
            'sourceCategory': metadata.get('discipline_names', []),

            # Full text
            'fullText': full_text_pages,

            # N-grams in Constellate format
            'unigramCount': dict(unigrams),
            'bigramCount': dict(bigrams),
            'trigramCount': dict(trigrams),

            # Additional fields
            # NOTE: this counts the tokens kept after filtering, not every word
            'wordCount': len(tokens),
            'outputFormat': ['unigrams', 'bigrams', 'trigrams', 'fullText'],

            # References if available
            'references': jstor_full_text_record.get('references', []),
        }

    def _extract_creators(self, metadata):
        """
        Extract creator names as a list of strings

        Structured ``creators`` entries are preferred ("first last", or the last
        name alone). When they yield no names, ``creators_string`` is split on
        commas and semicolons instead.
        """
        creator_names = []
        for creator in metadata.get('creators') or []:
            if isinstance(creator, str):
                # Tolerate plain-string entries instead of failing on .get()
                if creator.strip():
                    creator_names.append(creator.strip())
                continue
            if not isinstance(creator, dict):
                continue
            first = creator.get('first_name', '')
            last = creator.get('last_name', '')
            if first and last:
                creator_names.append(f"{first} {last}")
            elif last:
                creator_names.append(last)
        if creator_names:
            return creator_names

        # Fall back to creator string, split on common separators
        creators_string = metadata.get('creators_string', '')
        if creators_string:
            return [name.strip() for name in re.split(r'[,;]', creators_string) if name.strip()]
        return []

    def _extract_publishers(self, metadata):
        """Return the first entry of ``publishers``, or '' when there is none"""
        publishers = metadata.get('publishers', [])
        return publishers[0] if publishers else ''

    def _extract_year(self, date_string):
        """
        Extract the leading four-digit year from a date value

        Returns:
            int: The year, or None when the value is empty or does not start
            with four digits (a malformed date no longer raises)
        """
        if not date_string:
            return None
        match = re.match(r'\d{4}', str(date_string).strip())
        return int(match.group()) if match else None

    def _extract_primary_language(self, languages):
        """Return the first listed language, defaulting to 'eng'"""
        return languages[0] if languages else 'eng'

    def _map_content_type(self, jstor_content_type):
        """Map JSTOR content type to Constellate doc type ('article' when unset)"""
        return self._CONTENT_TYPE_MAP.get(jstor_content_type, jstor_content_type or 'article')

    def convert_dataset(self, metadata_file_path, fulltext_file_path, output_file_path, limit=None):
        """
        Convert JSTOR dataset to Constellate format

        Every non-blank input line counts as one processed record, whether it is
        written or skipped, so written + skipped always equals processed.

        Args:
            metadata_file_path (str or Path): Path to JSTOR metadata JSONL(.gz) file
            fulltext_file_path (str or Path): Path to JSTOR full-text dataset JSONL(.gz) file
            output_file_path (str or Path): Path for output Constellate JSONL(.gz) file
            limit (int, optional): Limit number of records to process
        """
        # Load metadata first
        logger.info("=" * 60)
        logger.info("STEP 1: Loading metadata cache")
        logger.info("=" * 60)
        self.load_metadata(metadata_file_path)

        logger.info("=" * 60)
        logger.info("STEP 2: Processing full-text dataset")
        logger.info("=" * 60)
        logger.info(f"Reading full-text data from: {fulltext_file_path}")
        logger.info(f"Writing converted data to: {output_file_path}")
        if limit:
            logger.info(f"Processing limit set to: {limit:,} records")
        else:
            logger.info("No processing limit set - will process all records")

        # Ensure output directory exists
        Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

        records_processed = 0
        records_written = 0
        records_skipped = 0

        # Compression is decided from the file name alone
        fulltext_is_compressed = str(fulltext_file_path).endswith('.gz')
        output_is_compressed = str(output_file_path).endswith('.gz')

        logger.info(f"Full-text file: {'compressed' if fulltext_is_compressed else 'uncompressed'}")
        logger.info(f"Output file: {'compressed' if output_is_compressed else 'uncompressed'}")

        logger.info("Starting full-text processing...")

        with self._open_text(fulltext_file_path, 'r') as infile, \
             self._open_text(output_file_path, 'w') as outfile:

            for line_num, line in enumerate(infile, 1):
                if limit and records_processed >= limit:
                    print()  # Move to new line
                    logger.info(f"Reached processing limit of {limit:,} records")
                    break

                line = line.strip()
                if not line:
                    continue  # a blank line carries no record

                records_processed += 1

                try:
                    jstor_record = json.loads(line)
                    constellate_record = self.convert_jstor_to_constellate_record(jstor_record)

                    if constellate_record:
                        outfile.write(json.dumps(constellate_record) + '\n')
                        records_written += 1
                    else:
                        records_skipped += 1

                except json.JSONDecodeError as e:
                    print()  # Move to new line for warning
                    logger.warning(f"Failed to parse full-text line {line_num}: {e}")
                    records_skipped += 1
                except Exception as e:
                    # Best effort: one bad record must not abort the whole run
                    print()  # Move to new line for error
                    logger.error(f"Error processing line {line_num}: {e}")
                    records_skipped += 1

                # Update progress on same line every 1,000 records, log milestone every 100,000
                if records_processed % 1000 == 0:
                    print(f"\rProgress: {records_processed:,} processed | {records_written:,} written | {records_skipped:,} skipped", end="", flush=True)
                if records_processed % 100000 == 0:
                    print()  # Move to new line for milestone
                    logger.info(f"✓ Milestone: {records_processed:,} processed | {records_written:,} written | {records_skipped:,} skipped")

        print()  # Move to new line when done

        # Avoid dividing by zero when the input held no records
        success_rate = records_written / records_processed * 100 if records_processed else 0.0

        logger.info("=" * 60)
        logger.info("CONVERSION COMPLETE!")
        logger.info("=" * 60)
        logger.info("📊 Final Statistics:")
        logger.info(f"   • Total records processed: {records_processed:,}")
        logger.info(f"   • Records successfully written: {records_written:,}")
        logger.info(f"   • Records skipped/failed: {records_skipped:,}")
        logger.info(f"   • Success rate: {success_rate:.1f}%")
        logger.info(f"📁 Output file: {output_file_path}")
        logger.info("✓ Ready for use with Constellate notebooks!")


def main():
    """
    Entry point: set the input/output paths, then run the conversion

    Edit the file paths below so they point at your downloaded files.
    """

    # File paths - UPDATE THESE TO MATCH YOUR FILES
    metadata_file = 'jstor_metadata_2025-06-12.jsonl.gz'  # change this to the path to your downloaded metadata file
    fulltext_file = 'a6ce02a8-a2d6-48b1-af37-79c18934c66f.jsonl.gz' # change this to the path to your requested full-text dataset file
    output_file = 'constellate_format_dataset.jsonl.gz'   # Output file in the Constellate format, ready to be processed by the existing Constellate notebooks; change the file name as you wish

    # Optional: limit number of records for testing
    limit = None  # Set to a number like 1000 for testing

    separator = "=" * 60
    converter = JSTORToConstellateConverter()

    try:
        logger.info("🔧 Initializing JSTOR to Constellate converter...")
        converter.convert_dataset(
            metadata_file_path=metadata_file,
            fulltext_file_path=fulltext_file,
            output_file_path=output_file,
            limit=limit,
        )

        # Success banner followed by the hints for the user
        for message in (
            "\n" + separator,
            "🎉 CONVERSION SUCCESSFUL!",
            separator,
            f"📁 Your Constellate-format dataset is ready: {output_file}",
            "🚀 You can now use this file with your existing Constellate notebooks.",
            "💡 Tip: Use the same dataset_reader() function as in your topic modeling notebook!",
        ):
            print(message)

    except FileNotFoundError as e:
        # A missing input file is the most likely setup mistake, so spell it out
        print("\n" + 20 * "❌")
        print(f"ERROR: Could not find file - {e}")
        print("Please check that the file paths in main() are correct.")
        print("Expected files:")
        for label, path in (("Metadata", metadata_file), ("Full-text", fulltext_file)):
            print(f"  • {label}: {path}")
    except Exception as e:
        # Top-level boundary: report the error and log the traceback
        print(f"\n❌ Error during conversion: {e}")
        logger.exception("Full error details:")


if __name__ == "__main__":
    main()