In [None]:
def safe_get(data, path, default=None):
    """
    Safely retrieves a value from a nested dictionary or list.

    Each element of ``path`` is applied in turn: dictionaries are looked up by
    key, lists by a non-negative in-range integer index. As soon as a step
    cannot be resolved, ``default`` is returned immediately; the remaining
    path elements are never applied to ``default`` itself.

    :param data: The dictionary or list to retrieve data from.
    :param path: A list of keys/indexes representing the path to the desired value.
    :param default: The value to return if the path does not exist or is invalid.
    :return: The retrieved value or the default value.
    """
    # Private sentinel so that a missing key can be told apart from a stored
    # value (including a stored None) without leaking `default` into the walk.
    missing = object()

    for key in path:
        if isinstance(data, dict):
            data = data.get(key, missing)
            if data is missing:
                return default
        elif isinstance(data, list) and isinstance(key, int) and 0 <= key < len(data):
            data = data[key]
        else:
            # Not a container, wrong key type for a list, or index out of range.
            return default
    return data

def check_slice_or_single(data):
    """
    Normalises a value so that callers can treat it as a list.

    ``None`` and values that are already lists are handed back untouched;
    anything else is wrapped in a new one-element list.

    :param data: The data to check, which can be a list, a single value or None.
    :return: ``None`` if data is None, the same list if data is a list,
             otherwise ``[data]``.
    """
    already_normalised = data is None or isinstance(data, list)
    return data if already_normalised else [data]

def get_classification_codes(classification_list):
    """
    Processes the classification list and extracts the relevant classification codes.

    Entries are dictionaries with an ``'@type'`` and a ``'classification'``
    value. For types ``ASJC`` and ``SUBJABBR`` the code is read from the
    ``'$'`` key of each dict in a list, or the non-list value is taken as-is.
    For ``CPXCLASS`` and ``FLXCLASS`` the code is read from the
    ``'classification-code'`` key of a dict or of each dict in a list.
    Entries that are not dictionaries, and unknown types, are ignored.

    :param classification_list: List of classification entries. ``None`` is
        treated as an empty list and a single dict as a one-element list.
    :return: List of classification codes (empty if nothing was found).
    """
    if classification_list is None:
        return []
    if isinstance(classification_list, dict):
        # A lone entry that was not wrapped in a list; iterating it directly
        # would walk its keys and silently yield nothing.
        classification_list = [classification_list]

    classification_codes = []

    for classification in classification_list:
        if not isinstance(classification, dict):
            continue

        classification_type = classification.get('@type')
        classification_data = classification.get('classification')

        if classification_type in ('ASJC', 'SUBJABBR'):
            # Either a list of {'$': code} dicts or a single value.
            if isinstance(classification_data, list):
                classification_codes.extend(
                    item.get('$') for item in classification_data if isinstance(item, dict)
                )
            else:
                classification_codes.append(classification_data)

        elif classification_type in ('CPXCLASS', 'FLXCLASS'):
            # Either a list of dicts or a single dict carrying 'classification-code'.
            if isinstance(classification_data, list):
                classification_codes.extend(
                    item.get('classification-code')
                    for item in classification_data
                    if isinstance(item, dict)
                )
            elif isinstance(classification_data, dict):
                classification_codes.append(classification_data.get('classification-code'))

    return classification_codes


# Input directory (one JSON document per file) and the CSV file that receives
# one row per successfully parsed document.
folder_path = '2023'
output_csv = 'paper_2023.csv'

# Column order of the output CSV; keys of every written row must match these.
columns = [
    'pid','title', 'pub_date', 'abstract', 'language', 'ref_count',
    'citedby_count', 'author_id', 'subject_areas_id', 'keywords',
    'idxterms', 'classification_code'
]

with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        if not os.path.isfile(file_path):
            continue

        try:
            # Only the parse needs the file handle; release it before extracting.
            with open(file_path, 'r', encoding='utf-8') as fd:
                data = json.load(fd)

            # Shared sub-trees, looked up once. safe_get returns None for any
            # missing step, and safe_get(None, ...) is None again, so the
            # lookups below degrade to None exactly like the full paths would.
            response = safe_get(data, ['abstracts-retrieval-response'], None)
            head = safe_get(response, ['item', 'bibrecord', 'head'], None)

            title = safe_get(head, ['citation-title'], None)

            publication_date = safe_get(head, ['source', 'publicationdate'], None)
            pub_year = safe_get(publication_date, ['year'], None)
            pub_month = safe_get(publication_date, ['month'], None)
            pub_day = safe_get(publication_date, ['day'], None)
            # A date is only emitted when all three parts are present.
            pub_date = f"{pub_day}/{pub_month}/{pub_year}" if pub_year and pub_month and pub_day else None

            abstract = safe_get(head, ['abstracts'], None)

            language = safe_get(head, ['citation-info', 'citation-language', '@language'], None)

            ref_count = safe_get(response, ['item', 'bibrecord', 'tail', 'bibliography', '@refcount'], None)

            citedby_count = safe_get(response, ['coredata', 'citedby-count'], None)

            # For the list-valued fields below, a lone dict (not wrapped in a
            # list) is treated as a one-element list via check_slice_or_single;
            # iterating the dict directly would walk its keys and lose the value.
            authors_list = check_slice_or_single(safe_get(response, ['authors', 'author'], None))
            author_id = [author.get('@auid', None) for author in authors_list if isinstance(author, dict)] if authors_list else None

            subject_areas_list = check_slice_or_single(safe_get(response, ['subject-areas', 'subject-area'], None))
            subject_areas_id = [subject_area.get('@code', None) for subject_area in subject_areas_list if isinstance(subject_area, dict)] if subject_areas_list else None

            # Keywords: None when the document has no 'author-keyword' entry.
            raw_keywords = safe_get(response, ['authkeywords', 'author-keyword'], None)
            keywords = None
            if raw_keywords is not None:
                keywords = [k.get('$', None) for k in check_slice_or_single(raw_keywords) if isinstance(k, dict)]

            # Index terms: either {'mainterm': dict-or-list} or a bare list of dicts.
            idxterms_data = safe_get(response, ['idxterms'], None)
            idxterms = None
            if isinstance(idxterms_data, dict):
                mainterm = idxterms_data.get('mainterm')
                if isinstance(mainterm, dict):
                    mainterm = [mainterm]
                if isinstance(mainterm, list):
                    # Non-dict elements are skipped instead of raising.
                    idxterms = [i.get('$', None) for i in mainterm if isinstance(i, dict)]
            elif isinstance(idxterms_data, list) and idxterms_data:
                idxterms = [i.get('$', None) for i in idxterms_data if isinstance(i, dict)]

            # Classification codes: a missing entry becomes an empty list so the
            # record is still written rather than dropped by an exception.
            classification_list = check_slice_or_single(
                safe_get(head, ['enhancement', 'classificationgroup', 'classifications'], None)
            )
            classification_code = get_classification_codes(classification_list or [])

            writer.writerow({
                'pid': filename,
                'title': title,
                'pub_date': pub_date,
                'abstract': abstract,
                'language': language,
                'ref_count': ref_count,
                'citedby_count': citedby_count,
                'author_id': author_id,
                'subject_areas_id': subject_areas_id,
                'keywords': keywords,
                'idxterms': idxterms,
                'classification_code': classification_code
            })
        except Exception as e:
            # Per-file boundary: report the failure and continue with the next file.
            print(f"Error processing file {filename}: {e}")

print(f"Data extraction complete. Results saved to {output_csv}.")