In [None]:
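# NOTE: This cell is fully commented out and is kept only as a record of how the
# OpenAlex (journal, year) pairs were extracted from MongoDB. The next cell loads
# "openalex_journals.csv" instead -- presumably an export of this query's result
# (the export step itself is not shown in this cell; TODO confirm).
# To re-run it, uncomment the code and import pandas as `pd` first: the code
# below uses `pd.DataFrame`, but pandas is only imported in the following cell.

# import pymongo


# # --- Step 1: Extract Journals from MongoDB (OpenAlex Data) ---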

# # Connect to MongoDB
# client = pymongo.MongoClient('localhost', 27017)

# # Access the 'dd-usda' database and 'publications' collection
# db = client['dd-usda']
# collection = db['publications']

# # Define the projection to include only 'journals' and 'year'
# projection = {
#     'journals': 1,
#     'year': 1,
#     '_id': 0
# }

# query = {
#     'author_american': True,
#     '$or': [
#         {
#             'aliases': {
#                 '$in': [
#                     "USDA", "USDA Census", "US Department of Agriculture",
#                     "United States Department of Agriculture", "NASS", "NASS Census of Agriculture"
#                 ]
#             }
#         },
#         {
#             'second_alias': {
#                 '$in': [
#                     "USDA", "USDA Census", "US Department of Agriculture",
#                     "United States Department of Agriculture", "NASS", "NASS Census of Agriculture"
#                 ]
#             }
#         }
#     ]
# }

# # Retrieve all documents
# cursor = collection.find(query, projection)
# documents = list(cursor)

# # Extract (journal_title, year) pairs
# journal_year_list = []

# for doc in documents:
#     year = doc.get('year')
#     journals = doc.get('journals', [])
    
#     # Skip documents without a year
#     if year is None:
#         continue
    
#     # Process each journal
#     for journal in journals:
#         title = journal.get('name')
#         if title:
#             # Append the journal and year to the list
#             journal_year_list.append({'year': int(year), 'title': title.strip()})

# # Create DataFrame for OpenAlex journals
# openalex_journals = pd.DataFrame(journal_year_list)

# # Remove duplicates
# openalex_journals.drop_duplicates(inplace=True)

# # Ensure 'year' is of integer type
# openalex_journals['year'] = openalex_journals['year'].astype(int)

In [2]:
import pandas as pd
def _clean_journal_years(journals):
    """Return a tidy copy of a journal table that has `year` and `title` columns.

    The same cleaning is applied to both sources so they are comparable:
      * rows missing `year` or `title` are dropped (a missing title would
        otherwise be counted as a journal, and a missing year cannot be cast
        to int),
      * `year` is cast to int,
      * surrounding whitespace is stripped from `title`,
      * exact duplicate rows are removed (after stripping, so titles that
        differ only by padding collapse into one row).

    The input frame is not modified.
    """
    cleaned = journals.dropna(subset=['year', 'title']).copy()
    cleaned['year'] = cleaned['year'].astype(int)
    # astype(str) keeps the .str accessor usable even if the column was not
    # parsed as text (e.g. an all-empty column).
    cleaned['title'] = cleaned['title'].astype(str).str.strip()
    return cleaned.drop_duplicates()


def _titles_by_year(journals):
    """Map each year to the set of lower-cased, stripped titles seen that year.

    Lower-casing makes the cross-source comparison case-insensitive while the
    `title` column of the frame itself keeps its original casing.
    """
    title_keys = journals['title'].str.lower().str.strip()
    return {
        int(year): set(keys)
        for year, keys in title_keys.groupby(journals['year'])
    }


# --- Step 1 (cached): Load Journals from the OpenAlex CSV File ---
openalex_journals = _clean_journal_years(pd.read_csv("openalex_journals.csv"))

# --- Step 2: Load Journals from Scopus CSV File ---
# From Sciserver, database DemocratizingData_USDA_2023, query:
#   select distinct publication.year, journal.title
#   from publication join journal on publication.journal_id = journal.id
scopus_journals = _clean_journal_years(pd.read_csv('scopus_journals.csv'))

# --- Step 3: Determine Journals Unique to Each Source and Those in Both, by Year ---

# Build the per-year title sets once instead of re-filtering both frames for
# every year inside the loop.
oa_titles_by_year = _titles_by_year(openalex_journals)
scopus_titles_by_year = _titles_by_year(scopus_journals)

# Every year that appears in either source, in ascending order
years = sorted(set(oa_titles_by_year) | set(scopus_titles_by_year))

# One record per year with the three counts
results = []

for year in years:
    # A year missing from one source simply contributes an empty set
    oa_journals_set = oa_titles_by_year.get(year, set())
    scopus_journals_set = scopus_titles_by_year.get(year, set())

    # Journals only in OpenAlex
    only_openalex = oa_journals_set - scopus_journals_set

    # Journals only in Scopus
    only_scopus = scopus_journals_set - oa_journals_set

    # Journals in both
    both = oa_journals_set & scopus_journals_set

    results.append({
        'year': year,
        'only_openalex': len(only_openalex),
        'only_scopus': len(only_scopus),
        'both': len(both)
    })

# Create the results DataFrame. Passing `columns` keeps the expected schema
# even when both inputs are empty; rows are already in ascending year order
# because `years` is sorted.
results_df = pd.DataFrame(
    results, columns=['year', 'only_openalex', 'only_scopus', 'both']
)

# Display the results
print("\nJournal Counts by Year and Source:")
print(results_df)



Journal Counts by Year and Source:
    year  only_openalex  only_scopus  both
0   1991              0            1     0
1   1992              0            1     0
2   1993              0            2     0
3   1994              0            7     0
4   1995              0            2     0
5   1996              0           12     0
6   1997              0           11     0
7   1998              0           13     0
8   1999              0           15     0
9   2000              0           10     0
10  2001              0           13     0
11  2002              0           15     0
12  2003              0           21     0
13  2004              0           26     0
14  2005              0           26     0
15  2006              0           44     0
16  2007              0           29     0
17  2008              0           29     0
18  2009              0           49     0
19  2010              0           57     0
20  2011              0           46     0
21  2012          