In [1]:
import requests
import json

In [2]:
import os

# The URL of the API endpoint
url = 'https://experts.illinois.edu/ws/api/524/research-outputs'

# Fields to request from the API, with nested paths spelled out explicitly
fields = 'uuid,title.value,subTitle.value,publicationStatuses.publicationDate.year,personAssociations.person.uuid,personAssociations.person.name.text.value,electronicVersions.doi,keywordGroups.keywordContainers.structuredKeyword.term.text.value,abstract.text.value'

# The API key is a credential: read it from the environment instead of
# committing it with the notebook.
api_key = os.environ.get('EXPERTS_API_KEY')
if not api_key:
    raise RuntimeError("Set the EXPERTS_API_KEY environment variable to your API key before running this cell.")

params = {
    'apiKey': api_key,
    'size': 20,
    'offset': 0,
    'fields': fields
}

# Headers to request a JSON response
headers = {
    'Accept': 'application/json',
}

# Making the API call; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, headers=headers, params=params, timeout=60)

if response.status_code == 200:
    data = response.json()
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}, Response content: {response.text}")
    data = {'items': []}  # Default to an empty list for further processing

# Initialize a list to hold the extracted and refined information
refined_info = []

# Iterate through each item in the items list
for item in data.get('items', []):
    # Optional sections can be missing, null, or empty lists; normalise them
    # once so the lookups below cannot raise KeyError / IndexError.
    statuses = item.get('publicationStatuses') or []
    keyword_groups = item.get('keywordGroups') or [{}]
    abstract_texts = (item.get('abstract') or {}).get('text') or []

    # Extract relevant article information ('N/A' marks a missing value)
    article_info = {
        'uuid': item['uuid'],
        'title': item['title']['value'],
        'subtitle': item.get('subTitle', {}).get('value', 'N/A'),
        'publication_year': (statuses[0].get('publicationDate') or {}).get('year', 'N/A') if statuses else 'N/A',
        # Only the first keyword group is read, as before
        'keywords': [
            kw['structuredKeyword']['term']['text'][0]['value']
            for kw in keyword_groups[0].get('keywordContainers', [])
            if kw.get('structuredKeyword')
        ],
        # First electronic version that carries a DOI, or None
        'doi': next((ev['doi'] for ev in item.get('electronicVersions', []) if ev.get('doi')), None),
        'authors': [],
        'abstract': abstract_texts[0].get('value', 'N/A') if abstract_texts else 'N/A'
    }

    # Extract author names and UUIDs from personAssociations
    for pa in item.get('personAssociations', []):
        person = pa.get('person') or {}
        if 'name' not in person:
            continue
        name_texts = (person['name'] or {}).get('text') or [{}]
        name_parts = (name_texts[0].get('value') or '').split()
        if len(name_parts) >= 2:
            author_name = ' '.join(name_parts)
        elif name_parts:
            # Single-token names keep the placeholder surname
            author_name = f"{name_parts[0]} Unknown"
        else:
            # Blank name: previously an IndexError
            author_name = "Unknown"

        # Add author regardless of name being known or unknown
        author_uuid = person.get('uuid', 'N/A')
        article_info['authors'].append({'name': author_name, 'uuid': author_uuid})

    # Add the article info to the list without checking if authors exist
    refined_info.append(article_info)

# Convert the refined information to a JSON string for output or saving to a file
refined_json = json.dumps(refined_info, indent=4)

# Print the cleaned data
print(refined_json)

# Save the cleaned data to a file
filename = 'cleaned_research_data.json'
with open(filename, 'w') as f:
    f.write(refined_json)
print(f"Cleaned data saved to {filename}")


[
    {
        "uuid": "478464b1-4c9c-416a-97f9-e870286dd9f4",
        "title": "The teacher as designer",
        "subtitle": "Pedagogy in the new media age",
        "publication_year": 2010,
        "keywords": [
            "Education",
            "Computer Science Applications"
        ],
        "doi": "https://doi.org/10.2304/elea.2010.7.3.200",
        "authors": [
            {
                "name": "Mary Kalantzis",
                "uuid": "e51bcaac-1bab-4627-8b7c-b08dd4376a3e"
            },
            {
                "name": "William Cope",
                "uuid": "f22afb20-7074-4c82-a8f9-9d2f655fac76"
            }
        ],
        "abstract": "<p>This article outlines a learning intervention which the authors call Learning by Design. The goal of this intervention is classroom and curriculum transformation, and the professional learning of teachers. The experiment involves the practical application of the learning theory to everyday classroom practice. Its ideas a

In [2]:
def _author_display_name(raw_name):
    """Collapse whitespace in an author name; one-word names get the surname "Unknown"."""
    parts = (raw_name or '').split()
    if not parts:
        return "Unknown"
    if len(parts) == 1:
        return f"{parts[0]} Unknown"
    return ' '.join(parts)


def _refine_item(item, include_keywords=False):
    """Flatten one raw research-output item into the record layout saved to disk."""
    # Optional sections can be missing, null, or empty lists; normalise them
    # once so the lookups below cannot raise KeyError / IndexError.
    statuses = item.get('publicationStatuses') or []
    abstract_texts = (item.get('abstract') or {}).get('text') or []

    # Keys are inserted in the original order so the JSON layout (and the
    # DataFrame column order derived from it) does not change.
    article_info = {
        'uuid': item['uuid'],
        'title': item['title']['value'],
        'subtitle': item.get('subTitle', {}).get('value', 'N/A'),
        'publication_year': (statuses[0].get('publicationDate') or {}).get('year', 'N/A') if statuses else 'N/A',
    }
    if include_keywords:
        # Only the first keyword group is read
        groups = item.get('keywordGroups') or [{}]
        article_info['keywords'] = [
            kw['structuredKeyword']['term']['text'][0]['value']
            for kw in groups[0].get('keywordContainers', [])
            if kw.get('structuredKeyword')
        ]
    # First electronic version that carries a DOI, or None
    article_info['doi'] = next((ev['doi'] for ev in item.get('electronicVersions', []) if ev.get('doi')), None)
    article_info['authors'] = []
    article_info['abstract'] = abstract_texts[0].get('value', 'N/A') if abstract_texts else 'N/A'

    for pa in item.get('personAssociations', []):
        person = pa.get('person') or {}
        if 'name' not in person:
            continue
        name_texts = (person['name'] or {}).get('text') or [{}]
        article_info['authors'].append({
            'name': _author_display_name(name_texts[0].get('value')),
            'uuid': person.get('uuid', 'N/A'),
        })
    return article_info


def fetch_and_process_articles(api_key=None, page_size=1000, total_articles=None,
                               output_path='research_data.json', include_keywords=False,
                               session=None, timeout=60):
    """Page through the research-outputs endpoint and save the refined records as JSON.

    Args:
        api_key: API key for the endpoint. When omitted it is read from the
            EXPERTS_API_KEY environment variable, so no credential lives in the notebook.
        page_size: Number of records requested per call.
        total_articles: Optional upper bound on the offset to page through. When
            omitted, the ``count`` field of the response is used if the response
            provides one; paging always stops at the first empty page.
        output_path: Destination of the JSON file.
        include_keywords: Also extract the ``keywords`` list for every article.
        session: Object exposing a requests-compatible ``get``; defaults to the
            ``requests`` module.
        timeout: Per-request timeout in seconds.

    Returns:
        None. Records are written to ``output_path`` and a confirmation is printed.

    Raises:
        ValueError: If no API key is available or ``page_size`` is not positive.
    """
    import os

    if api_key is None:
        api_key = os.environ.get('EXPERTS_API_KEY')
    if not api_key:
        raise ValueError("No API key: pass api_key=... or set the EXPERTS_API_KEY environment variable.")
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer.")

    url = 'https://experts.illinois.edu/ws/api/524/research-outputs'
    fields = 'uuid,title.value,subTitle.value,publicationStatuses.publicationDate.year,personAssociations.person.uuid,personAssociations.person.name.text.value,electronicVersions.doi,keywordGroups.keywordContainers.structuredKeyword.term.text.value,abstract.text.value'
    headers = {
        'Accept': 'application/json',
    }
    http = session if session is not None else requests

    all_refined_info = []
    offset = 0
    total = total_articles
    while total is None or offset < total:
        params = {
            'apiKey': api_key,
            'size': page_size,
            'offset': offset,
            'fields': fields
        }
        response = http.get(url, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            # Best effort: report the failure and keep whatever was collected so far
            print(f"Failed to retrieve data. Status code: {response.status_code}, Response content: {response.text}")
            break

        data = response.json()
        items = data.get('items', [])
        if not items:
            # Nothing left to read; avoids issuing empty requests up to a stale total
            break
        if total is None and isinstance(data.get('count'), int):
            total = data['count']

        all_refined_info.extend(_refine_item(item, include_keywords) for item in items)
        offset += page_size

    with open(output_path, 'w') as f:
        json.dump(all_refined_info, f, indent=4)
    print(f"Cleaned data saved to {output_path}")

# Run the harvest defined in this cell; it writes the JSON file and prints a confirmation.
fetch_and_process_articles()


Cleaned data saved to research_data.json


In [3]:
import pandas as pd

# Reload the harvested records from disk; each JSON object becomes one row.
with open('research_data.json', 'r') as file:
    data = json.load(file)
df = pd.DataFrame(data)

# The row count is the number of harvested articles.
number_of_articles = df.shape[0]
print(number_of_articles)

225492


In [4]:
# Fold the subtitle into the title ("Title: Subtitle") and drop the helper column.
# Guarded so that re-running the cell after 'subtitle' is gone is a no-op
# instead of a KeyError.
if 'subtitle' in df.columns:
    # A null subtitle is treated like the 'N/A' placeholder (no subtitle)
    subtitle = df['subtitle'].fillna("N/A")
    df['title'] = df['title'].mask(subtitle != "N/A", df['title'] + ": " + subtitle)
    df = df.drop(columns=['subtitle'])

# Strip one leading <p> and one trailing </p> from the abstract markup
df['abstract'] = df['abstract'].str.replace(r'^<p>', '', regex=True).str.replace(r'</p>$', '', regex=True)
#df['keywords'] = df['keywords'].apply(lambda x: ', '.join(x))

In [5]:
# Persist the cleaned article table as tab-separated text, without the index.
# The list-valued 'authors' column is written as its Python repr; a later cell
# parses it back with ast.literal_eval.
df.to_csv('articles_final.tsv', sep='\t', index=False)

In [9]:
# Also write the first 50 rows as a small sample file.
df.head(50).to_csv('articles_sample.tsv', sep='\t', index=False)

In [6]:
# Inspect a single record as a sanity check.
# Note: iloc[5] is the sixth row by position, despite the variable name.
first_row = df.iloc[5]
first_row

uuid                             5d1e0f52-dbe6-4c43-8f77-c4e4b91b7147
title               Thermal conductivity of κ-Al<sub>2</sub>O<sub>...
publication_year                                                 1998
doi                                  https://doi.org/10.1063/1.367500
authors             [{'name': 'David G Cahill', 'uuid': '40e7c853-...
abstract            The thermal conductivities of α-Al<sub>2</sub>...
Name: 5, dtype: object

In [23]:
import ast

# Load the provided TSV files
# NOTE(review): people.tsv is not created anywhere in the visible notebook — confirm its provenance.
people_df = pd.read_csv('people.tsv', sep='\t')
# articles_final.tsv is the table written earlier with df.to_csv
articles_df = pd.read_csv('articles_final.tsv', sep='\t')

def extract_author_uuids(authors_column):
    """Return, for every cell of ``authors_column``, the list of author UUIDs it holds.

    Each cell is expected to be the string repr of a list of author dicts.
    Cells that ``ast.literal_eval`` rejects (ValueError / SyntaxError) yield
    an empty list; authors without a 'uuid' key are skipped.
    """
    def _uuids_in(cell):
        try:
            parsed = ast.literal_eval(cell)
            return [entry['uuid'] for entry in parsed if 'uuid' in entry]
        except (ValueError, SyntaxError):
            return []

    return [_uuids_in(cell) for cell in authors_column]

# Attach the parsed author UUID lists and rename the article key to 'article_id'
articles_df = (
    articles_df
    .assign(author_uuids=extract_author_uuids(articles_df['authors']))
    .rename(columns={'uuid': 'article_id'})
)

# One row per (article, author UUID); the exploded column becomes the join key 'people_id'
articles_exploded_df = (
    articles_df
    .explode('author_uuids')
    .rename(columns={'author_uuids': 'people_id'})
)

# Give the people table the same key name
people_df = people_df.rename(columns={'uuid': 'people_id'})

# Left join: every exploded article row is kept, with people columns filled in where the key matches
merged_df = articles_exploded_df.merge(people_df, how='left', on='people_id')



In [24]:
# Row count after the explode + left merge (rows are article/author pairings, not articles).
len(merged_df)

251456

In [25]:
# Remove the join key and the raw author list, then persist the merged table as TSV.
final_df = merged_df.drop(['people_id', 'authors'], axis='columns')
final_df.to_csv('final_data.tsv', sep='\t', index=False)

In [26]:
# Input files (adjust the paths to your environment)
final_data_path = 'final_data.tsv'
faculty_final_path = 'faculty_final.tsv'

# Read the data from the TSV files
final_data = pd.read_csv(final_data_path, sep='\t')
faculty_final = pd.read_csv(faculty_final_path, sep='\t')

# Clean and normalize email addresses by stripping whitespaces and converting to lowercase
final_data['email_clean'] = final_data['email'].str.strip().str.lower()
faculty_final['contact_clean'] = faculty_final['contact'].str.strip().str.lower()

# Build the set of usable faculty e-mails. Missing and blank contacts are
# excluded: isin() treats NaN as equal to NaN, so a single faculty row without
# a contact would otherwise let every article row without an e-mail through.
faculty_emails = faculty_final['contact_clean'].dropna()
faculty_emails = faculty_emails[faculty_emails != '']

# Filter the final_data based on emails present in faculty_final
filtered_data = final_data[final_data['email_clean'].isin(faculty_emails)]

# Drop the helper column ('contact_clean' lives on faculty_final, not here)
filtered_data = filtered_data.drop(columns=['email_clean'])

# Save the result
filtered_data.to_csv('filtered_faculty_articles.tsv', sep='\t', index=False)

# Show the filtered data
print(filtered_data)


                                  article_id  \
11240   6c6cfc09-745d-48cf-919e-378fd2590d48   
15962   94cf7c32-af6a-4dac-8166-a4c504d71ac3   
16201   c594a9ff-f58f-4054-8e8f-01de30572fa3   
16217   e3378ce2-2e32-4eb3-9992-b519854f1028   
16224   f01839c5-0cba-43cf-8b94-840ae5771ec7   
...                                      ...   
250619  b4ad2ef0-7743-4519-88e7-f6bec285a6c2   
250620  d675d9ea-4592-48d1-8883-4c1a136137c5   
250621  85440571-ac5d-4cb0-aefa-1732cab17a33   
250622  1d756efb-d2ab-4df4-b30f-c7f0dcec4244   
251443  1737e507-d6f3-4419-b5ff-b5eee80c3f2a   

                                                    title  publication_year  \
11240   Predicting individuals' learning success from ...              2011   
15962   Introducing mobile technology in graduate prof...              2014   
16201       Value and growth investing: Review and update              2004   
16217   The Information Environment of China's A and B...              1999   
16224   A cross-correlation 

In [29]:
# Preview the first five faculty-matched rows.
# NOTE(review): the rendered output includes e-mail addresses — clear it before sharing the notebook.
filtered_data.head(5)

Unnamed: 0,article_id,title,publication_year,doi,abstract,email,name,organization,about
11240,6c6cfc09-745d-48cf-919e-378fd2590d48,Predicting individuals' learning success from ...,2011,https://doi.org/10.1371/journal.pone.0016093,Performance in most complex cognitive and psyc...,dsimons@illinois.edu,Daniel J Simons,"Business Administration, Charles H. Sandage De...",
15962,94cf7c32-af6a-4dac-8166-a4c504d71ac3,Introducing mobile technology in graduate prof...,2014,https://doi.org/10.2304/elea.2014.11.6.543,The insertion of mobile technology in educatio...,gopesh@illinois.edu,Gopesh Jankinath Anand,"Business Administration, Gies College of Business",
16201,c594a9ff-f58f-4054-8e8f-01de30572fa3,Value and growth investing: Review and update,2004,https://doi.org/10.2469/faj.v60.n1.2593,A great deal of academic empirical research ha...,l-chan2@illinois.edu,Kuo Chi Chan,"Gies College of Business, Finance",
16217,e3378ce2-2e32-4eb3-9992-b519854f1028,The Information Environment of China's A and B...,1999,https://doi.org/10.1016/S0020-7063(99)00039-4,"In 1990, three stock exchanges were opened in ...",rashad@illinois.edu,Ahmed Rashad Abdel-Khalik,"Accountancy, Center for Global Studies",
16224,f01839c5-0cba-43cf-8b94-840ae5771ec7,A cross-correlation analysis of Mg II absorpti...,2009,https://doi.org/10.1088/0004-637X/698/1/819,We analyze the cross-correlation of Mg II (λ27...,bigdog@illinois.edu,Robert J Brunner,"Gies College of Business, Astronomy, Accountan...",Robert J. Brunner is a professor in the School...


In [None]:
# NOTE(review): gies_data.tsv is not written by any visible cell — confirm where it comes from.
gies_data_file_path = 'gies_data.tsv'
gies_data = pd.read_csv(gies_data_file_path, sep='\t')

# Give each article a short integer id: rows sharing an article_id share a
# number, and numbering starts at 1.
article_codes, article_ids = pd.factorize(gies_data['article_id'])
article_mapping = article_codes + 1
gies_data = gies_data.assign(article_number=article_mapping)

# Persist the numbered table
gies_data.to_csv('gies_final.tsv', sep='\t', index=False)

In [32]:
file_path = 'gies_final.tsv'  # Replace with your file path
data = pd.read_csv(file_path, sep='\t')

# Keep article-level columns only, then one row per article_id (first occurrence wins)
person_columns = ['email', 'name', 'organization', 'about']
data_cleaned = (
    data
    .drop(columns=person_columns)
    .drop_duplicates(subset='article_id')
)

# Write the article-only table to a new file
data_cleaned.to_csv('cleaned_articles_data.tsv', sep='\t', index=False)

In [1]:
import os
from openai import AzureOpenAI

In [6]:
from getpass import getpass

# The API key is a secret and must not be stored in the notebook. Use the value
# already present in the environment, otherwise prompt for it without echoing.
# The key that was previously committed here should be treated as compromised
# and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("Azure OpenAI API key: ")

# The endpoint is not a secret; keep the default but let the environment override it.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://dpi2024.openai.azure.com/")

In [9]:
# Build the Azure OpenAI client from the environment variables set in the previous cell.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_version="2024-03-01-preview",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
)

In [56]:
def analyze_article(df, llm_client=None):
    """Ask the chat model whether one article relates to the UN sustainability goals.

    Args:
        df: A single article record (e.g. a DataFrame row or a dict) with
            'title' and 'abstract'; 'keywords' is optional and rendered as
            'N/A' when absent.
        llm_client: Optional client exposing ``chat.completions.create``.
            Defaults to the notebook-level ``client``.

    Returns:
        The model's reply as a JSON-formatted string with the keys
        "Related goals" and "result" (as requested in the prompt).
    """
    chat_client = client if llm_client is None else llm_client
    # .get() works for both a pandas Series and a dict, and avoids a KeyError
    # for tables that were harvested without keywords.
    keywords = df.get('keywords', 'N/A')
    prompt = f"""
    Analyze the following article if it is related to any of the UN's 17 Sustainability goal:
    title: {df['title']}
    abstract: {df['abstract']}
    keywords: {keywords}
    
    Respond with JSON that has two keys: "Related goals" and "result", where related goals are intergers of goal number or 'None' and result is ONLY 'yes' or 'no'
    """
    response = chat_client.chat.completions.create(
        model="gpt4-dpi",
        messages=[
            # The system message used to be cut off mid-sentence; state the task in full
            {"role": "system", "content": "Assistant is a classifier that decides whether a research article relates to any of the UN's 17 Sustainable Development Goals and replies in JSON."},
            {"role": "user", "content": prompt}
        ],
        temperature = 0.1,
        response_format = {"type":"json_object"}
    )
    return response.choices[0].message.content

In [65]:
# Pick one record to try the classifier on in a later cell.
# Note: iloc[7] is the eighth row by position, despite the variable name.
first_row = df.iloc[7]
first_row

uuid                             5131cbae-ba2c-4590-8957-4889c0a4aa0b
title               Laser-driven flyer plates for shock compressio...
publication_year                                                 2014
keywords                                              Instrumentation
doi                                 https://doi.org/10.1063/1.4871361
authors             [{'name': 'Dana D Dlott', 'uuid': '8eaa816f-36...
abstract            We investigated the launch and target impact o...
Name: 7, dtype: object

In [66]:
# NOTE(review): stray reminder below, presumably a model name; nothing in this cell uses it.
#text-embedding-3-large

# Classify the selected row and show the reply returned by analyze_article.
print(analyze_article(first_row))

{
  "Related goals": [
    9
  ],
  "result": "yes"
}


In [3]:
import json
import pandas as pd
# Rebuild the cleaned article table from the harvested JSON.
with open('research_data.json', 'r') as file:
    data = json.load(file)
df = pd.DataFrame(data)

# Fold the subtitle into the title ("Title: Subtitle"); a null subtitle is
# treated like the 'N/A' placeholder instead of crashing the concatenation.
subtitle = df['subtitle'].fillna("N/A")
df['title'] = df['title'].mask(subtitle != "N/A", df['title'] + ": " + subtitle)

# Strip one leading <p> and one trailing </p> from the abstract markup
df['abstract'] = df['abstract'].str.replace(r'^<p>', '', regex=True).str.replace(r'</p>$', '', regex=True)
df = df.drop(columns=['subtitle'])

# The harvest may have been run without keywords; only flatten the lists when
# the column exists, and leave non-list values untouched.
if 'keywords' in df.columns:
    df['keywords'] = df['keywords'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [5]:
# Per publication year, count the non-null values in every column.
df.groupby('publication_year').count()

Unnamed: 0_level_0,uuid,title,keywords,doi,authors,abstract
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1957,1,1,1,1,1,1
1958,1,1,1,1,1,1
1959,1,1,1,1,1,1
1960,2,2,2,2,2,2
1961,10,10,10,10,10,10
...,...,...,...,...,...,...
2020,9074,9074,9074,8033,9074,9074
2021,8737,8737,8737,7788,8737,8737
2022,7676,7676,7676,6878,7676,7676
2023,6646,6646,6646,6122,6646,6646


In [9]:
# Keep articles published in 2020 or later. The harvest stores 'N/A' when the
# year is missing; coercing to numbers turns such placeholders into NaN (which
# compares False) instead of raising a str-vs-int TypeError.
df2 = df[pd.to_numeric(df['publication_year'], errors='coerce') >= 2020]

In [10]:
# Number of articles published in 2020 or later.
len(df2)

32793