# Workflow
## Jocelyn Zhu, Kat Nykiel

In [None]:
# Import required libraries
import json
import numpy as np
import pandas as pd 
import plotly.graph_objects as go

### Load Expert-Institution Pairs

This `.csv` file contains the entity pairs obtained using named entity recognition, a type of natural language processing. This will work for now, but we will replace this file with a set of pairs obtained using LLMs.

In [None]:
# Read the CSV file of expert-institution entity pairs into a DataFrame
df = pd.read_csv('entity_pairs.csv',index_col=0) # index_col=0: use the file's first column as the row index
df.head()  # Preview the first rows as the cell output

## Analyze the frequency at which each university is cited

In [None]:
# Tally how many times each institution appears among the entity pairs
universities = df['Institution'].values

# value_counts() produces the per-institution tally in a single pass (instead of
# re-scanning the whole column once per unique name) and leaves out missing names.
institution_tally = df['Institution'].value_counts()
unique_universities = list(institution_tally.index)
# Kept as a float array, matching the np.zeros-based array this cell used to build
universities_counts = institution_tally.to_numpy(dtype=float)

# Display histogram of the per-institution citation counts
# (each x value is the number of rows in which one institution appears)
# NOTE(review): hovertext is given one label per institution, while a histogram
# draws one bar per bin, so the labels may not line up with the bars - confirm.
fig = go.Figure()
fig.add_trace(go.Histogram(x=universities_counts, hovertext = unique_universities))
fig.update_layout(xaxis_title='Institutions',template='simple_white', yaxis_title='Count', width=600,height=400, title='Distribution of Academic Institutions')
fig.show()

In [None]:
# Source: https://www.kaggle.com/datasets/theriley106/university-statistics?resource=download

# File path
file_path = "schoolinfo.json"

# Open and read the JSON file.
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default text encoding.
with open(file_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)  # Parsed JSON content, stored in `json_data`

# Cross-referencing CSV and JSON files
#
# Source: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.value_counts.html
# Idea:
#     - Create a dataframe for json_data
#     - Use the Pandas merge() to merge the two DataFrame objects
#     - OR: Search through filtered_articles list and json_data dictionary
#     - .count() or .value_counts()

# Creating DataFrame object for demographic data from the school, merging it with the entity pairs DataFrame.
# pd.merge defaults to an inner join, so only rows whose school 'displayName'
# exactly equals an entity-pair 'Institution' are kept.
demographic_df = pd.DataFrame.from_dict(json_data)
merge_df = pd.merge(demographic_df, df, left_on='displayName', right_on='Institution')
merge_df.head()


## Analyze patterns in institutions from which experts are cited

In [None]:
# Histograms of school attributes over the merged expert/institution rows

# One entry per figure, in display order:
# (merge_df column to plot, x-axis label, figure title)
histogram_specs = [
    ('institutionalControl', 'Institutional Control', 'Distribution of Institutional Control between Academic Institutions'),
    ('city', 'Location', 'Distribution of Academic Institution Locations'),
    ('sat-avg', 'SAT averages', 'Distribution of SAT Averages between Academic Institutions'),
]

for column, axis_label, figure_title in histogram_specs:
    # Every figure shares the same size, template and y-axis label
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=merge_df[column]))
    fig.update_layout(
        xaxis_title=axis_label,
        template='simple_white',
        yaxis_title='Count',
        width=600,
        height=400,
        title=figure_title,
    )
    fig.show()

In [None]:
# Plot histogram of university citation frequency, with separate traces for
# public and private universities

# Number of consecutive citation counts whose university names are pooled into
# one hover label.
# NOTE(review): this pooling presumes the histogram bars are `n` counts wide and
# start at 0; the bin settings are left to plotly here - confirm they match.
n = 5

fig = go.Figure()
for control in ['public','private']:

    # Institution column restricted to this control type (one entry per merged row)
    control_institutions = merge_df.loc[merge_df['institutionalControl'] == control, 'Institution']
    universities = control_institutions.values

    # Citations per university, computed in one pass
    citation_tally = control_institutions.value_counts()
    if citation_tally.empty:
        # No rows of this control type: skip the trace instead of failing on
        # the maximum of an empty tally.
        continue

    unique_universities = list(citation_tally.index)
    universities_counts = citation_tally.to_numpy(dtype=float)

    # Group university names by how many times each one is cited
    names_by_count = {}
    for university, count in citation_tally.items():
        names_by_count.setdefault(int(count), []).append(university)

    # hovertexts[c] lists every university cited exactly c times (empty if none)
    hovertexts = [
        ''.join(name + '<br>' for name in names_by_count.get(count, []))
        for count in range(int(citation_tally.max()) + 1)
    ]

    # Pool the per-count labels into groups of `n` consecutive counts
    binned_texts = [''.join(hovertexts[i:i+n]) for i in range(0, len(hovertexts), n)]

    # Display histogram of entity data
    fig.add_trace(go.Histogram(x=universities_counts, hovertext = binned_texts,name=control))

# Axis label typo fixed ('citiations' -> 'citations')
fig.update_layout(xaxis_title='# of citations',template='simple_white', yaxis_title='# of universities', width=800,height=600, title='Distribution of Academic Institutions',font=dict(size=16),hoverlabel=dict(font=dict(size=10)),legend=dict(x=.85,y=.2))
fig.show()


## Display the geographic distribution of expert institutions

Here, we show two figures: the first is a simple demo of this plotting method using population as the marker size, and the second is a set of markers indicating how often institutions in a given city are cited.

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Load the plotly sample table of US cities and strip surrounding whitespace
# from the city names (presumably so they compare equal to merge_df's 'city' values)
geo_df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')
geo_df['name'] = geo_df['name'].str.strip()

# Inner-join the school/expert rows with the city table on city name.
# NOTE(review): the join key is the city name alone; if the same name occurs
# more than once in geo_df, rows would be duplicated - confirm.
map_df = merge_df.merge(geo_df, left_on='city', right_on='name')

# Divisor applied to the 'pop' column to get a usable marker size
scale = 30000

# One marker per merged row, sized (by area) from the scaled 'pop' value
population_markers = go.Scattergeo(
    locationmode='USA-states',
    lon=map_df['lon'],
    lat=map_df['lat'],
    marker={
        'size': map_df['pop'] / scale,
        'sizemode': 'area',
        'line': {'color': 'rgb(40,40,40)', 'width': 0.5},
    },
)

fig = go.Figure(data=[population_markers])

# Restrict the map to the USA and use a light grey land colour
fig.update_layout(geo={'scope': 'usa', 'landcolor': 'rgb(217, 217, 217)'})

fig.show()

In [None]:
# Count how many rows of map_df fall in each city
cities = map_df['city'].values

# value_counts() tallies every city in one pass, replacing a nested loop that
# re-scanned the full column once per unique city.
city_tally = map_df['city'].value_counts()
unique_cities = list(city_tally.index)
# Float array aligned with unique_cities, matching the np.zeros-based array
# this cell used to build
city_counts = city_tally.to_numpy(dtype=float)


In [None]:
# Two-column table pairing each unique city with its row count
city_count_df = pd.DataFrame(dict(city=unique_cities, count=city_counts))

In [None]:
# Attach the matching geo_df row (joined on city name) to each city's count,
# keeping only cities present in both tables
city_count_df = city_count_df.merge(geo_df, left_on='city', right_on='name')
city_count_df.head()

In [None]:
# One marker per city, placed at the city's coordinates; marker area follows
# the city's count and the hover text shows the city name
citation_markers = go.Scattergeo(
    locationmode='USA-states',
    lon=city_count_df['lon'],
    lat=city_count_df['lat'],
    text=city_count_df['city'],
    marker={
        'size': city_count_df['count'],
        'sizemode': 'area',
        'line': {'color': 'rgb(40,40,40)', 'width': 0.5},
    },
)

fig = go.Figure(data=[citation_markers])

# USA-only map with light grey land, plus overall figure size, theme and title
fig.update_layout(
    title_text='Expert Citations in News Articles by City',
    template="simple_white",
    width=1000,
    height=800,
    geo={'scope': 'usa', 'landcolor': 'rgb(217, 217, 217)'},
)

fig.show()