# Exploratory Data Interactive Graph

In [None]:
import pandas as pd

# Locations of the two departmental CSV files, relative to the notebook's
# working directory.
STAFF_CSV = "Data/departmental_staff_data.csv"
PUBLICATIONS_CSV = "Data/departmental_publications_data.csv"

# Read both files into DataFrames.
staff = pd.read_csv(STAFF_CSV)
publications = pd.read_csv(PUBLICATIONS_CSV)

In [8]:
# Derive an integer 'Year' column from the last four characters of each
# 'Date' string.
year_text = publications['Date'].str.slice(start=-4)
publications['Year'] = year_text.astype(int)

# Distinct departments and years, in order of first appearance; these define
# the categories used by the cells below.
all_departments = pd.unique(publications['Department'])
all_years = pd.unique(publications['Year'])

In [9]:
# Build a grid with one row per (Department, Year) combination so that
# combinations without any publication are still present, with a count of 0.
index = pd.MultiIndex.from_product([all_departments, all_years], names=['Department', 'Year'])

# Construct the frame with integer zeros directly. Creating it empty (all-NaN,
# object dtype) and calling fillna(0) afterwards relies on pandas silently
# downcasting object columns, which is deprecated and leaves the counters as
# object dtype on newer pandas versions.
empty_df = pd.DataFrame(0, index=index, columns=['Total Publications', 'Total Authors'])

# Turn the Department/Year index levels back into ordinary columns.
empty_df = empty_df.reset_index()

In [10]:
# Tally publications and authors per (Department, Year) in a single grouped
# pass instead of scanning empty_df once for every publication row.
# 'size' counts every publication row in the group; 'sum' adds up the
# 'Number of Authors' values (missing values are skipped by sum).
publication_totals = (
    publications
    .groupby(['Department', 'Year'])['Number of Authors']
    .agg(['size', 'sum'])
)

# Align the totals with the existing rows of empty_df. Combinations that have
# no publications are absent from the grouped result and are filled with 0.
grid_keys = pd.MultiIndex.from_frame(empty_df[['Department', 'Year']])
publication_totals = publication_totals.reindex(grid_keys, fill_value=0)

# Assign (rather than accumulate with +=) so that re-running this cell gives
# the same totals instead of double-counting.
empty_df['Total Publications'] = publication_totals['size'].to_numpy()
empty_df['Total Authors'] = publication_totals['sum'].to_numpy()

In [12]:
# Average number of authors per publication for every (Department, Year) row.
# Rows with zero publications divide 0 by 0 and produce NaN here.
empty_df['Average Authors'] = empty_df['Total Authors'].div(empty_df['Total Publications'])

# Replace the NaN values with 0, then order the rows chronologically and
# renumber them from 0.
empty_df = (
    empty_df
    .fillna(0)
    .sort_values(by='Year')
    .reset_index(drop=True)
)

print(empty_df)

                                Department  Year  Total Publications  \
0                         Economic History  1959                   1   
1    Psychological and Behavioural Science  1959                   0   
2                            Social Policy  1959                   0   
3                  International Relations  1959                   0   
4                               Statistics  1959                   0   
..                                     ...   ...                 ...   
763  Psychological and Behavioural Science  2024                  44   
764                             Statistics  2024                  15   
765                                Finance  2024                  11   
766                           Anthropology  2024                   6   
767                            Mathematics  2024                  17   

     Total Authors  Average Authors  
0                1         1.000000  
1                0         0.000000  
2                0   

The bubbles in the plot below are sized by the average number of authors per publication. They could instead be sized by the total number of publications; it is not yet decided which is better.

In [18]:
import plotly.express as px

# Upper axis limits: the largest observed value plus a margin of 1. Passing
# explicit ranges overrides Plotly's automatic axis scaling.
x_upper = empty_df['Average Authors'].max() + 1
y_upper = empty_df['Total Publications'].max() + 1

# Animated bubble chart: one animation frame per year, one bubble per
# department, with the bubble size taken from the average number of authors.
fig = px.scatter(
    empty_df,
    x="Average Authors",
    y="Total Publications",
    size="Average Authors",
    size_max=60,
    color="Department",
    hover_name="Department",
    animation_frame="Year",
    animation_group="Department",
    range_x=[0, x_upper],
    range_y=[0, y_upper],
    height=400,
    title="Publications by Department and Year",
)

# Human-readable axis, legend and figure titles.
layout_titles = dict(
    xaxis_title="Average Number of Authors",
    yaxis_title="Total Publications",
    legend_title="Department",
    title="Publications by Department and Year",
)
fig.update_layout(**layout_titles)

# Render the figure.
fig.show()


TODO: maybe create more graphs below to show the correlation, plus line plots showing one measure over time and then the other measure over time. In the correlation plot, each point would be one (department, year) pair, and the correlation would be calculated from those points. Possibly add a line of best fit?

In [29]:
import pandas as pd
import plotly.express as px

# Correlation (pandas default method) between total publications and average
# authors, computed over every (Department, Year) row of empty_df.
# NOTE(review): rows with zero publications are included with an average of 0,
# which affects this coefficient - confirm that is intended.
correlation_coefficient = empty_df['Total Publications'].corr(empty_df['Average Authors'])
print(f"correlation coefficient: {correlation_coefficient}")

# Hover label for each point. Plotly renders text as a small HTML subset, so
# the line break must be "<br>"; a "\n" is not shown as a new line.
hover_text = (
    "Department: " + empty_df['Department'].astype(str)
    + "<br>Year: " + empty_df['Year'].astype(str)
)

# Scatter plot with one point per (Department, Year) row.
fig = px.scatter(empty_df, x='Total Publications', y='Average Authors', hover_name=hover_text)

# Titles; 'closest' shows the hover label of the point nearest the cursor.
fig.update_layout(
    title='Correlation between Total Publications and Average Authors',
    xaxis_title='Total Publications',
    yaxis_title='Average Authors',
    hovermode='closest'
)

fig.show()

correlation coefficient: 0.47069348461899096
