In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, HuberRegressor
import pickle
import itertools
import time
import ot
import os
from scipy.stats import rankdata, spearmanr, kendalltau
from jenks import getJenksBreaks, classify, getGVF
import seaborn as sns
from datetime import datetime

In [2]:
# Number of groups later passed to getJenksBreaks when binning campuses.
# NOTE(review): n_group is reassigned to 4 in a later cell before it is first
# used, so this value has no effect on a top-to-bottom run — confirm intent.
n_group = 5

In [3]:
# Values of the 'Position' column that are included in the analysis.
# The order matters: later cells index their result arrays by position in this list.
prof = [
    'Prof-Ay-B/E/E',
    'Assoc Prof-Ay-B/E/E',
    'Asst Prof-Ay-B/E/E',
    'Postdoc-Employee',
]

In [4]:
# 'EmployerName' values whose rows are dropped from every yearly file on load.
uni_excluded = [
    'University of California, San Francisco',
    'University of California, Office of the President',
]

In [5]:
# Build `sum_df`: one row per wage record for the positions in `prof`,
# years 2017-2021, with a within-(position, year) wage rank.
years = np.arange(2017, 2022, 1)

# Read every yearly file exactly once (previously each file was re-read for
# every position) and drop the excluded employers up front.
raw_by_year = {}
for year in years:
    path = 'D:/UniversityOfCalifornia/{}_UniversityOfCalifornia.csv'.format(year)
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(path, encoding="utf8", errors='ignore') as f:
        raw = pd.read_csv(f)
    raw_by_year[year] = raw[~raw['EmployerName'].isin(uni_excluded)]

position_frames = []
for position in prof:
    print(position)
    for year in years:
        raw = raw_by_year[year]
        # .copy() makes the subset an independent frame, so adding a column
        # below does not write to a slice of `raw`.
        position_df = raw.loc[
            raw['Position'] == position,
            ['Year', 'EmployerName', 'Position', 'TotalWages'],
        ].copy()
        # WageRank 0 is the highest TotalWages in this (position, year) slice
        # across all retained employers; ties are broken by order of appearance.
        position_df['WageRank'] = rankdata(-position_df['TotalWages'].values, method='ordinal') - 1
        position_frames.append(position_df)

# The original prepended each new slice, so the newest slice came first;
# reversing the list before a single concat reproduces that row order.
sum_df = pd.concat(position_frames[::-1])

Prof-Ay-B/E/E
Assoc Prof-Ay-B/E/E
Asst Prof-Ay-B/E/E
Postdoc-Employee


In [6]:
# Preview the first rows of the combined wage table.
sum_df.head()

Unnamed: 0,Year,EmployerName,Position,TotalWages,WageRank
13802,2021,"University of California, Berkeley",Postdoc-Employee,64386,442
13803,2021,"University of California, Berkeley",Postdoc-Employee,59371,1287
13804,2021,"University of California, Berkeley",Postdoc-Employee,44732,3336
13805,2021,"University of California, Berkeley",Postdoc-Employee,32173,4035
13806,2021,"University of California, Berkeley",Postdoc-Employee,21342,4679


In [7]:
# Rows of sum_df whose Year is one of 2017, 2018, 2019, 2020, 2021.
df_5yr = sum_df.loc[sum_df['Year'].isin(list(range(2017, 2022)))]

In [10]:
# Largest single TotalWages value for each (Position, EmployerName) pair in df_5yr.
df_5yr.groupby(['Position', 'EmployerName'])['TotalWages'].max()

Position             EmployerName                           
Assoc Prof-Ay-B/E/E  University of California, Berkeley         582942
                     University of California, Davis            313901
                     University of California, Irvine           387084
                     University of California, Los Angeles      494989
                     University of California, Merced           230762
                     University of California, Riverside        301166
                     University of California, San Diego        436367
                     University of California, Santa Barbara    430169
                     University of California, Santa Cruz       235600
Asst Prof-Ay-B/E/E   University of California, Berkeley         356679
                     University of California, Davis            262833
                     University of California, Irvine           331450
                     University of California, Los Angeles      408104
                

In [None]:
# Number of 2021 wage records for each (Position, EmployerName) pair.
sum_df[sum_df['Year'] == 2021].groupby(['Position', 'EmployerName'])['TotalWages'].count()

## Restrict the analysis to the five years 2017–2021

In [9]:
# Build `grouped_df`: for every (position, year) pair, the median WageRank of
# each employer, tagged with the year and position it was computed for.
median_frames = []
for position in prof:
    for year in np.arange(2017, 2022, 1):
        in_slice = (sum_df['Year'] == year) & (sum_df['Position'] == position)
        medians = (
            sum_df.loc[in_slice]
            .groupby(['EmployerName'])['WageRank']
            .median()
            .reset_index()
            .assign(Year=year, Position=position)
        )
        median_frames.append(medians)

# Concatenate once instead of growing the frame inside the loop. The original
# prepended each slice, so the list is reversed to keep the same row order.
# The index is reset to 0..n-1, so label and position coincide.
grouped_df = pd.concat(median_frames[::-1]).reset_index(drop=True)

In [10]:
# Load the US News ranking table. Later cells read a 'University Name' column
# and one column per year named by the year as a string.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
usnews = pd.read_csv('D:/UniversityOfCalifornia/USNews_Ranking.csv')

In [11]:
# For each ranking year 2012-2023, add a column 'c_<year>' holding the 0-based
# ordinal rank of that year's values among the rows of `usnews`
# (smallest value -> 0; ties are broken by row order).
for year in np.arange(2012, 2024):
    source_col = str(year)
    ordinal_rank = rankdata(usnews[source_col].values, method='ordinal') - 1
    usnews.loc[:, 'c_{}'.format(source_col)] = ordinal_rank

In [12]:
# Attach to every row of `grouped_df` the US News ordinal rank ('c_<year>')
# of its employer for that row's year, stored as float in 'uni_rank'.
university = grouped_df['EmployerName'].unique()
n_obs = grouped_df.shape[0]

# Index the ranking table by name so each lookup is a direct label access
# rather than a boolean scan of `usnews` for every row.
rank_by_uni = usnews.set_index('University Name')

# Fail loudly on names that would otherwise give an empty or ambiguous match.
if not rank_by_uni.index.is_unique:
    raise ValueError("Duplicate 'University Name' entries in usnews")
missing_unis = sorted(set(university) - set(rank_by_uni.index))
if missing_unis:
    raise KeyError('No US News ranking for: {}'.format(missing_unis))

# Assigning a plain array is positional, so this does not depend on the
# index labels of `grouped_df`. dtype=float matches the column dtype the
# row-by-row assignment used to produce.
grouped_df['uni_rank'] = np.array(
    [
        rank_by_uni.at[uni, 'c_' + str(year)]
        for uni, year in zip(grouped_df['EmployerName'], grouped_df['Year'])
    ],
    dtype=float,
)

In [49]:
# Number of groups used by the two grouping cells below (passed to
# getJenksBreaks and used to flip the class index via `n_group - class`).
# NOTE(review): this overrides the n_group = 5 set near the top of the notebook.
n_group = 4

In [50]:
# Bin employers into `n_group` wage groups separately for each (position, year).
for position in prof:
    for year in np.arange(2017, 2022, 1):
        sliced_idx = (grouped_df['Position'] == position) & (grouped_df['Year'] == year)
        data = grouped_df.loc[sliced_idx, 'WageRank'].values
        n_obs = data.shape[0]

        # 0-based ordinal rank of the negated medians: the largest median
        # WageRank in the slice gets 0, the smallest gets n_obs - 1.
        wage_list = list(rankdata(-data, method='ordinal') - 1)
        wage_breaks = getJenksBreaks(wage_list, n_group)

        # Class index returned by classify() for each employer, held as floats.
        wage_group = np.array(
            [classify(rank_value, wage_breaks) for rank_value in wage_list],
            dtype=float,
        )

        # Store the flipped class index (n_group minus class) for the slice.
        grouped_df.loc[sliced_idx, 'wage_group'] = n_group - wage_group

In [51]:
# Bin employers into `n_group` ranking groups separately for each (position, year).
for position in prof:
    for year in np.arange(2017, 2022, 1):
        sliced_idx = (grouped_df['Position'] == position) & (grouped_df['Year'] == year)
        data = grouped_df.loc[sliced_idx, 'uni_rank'].values
        n_obs = data.shape[0]

        # 0-based ordinal rank of the negated uni_rank values: the largest
        # uni_rank in the slice gets 0, the smallest gets n_obs - 1.
        uni_list = list(rankdata(-data, method='ordinal') - 1)
        uni_breaks = getJenksBreaks(uni_list, n_group)

        # Class index returned by classify() for each employer, held as floats.
        uni_group = np.array(
            [classify(rank_value, uni_breaks) for rank_value in uni_list],
            dtype=float,
        )

        # Store the flipped class index (n_group minus class) for the slice.
        grouped_df.loc[sliced_idx, 'uni_group'] = n_group - uni_group

In [34]:
# Display the per-(position, year, employer) table with rank and group columns.
# NOTE(review): the saved output contains group values of 4.0, which looks like
# it was produced with a different n_group than the 4 set above — re-run to confirm.
grouped_df

Unnamed: 0,EmployerName,WageRank,Year,Position,uni_rank,wage_group,uni_group
0,"University of California, Berkeley",3268.5,2021,Postdoc-Employee,1.0,3.0,0.0
1,"University of California, Davis",3039.0,2021,Postdoc-Employee,5.0,1.0,2.0
2,"University of California, Irvine",3215.5,2021,Postdoc-Employee,4.0,3.0,2.0
3,"University of California, Los Angeles",2822.0,2021,Postdoc-Employee,0.0,0.0,0.0
4,"University of California, Merced",3604.0,2021,Postdoc-Employee,8.0,4.0,4.0
...,...,...,...,...,...,...,...
175,"University of California, Merced",850.5,2017,Prof-Ay-B/E/E,7.0,3.0,3.0
176,"University of California, Riverside",696.5,2017,Prof-Ay-B/E/E,8.0,2.0,4.0
177,"University of California, San Diego",557.0,2017,Prof-Ay-B/E/E,4.0,1.0,2.0
178,"University of California, Santa Barbara",541.0,2017,Prof-Ay-B/E/E,2.0,0.0,1.0


In [53]:
# Spearman and Kendall correlation between median WageRank and uni_rank for
# each position, pooled over all years present in `grouped_df`.
# Arrays are sized from `prof` so they stay in step with the position list.
s_rho = np.zeros(len(prof))
k_rho = np.zeros(len(prof))
for k, title in enumerate(prof):
    # Filter once and reuse the two-column frame for both correlation methods.
    rank_pairs = grouped_df.loc[grouped_df['Position'] == title, ['WageRank', 'uni_rank']]
    # .corr() returns a 2x2 matrix; [0, 1] is the off-diagonal coefficient.
    s_rho[k] = rank_pairs.corr(method='spearman').iloc[0, 1]
    k_rho[k] = rank_pairs.corr(method='kendall').iloc[0, 1]
    print(s_rho[k], title)

0.7800520078661819 Prof-Ay-B/E/E
0.746583336330717 Assoc Prof-Ay-B/E/E
0.8675630561325167 Asst Prof-Ay-B/E/E
0.4029362768914849 Postdoc-Employee


In [54]:
# Spearman coefficients per position (same order as `prof`), rounded to 3 decimals.
np.round(s_rho, 3)

array([0.78 , 0.747, 0.868, 0.403])

In [55]:
# Kendall coefficients per position (same order as `prof`), rounded to 3 decimals.
np.round(k_rho, 3)

array([0.616, 0.585, 0.707, 0.298])

In [56]:
# Unrounded Spearman coefficients per position.
s_rho

array([0.78005201, 0.74658334, 0.86756306, 0.40293628])

In [57]:
# Unrounded Kendall coefficients per position.
k_rho

array([0.61613575, 0.58538198, 0.70663379, 0.29784267])

In [52]:
# Mean absolute gap between ranking group and wage group for each position,
# then its rank correlation with the per-position correlation coefficients.
# Sized from `prof` so it stays in step with s_rho / k_rho.
diff = np.zeros(len(prof))
for k, title in enumerate(prof):
    sliced_df = grouped_df[grouped_df['Position'] == title]
    diff[k] = (sliced_df['uni_group'] - sliced_df['wage_group']).abs().mean()
    print('Title', title, 'Diff', diff[k])

# Each correlation below is computed from one point per position only.
print(spearmanr(s_rho, diff))
print(kendalltau(k_rho, diff))

Title Prof-Ay-B/E/E Diff 0.4888888888888889
Title Assoc Prof-Ay-B/E/E Diff 0.5777777777777777
Title Asst Prof-Ay-B/E/E Diff 0.26666666666666666
Title Postdoc-Employee Diff 0.9777777777777777
SpearmanrResult(correlation=-1.0, pvalue=0.0)
KendalltauResult(correlation=-1.0, pvalue=0.08333333333333333)


In [None]:
# 2 groups
Title Prof-Ay-B/E/E Diff 0.2222222222222222
Title Assoc Prof-Ay-B/E/E Diff 0.17777777777777778
Title Asst Prof-Ay-B/E/E Diff 0.13333333333333333
Title Postdoc-Employee Diff 0.4444444444444444
SpearmanrResult(correlation=-0.7999999999999999, pvalue=0.20000000000000007)
KendalltauResult(correlation=-0.6666666666666669, pvalue=0.3333333333333333)

In [None]:
# 3 groups
Title Prof-Ay-B/E/E Diff 0.26666666666666666
Title Assoc Prof-Ay-B/E/E Diff 0.4
Title Asst Prof-Ay-B/E/E Diff 0.2222222222222222
Title Postdoc-Employee Diff 0.6666666666666666
SpearmanrResult(correlation=-1.0, pvalue=0.0)
KendalltauResult(correlation=-1.0, pvalue=0.08333333333333333)

In [None]:
# 4 groups
Title Prof-Ay-B/E/E Diff 0.4888888888888889
Title Assoc Prof-Ay-B/E/E Diff 0.5777777777777777
Title Asst Prof-Ay-B/E/E Diff 0.26666666666666666
Title Postdoc-Employee Diff 0.9777777777777777
SpearmanrResult(correlation=-1.0, pvalue=0.0)
KendalltauResult(correlation=-1.0, pvalue=0.08333333333333333)

In [None]:
# 5 groups
Title Prof-Ay-B/E/E Diff 0.8
Title Assoc Prof-Ay-B/E/E Diff 0.5777777777777777
Title Asst Prof-Ay-B/E/E Diff 0.4444444444444444
Title Postdoc-Employee Diff 1.1111111111111112
SpearmanrResult(correlation=-0.7999999999999999, pvalue=0.20000000000000007)
KendalltauResult(correlation=-0.6666666666666669, pvalue=0.3333333333333333)

In [None]:
# 6 groups
Title Prof-Ay-B/E/E Diff 1.0222222222222221
Title Assoc Prof-Ay-B/E/E Diff 0.7555555555555555
Title Asst Prof-Ay-B/E/E Diff 0.4888888888888889
Title Postdoc-Employee Diff 1.288888888888889
SpearmanrResult(correlation=-0.7999999999999999, pvalue=0.20000000000000007)
KendalltauResult(correlation=-0.6666666666666669, pvalue=0.3333333333333333)