In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, HuberRegressor
import pickle
import itertools
import time
import ot
import os
from scipy.stats import rankdata
from jenks import getJenksBreaks, classify, getGVF
import seaborn as sns
from datetime import datetime
from scipy.signal import savgol_filter
from scipy.ndimage import gaussian_filter1d

In [2]:
# Number of classes requested from getJenksBreaks for both the wage and the
# university-rank groupings; group labels are later computed as n_group - class.
n_group = 5

In [3]:
# Position titles to analyse, spelled exactly as they are compared against the
# 'Position' column of the yearly payroll files.
prof = [
    'Prof-Ay-B/E/E',
    'Assoc Prof-Ay-B/E/E',
    'Asst Prof-Ay-B/E/E',
    'Postdoc-Employee',
    'Teachg Asst-Gship',
]

In [4]:
# Employers whose rows are dropped from every yearly payroll file before ranking.
uni_excluded = [
    'University of California, San Francisco',
    'University of California, Office of the President',
]

In [5]:
# Build sum_df: one row per employee for every (position, year), carrying a
# 0-based WageRank within that position/year (0 = highest TotalWages).
#
# Each yearly CSV is parsed once and cached, instead of once per position, and
# the per-(position, year) frames are concatenated in a single call rather than
# by repeatedly prepending to a growing frame.
years = np.arange(2013, 2022, 1)
wage_columns = ['Year', 'EmployerName', 'Position', 'TotalWages']

yearly_wages = {}
for year in years:
    path = './UniversityOfCalifornia/{}_UniversityOfCalifornia.csv'.format(year)
    with open(path, encoding="utf8", errors='ignore') as f:
        df = pd.read_csv(f)
    # Keep only the employers and positions that are analysed further down.
    keep = (~df['EmployerName'].isin(uni_excluded)) & (df['Position'].isin(prof))
    yearly_wages[year] = df.loc[keep, wage_columns]

ranked_frames = []
for position in prof:
    print(position)
    for year in years:
        wages = yearly_wages[year]
        # Explicit copy so adding WageRank never writes into a view of the cache.
        PROF = wages[wages['Position'] == position].copy()
        # Negate so that the largest wage receives ordinal rank 1, then shift to 0-based.
        PROF['WageRank'] = rankdata(-PROF['TotalWages'].values, method='ordinal') - 1
        ranked_frames.append(PROF)

# The earlier implementation prepended each new frame; reversing the list
# reproduces that row order (last position / last year first).
sum_df = pd.concat(ranked_frames[::-1]) if ranked_frames else pd.DataFrame()

Prof-Ay-B/E/E
Assoc Prof-Ay-B/E/E
Asst Prof-Ay-B/E/E
Postdoc-Employee
Teachg Asst-Gship


In [None]:
# Leftover inspection cell (not executed in this saved run): echoes whatever
# value the loop variable `year` currently holds in the kernel.
year

In [6]:
# Median WageRank per employer for every (position, year).
# Frames are collected in a list and concatenated once; the list is reversed so
# the rows come out in the same order as prepending each frame would produce
# (last position / last year first, employers sorted by name within each group).
wage_rank_frames = []
for position in prof:
    for year in np.arange(2013, 2022, 1):
        in_group = (sum_df['Year'] == year) & (sum_df['Position'] == position)
        df = (
            sum_df[in_group]
            .groupby(['EmployerName'])['WageRank']
            .median()
            .reset_index()
            .assign(Year=year, Position=position)
        )
        wage_rank_frames.append(df)
grouped_df = pd.concat(wage_rank_frames[::-1]).reset_index(drop=True)

In [7]:
# Median TotalWages per employer for every (position, year).
# Frames are collected in a list and concatenated once; the list is reversed so
# the rows come out in the same order as prepending each frame would produce
# (last position / last year first, employers sorted by name within each group).
median_wage_frames = []
for position in prof:
    for year in np.arange(2013, 2022, 1):
        in_group = (sum_df['Year'] == year) & (sum_df['Position'] == position)
        df = (
            sum_df[in_group]
            .groupby(['EmployerName'])['TotalWages']
            .median()
            .reset_index()
            .assign(Year=year, Position=position)
        )
        median_wage_frames.append(df)
median_df = pd.concat(median_wage_frames[::-1]).reset_index(drop=True)

In [8]:
# Persist the per-employer median TotalWages table (one row per employer, year and position).
median_df.to_csv('median.csv', index=False)

In [9]:
# US News ranking table. Later cells read its 'University Name' column and one
# column per year whose header is the year as a string ('2012' ... '2023').
usnews = pd.read_csv('./UniversityOfCalifornia/USNews_Ranking.csv')

In [10]:
# Add a 'c_<year>' column per ranking year holding the 0-based ordinal position
# of each row's value in that year's column (0 = smallest value; with
# method='ordinal' equal values get distinct ranks in row order).
for year in np.arange(2012, 2024, 1):
    source_col = str(year)
    rank_col = 'c_' + source_col
    usnews.loc[:, rank_col] = rankdata(usnews[source_col].to_numpy(), method='ordinal') - 1

In [11]:
# Attach to every grouped_df row the US News ordinal rank ('c_<year>') of its
# employer for that row's year, stored in a new float column 'uni_rank'.
#
# This is done one year at a time with a vectorised name -> rank lookup instead
# of a per-row loop, so it no longer depends on grouped_df having a default
# 0..n-1 index and no longer assigns a 1-element array into each cell.
university = grouped_df['EmployerName'].unique()
n_obs = grouped_df.shape[0]  # kept for any later cell that reads it

rank_by_name = usnews.set_index('University Name')

# Fail early, with the offending names, if an employer has no ranking row.
unranked = sorted(set(university) - set(rank_by_name.index))
if unranked:
    raise ValueError('No US News ranking row for: {}'.format(unranked))

grouped_df['uni_rank'] = np.nan
for year in grouped_df['Year'].unique():
    in_year = grouped_df['Year'] == year
    # Series.map with a Series mapper looks names up in the mapper's index; it
    # raises if 'University Name' contains duplicates, so ambiguity is not hidden.
    grouped_df.loc[in_year, 'uni_rank'] = (
        grouped_df.loc[in_year, 'EmployerName'].map(rank_by_name['c_' + str(year)])
    )

In [12]:
# Assign every employer a wage class per (position, year) using Jenks natural
# breaks over the ordinal ranks of its median WageRank.
# (Fixed-width alternatives -- rank // 2 and the raw rank -- were tried earlier
# and are no longer computed.)
for position in prof:
    for year in np.arange(2013, 2022, 1):
        sliced_idx = (grouped_df['Position'] == position) & (grouped_df['Year'] == year)
        data = grouped_df.loc[sliced_idx, 'WageRank'].to_numpy()
        n_obs = data.shape[0]
        # Descending ordinal rank: 0 goes to the largest median WageRank.
        wage_list = list(rankdata(-data, method='ordinal') - 1)
        wage_breaks = getJenksBreaks(wage_list, n_group)
        wage_group = np.fromiter(
            (classify(rank, wage_breaks) for rank in wage_list),
            dtype=float,
            count=n_obs,
        )
        # Flip the class numbering before storing it.
        grouped_df.loc[sliced_idx, 'wage_group'] = n_group - wage_group

In [13]:
# Assign every employer a university-rank class per (position, year) using
# Jenks natural breaks over the ordinal ranks of its 'uni_rank'.
for position in prof:
    for year in np.arange(2013, 2022, 1):
        sliced_idx = (grouped_df['Position'] == position) & (grouped_df['Year'] == year)
        data = grouped_df.loc[sliced_idx, 'uni_rank'].to_numpy()
        n_obs = data.shape[0]
        # Descending ordinal rank: 0 goes to the largest uni_rank value.
        uni_list = list(rankdata(-data, method='ordinal') - 1)
        uni_breaks = getJenksBreaks(uni_list, n_group)
        uni_group = np.fromiter(
            (classify(rank, uni_breaks) for rank in uni_list),
            dtype=float,
            count=n_obs,
        )
        # Flip the class numbering before storing it.
        grouped_df.loc[sliced_idx, 'uni_group'] = n_group - uni_group

In [14]:
# Display the per-employer table with its uni_rank, wage_group and uni_group columns.
grouped_df

Unnamed: 0,EmployerName,WageRank,Year,Position,uni_rank,wage_group,uni_group
0,"University of California, Berkeley",9054.0,2021,Teachg Asst-Gship,1.0,3.0,0.0
1,"University of California, Davis",6111.0,2021,Teachg Asst-Gship,5.0,2.0,2.0
2,"University of California, Irvine",6129.0,2021,Teachg Asst-Gship,4.0,2.0,2.0
3,"University of California, Los Angeles",9669.0,2021,Teachg Asst-Gship,0.0,4.0,0.0
4,"University of California, Merced",4389.5,2021,Teachg Asst-Gship,8.0,0.0,4.0
...,...,...,...,...,...,...,...
398,"University of California, Merced",636.0,2013,Prof-Ay-B/E/E,7.0,2.0,3.0
399,"University of California, Riverside",717.0,2013,Prof-Ay-B/E/E,8.0,3.0,4.0
400,"University of California, San Diego",555.0,2013,Prof-Ay-B/E/E,2.0,1.0,1.0
401,"University of California, Santa Barbara",500.0,2013,Prof-Ay-B/E/E,4.0,1.0,2.0


In [15]:
# Spot check: which wage_group labels occur for the last position in `prof`
# in 2019, and how many employers fall into each.
np.unique(
    grouped_df.loc[
        (grouped_df['Position'] == prof[4]) & (grouped_df['Year'] == 2019),
        'wage_group',
    ].values,
    return_counts=True,
)

(array([0., 1., 2., 3., 4.]), array([2, 2, 2, 2, 1], dtype=int64))

In [17]:
# Persist the final table (median wage rank, US News rank and both group labels).
grouped_df.to_csv('uc_salary.csv', index=False)

In [None]:
# No missing data !!!
universities = grouped_df['EmployerName'].unique()
for uni in universities:
    for year in np.arange(2013, 2022, 1):
        sliced_idx = (grouped_df['Year'] == year) & (grouped_df['EmployerName'] == uni)
        data = grouped_df[sliced_idx]['Position'].nunique()
        if data != 5:
            print(uni, year)
            print(grouped_df[sliced_idx]['Position'].unique())

In [None]:
# Display grouped_df once more for a final visual check.
grouped_df