# Synthetic Graduate School Admission Data

## Package imports and settings

In [2268]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler

In [2269]:
np.random.seed(0)

## Define covariance matrices

In [2270]:
#Number of samples for each males and females
num_samples = 500
mean_m_Q = 156.1

#Male
r_m = np.array([
        [81, 28.15, 5.43],
        [28.15, 84.64, 1.16],
        [5.43, 1.16, 0.81]
    ])

mean_m = [150.7,mean_m_Q,3.5]

#Female
r_f = np.array([
        [65.61, 24.51, 4.34],
        [24.51, 79.21, 1],
        [4.34, 1, 0.64]
    ])

mean_f = [150.3,151.2,3.7]

## Sample GRE scores from multivariate Gaussian

In [2271]:
y_m = np.random.multivariate_normal(mean=mean_m, cov=r_m, size=num_samples)
y_f = np.random.multivariate_normal(mean=mean_f, cov=r_f, size=num_samples)

## Round values to match official score ranges

In [2272]:
#Round values
y_m = np.round(y_m*2.0)/2.0
y_f = np.round(y_f*2.0)/2.0
np.transpose(y_m)[0] = np.round(np.transpose(y_m)[0])
np.transpose(y_m)[1] = np.round(np.transpose(y_m)[1])
np.transpose(y_f)[0] = np.round(np.transpose(y_f)[0])
np.transpose(y_f)[1] = np.round(np.transpose(y_f)[1])

## Truncate values to match official score ranges

In [2273]:
for i in range(0,len(np.transpose(y_m)[0])):
    #Male
    if np.transpose(y_m)[0][i]>170:
        np.transpose(y_m)[0][i]=170
    elif np.transpose(y_m)[0][i]<130:
        np.transpose(y_m)[0][i]=130
    if np.transpose(y_m)[1][i]>170:
        np.transpose(y_m)[1][i]=170
    elif np.transpose(y_m)[1][i]<130:
        np.transpose(y_m)[1][i]=130
    if np.transpose(y_m)[2][i]>6:
        np.transpose(y_m)[2][i]=6
    elif np.transpose(y_m)[2][i]<0:
        np.transpose(y_m)[2][i]=0
    
    #Female
    if np.transpose(y_f)[0][i]>170:
        np.transpose(y_f)[0][i]=170
    elif np.transpose(y_f)[0][i]<130:
        np.transpose(y_f)[0][i]=130
    if np.transpose(y_f)[1][i]>170:
        np.transpose(y_f)[1][i]=170
    elif np.transpose(y_f)[1][i]<130:
        np.transpose(y_f)[1][i]=130
    if np.transpose(y_f)[2][i]>6:
        np.transpose(y_f)[2][i]=6
    elif np.transpose(y_f)[2][i]<0:
        np.transpose(y_f)[2][i]=0

## Set up DataFrames

In [2274]:
df_m = pd.DataFrame(y_m)
df_f = pd.DataFrame(y_f)

df_m.columns = ['GRE_V','GRE_Q','GRE_AW']
df_m.insert(loc=0, column='Gender', value=1)
df_f.columns = ['GRE_V','GRE_Q','GRE_AW']
df_f.insert(loc=0, column='Gender', value=0)

## Scale GRE scores between 0 and 1

In [2275]:
#Scale
scaler_1 = MinMaxScaler()
scaler_2 = MinMaxScaler()
df_m[['GRE_V', 'GRE_Q', 'GRE_AW']] = scaler_1.fit_transform(df_m[['GRE_V', 'GRE_Q', 'GRE_AW']])
df_f[['GRE_V', 'GRE_Q', 'GRE_AW']] = scaler_2.fit_transform(df_f[['GRE_V', 'GRE_Q', 'GRE_AW']])

## Generate noisy 'scores' to determine labels

In [2276]:
#Parameter for discrimination
zeta = 3
#Parameter for weight on GRE Q
eta = 2
#noise_lower_bound = -5
#noise_upper_bound = 5
noise_mean = 0
noise_std = 0.1

#Labels Male
#noise_m = np.random.uniform(noise_lower_bound, noise_upper_bound, size=500)
noise_m = np.random.normal(noise_mean,noise_std,500)
df_m['Score'] = (zeta/(zeta+2+eta))*df_m.Gender+(1/(zeta+2+eta))*df_m.GRE_V+(eta/(zeta+2+eta))*df_m.GRE_Q+(1/(zeta+2+eta))*df_m.GRE_AW+noise_m

#Labels Female
#noise_f = np.random.uniform(noise_lower_bound, noise_upper_bound, size=500)
noise_f = np.random.normal(noise_mean,noise_std,500)
df_f['Score'] = (zeta/(zeta+2+eta))*df_f.Gender+(1/(zeta+2+eta))*df_f.GRE_V+(eta/(zeta+2+eta))*df_f.GRE_Q+(1/(zeta+2+eta))*df_f.GRE_AW+noise_f

In [2277]:
#noise_m
#noise_f

## Rescale GRE scores

In [2278]:
#Rescale
df_m[['GRE_V', 'GRE_Q', 'GRE_AW']] = scaler_1.inverse_transform(df_m[['GRE_V', 'GRE_Q', 'GRE_AW']])
df_f[['GRE_V', 'GRE_Q', 'GRE_AW']] = scaler_2.inverse_transform(df_f[['GRE_V', 'GRE_Q', 'GRE_AW']])

## Concatenate male and female data

In [2279]:
#Concatenate the frames
df = pd.concat([df_m, df_f],ignore_index=True)

## Generate labels

In [2280]:
#Labels
temp = list()
for i in range(0,len(df)):
    if df.Score[i]>0.5:
        temp.append(1)
    else:
        temp.append(0)
        
df['Label'] = temp
#df['Label'] = np.random.randint(2,size=1000)

In [2281]:
df.head()

Unnamed: 0,Gender,GRE_V,GRE_Q,GRE_AW,Score,Label
0,1,140.0,140.0,3.5,0.772964,1
1,1,144.0,130.0,2.5,0.587392,1
2,1,143.0,150.0,3.0,0.671344,1
3,1,148.0,152.0,4.5,0.779059,1
4,1,146.0,150.0,3.5,0.585408,1


## Flip labels

In [2282]:
#share_flipped_labels = 0.5

#idx = np.random.choice(len(df),int(len(df)*share_flipped_labels),replace=False)

In [2283]:
#idx

In [2284]:
#for i in idx:
#    if df.iloc[i]['Label'] == 0:
#        df.at[i,'Label'] = 1
#    elif df.iloc[i]['Label'] == 1:
#        df.at[i,'Label'] = 0

In [2285]:
#df.head()

## Shuffle data

In [2286]:
#Shuffle
df = df.sample(frac=1).reset_index(drop=True)

## Export to CSV

In [2287]:
#df.to_csv('../data/grad_school_synthetic_rescaled_theta_01_zeta_3.csv',index=False)

## Descriptive Stats

In [2288]:
df.describe()

Unnamed: 0,Gender,GRE_V,GRE_Q,GRE_AW,Score,Label
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.5,150.3,153.725,3.56,0.527071,0.521
std,0.50025,8.257129,8.67575,0.840301,0.268399,0.499809
min,0.0,130.0,130.0,0.5,-0.167008,0.0
25%,0.0,144.0,148.0,3.0,0.285083,0.0
50%,0.5,150.0,154.0,3.5,0.529142,1.0
75%,1.0,156.0,160.0,4.0,0.764363,1.0
max,1.0,170.0,170.0,6.0,1.182114,1.0


In [2289]:
df['Label'].value_counts()

1    521
0    479
Name: Label, dtype: int64

In [2290]:
print(len(df[(df['Gender'] == 0) & (df['Label'] == 0)]))
print(len(df[(df['Gender'] == 0) & (df['Label'] == 1)]))
print(len(df[(df['Gender'] == 1) & (df['Label'] == 0)]))
print(len(df[(df['Gender'] == 1) & (df['Label'] == 1)]))

470
30
9
491
