In [13]:
import pandas as pd
import scipy
import numpy as np
import plotly.express as px

In [3]:
# Show the column layout of every season's processed export.
# `df` ends up bound to the last file read (S24).
for df in map(pd.read_csv, (
    "F22ProcessedData.csv",
    "S23ProcessedData.csv",
    "F23ProcessedData.csv",
    "S24ProcessedData.csv",
)):
    print(df.columns)

Index(['Unnamed: 0', 'R1: Excitement', 'SWE R2 I1: Deliverable',
       'SWE R2 I1: Technical Communication', 'SWE R2 I1: Learning Ability',
       'SWE R2 I1: Teamwork & Communication', 'SWE R2 I1: Overall',
       'SWE R2 I2: Deliverable', 'SWE R2 I2: Technical Communication',
       'SWE R2 I2: Learning Ability', 'SWE R2 I2: Teamwork & Communication',
       'SWE R2 I2: Overall', 'Accepted?', 'Duration (Semesters)',
       'Current Member?'],
      dtype='object')
Index(['Unnamed: 0', 'R1: Excitement', 'SWE R2 I1: Deliverable',
       'SWE R2 I1: Technical Communication', 'SWE R2 I1: Learning Ability',
       'SWE R2 I1: Teamwork & Communication', 'SWE R2 I1: Overall',
       'SWE R2 I2: Deliverable', 'SWE R2 I2: Technical Communication',
       'SWE R2 I2: Learning Ability', 'SWE R2 I2: Teamwork & Communication',
       'SWE R2 I2: Overall', 'SWE R2 Status', 'Accepted?',
       'Duration (Semesters)', 'Current Member?'],
      dtype='object')
Index(['Unnamed: 0', 'R1: Excitement', 

## Calculate Correlation Between Interview Scores and Acceptance / Duration

In [4]:
def acceptedAndDurationCorr(file_name, top_n=5):
    """Print the score columns most correlated with acceptance and with duration.

    Loads ``file_name`` as CSV and treats every column after the first, up to
    and including the first of 'SWE R2 I2: Overall' / 'SWE R2 Average Score',
    as an interview-score feature. Each feature is correlated with
    'Accepted?' (point-biserial r) and with 'Duration (Semesters)'
    (Pearson r). For each target, the ``top_n`` features with the largest r
    are printed in ascending order, so the strongest one is printed last.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read. It must contain the 'Accepted?' and
        'Duration (Semesters)' columns.
    top_n : int, default 5
        Number of features to print per target.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    df = pd.read_csv(file_name)

    # List of features: stop right after the last interview-score column.
    last_feature_columns = ('SWE R2 I2: Overall', 'SWE R2 Average Score')
    columns_interview = []
    for column_i in list(df.columns)[1:]:
        columns_interview.append(column_i)
        if column_i in last_feature_columns:
            break

    # Found appropriate correlation measure here: https://www.ce.memphis.edu/7012/L17_CategoricalVariableAssociation.pdf

    accepted = np.array(df['Accepted?'])
    duration = np.array(df['Duration (Semesters)'])

    corr_list = []
    for column_i in columns_interview:
        scores = np.array(df[column_i])
        corr_list.append((
            column_i,
            scipy.stats.pointbiserialr(scores, accepted),
            scipy.stats.pearsonr(scores, duration),
        ))

    def _rank_key(position):
        """Build a sort key on the r value stored at ``position`` of each entry."""
        def key(item):
            r_value = item[position].statistic
            # r is NaN when a column is constant. NaN compares False with
            # everything, which would leave the list effectively unsorted,
            # so undefined correlations are ranked lowest instead.
            return float("-inf") if np.isnan(r_value) else r_value
        return key

    print(f"TOP {top_n} R Accepted Values")
    corr_list.sort(key=_rank_key(1))

    for key, val, val2 in corr_list[-top_n:]:
        print(key, val.statistic)

    print(f"TOP {top_n} R Duration Values")
    corr_list.sort(key=_rank_key(2))

    for key, val, val2 in corr_list[-top_n:]:
        print(key, val2.statistic)

    print()

   

In [5]:
# Print the correlation report for each season's file, oldest season first.
for file_i in (
    "F22ProcessedData.csv",
    "S23ProcessedData.csv",
    "F23ProcessedData.csv",
    "S24ProcessedData.csv",
):
    print(file_i)
    acceptedAndDurationCorr(file_i)
    

F22ProcessedData.csv
TOP 5 R Accepted Values
SWE R2 I1: Deliverable 0.5379823748652963
SWE R2 I1: Teamwork & Communication 0.548833100977617
SWE R2 I1: Learning Ability 0.5695210740924566
SWE R2 I1: Technical Communication 0.6058791811846899
SWE R2 I1: Overall 0.6422802539779227
TOP 5 R Duration Values
SWE R2 I1: Deliverable 0.47523317704262996
SWE R2 I1: Learning Ability 0.49340944747388493
SWE R2 I1: Teamwork & Communication 0.5267490804694394
SWE R2 I1: Technical Communication 0.575128035872991
SWE R2 I1: Overall 0.611803405523612

S23ProcessedData.csv
TOP 5 R Accepted Values
SWE R2 I2: Learning Ability 0.5034816842488848
SWE R2 I2: Overall 0.5069288611926808
SWE R2 I2: Teamwork & Communication 0.5200911513951968
SWE R2 I1: Overall 0.5271935748519105
SWE R2 I2: Deliverable 0.5327694141696595
TOP 5 R Duration Values
SWE R2 I2: Learning Ability 0.42035141025478173
SWE R2 I2: Deliverable 0.43092853926630714
SWE R2 I2: Overall 0.4332510240032247
SWE R2 I1: Overall 0.43789953035615203
SW

## Plot Columns Across Years

### Look at columns in common

In [6]:
from operator import and_, or_
from functools import reduce

# Gather each season's column names: one set per file plus the pooled union.
columns_list, columns_total = [], set()
for file_name in (
    "F22ProcessedData.csv",
    "S23ProcessedData.csv",
    "F23ProcessedData.csv",
    "S24ProcessedData.csv",
):
    df_i = pd.read_csv(file_name)
    columns_i = list(df_i.columns)

    columns_total.update(columns_i)
    columns_list.append(set(columns_i))

# Columns present in every season: intersect the per-file sets once and reuse.
columns_common = set(reduce(and_, columns_list))

print("In Common across years:", columns_common)
print("Total:", columns_total)
print("Not in every year:", columns_total - columns_common)

In Common across years: {'Accepted?', 'R1: Excitement', 'Current Member?', 'SWE R2 I2: Technical Communication', 'SWE R2 I1: Overall', 'SWE R2 I1: Technical Communication', 'Unnamed: 0', 'SWE R2 I2: Deliverable', 'SWE R2 I2: Learning Ability', 'SWE R2 I1: Learning Ability', 'SWE R2 I1: Deliverable', 'SWE R2 I2: Overall', 'Duration (Semesters)'}
Total: {'Accepted?', 'SWE R2 I2: Technical Communication', 'SWE R2 I1: Passion/Engagement', 'SWE R2 I2: Deliverable', 'SWE R2 I1: Learning Ability', 'SWE R2 I2: Overall', 'SWE R2 I1: Passion/Engagement.1', 'R1: Excitement', 'SWE R2 Average Score', 'SWE R2 I1: Teamwork & Communication', 'R1 Average Score', 'SWE R2 I1: Deliverable', 'R1: Overall', 'SWE R2 I1: Overall', 'SWE R2 Status', 'Unnamed: 0', 'SWE Performance Score', 'SWE R2 I2: Teamwork & Communication', 'Duration (Semesters)', 'Current Member?', 'SWE R2 I1: Technical Communication', 'SWE R2 I2: Learning Ability'}
Not in every year: {'R1: Overall', 'SWE R2 I1: Passion/Engagement', 'SWE R2 

### Combine csvs

In [69]:
# Denominator for duration_percent, keyed by the season prefix of the file name.
max_duration_dict = {'F22': 4, 'S23': 3, 'F23': 2, 'S24': 1}

data = []
for file_name in ["F22ProcessedData.csv", "S23ProcessedData.csv", "F23ProcessedData.csv", "S24ProcessedData.csv"]:
    # The first three characters of the file name are the season tag.
    season_i = file_name[:3]

    df_i = (
        pd.read_csv(file_name, index_col=False)
        .drop(columns='Unnamed: 0')
        .assign(
            season=season_i,
            avg_overall=lambda d: (d['SWE R2 I1: Overall'] + d['SWE R2 I2: Overall']) / 2,
            duration_percent=lambda d: 100 * d['Duration (Semesters)'] / max_duration_dict[season_i],
        )
    )
    data.append(df_i)

# Stack all seasons into one frame; columns missing from a season become NaN.
frame = pd.concat(data, ignore_index=True, sort=False)

# Sanity check: every original column survived, plus the three derived ones.
expected_columns = (columns_total | {'season', 'avg_overall', 'duration_percent'}) - {'Unnamed: 0'}
assert set(frame.columns) == expected_columns

### Colored Histograms

In [74]:
def _acceptance_rate(candidates):
    """Return ``(percent accepted, number of rows)`` for the rows in ``candidates``.

    The percentage is NaN when ``candidates`` is empty, so a score that no
    candidate received does not abort the cell with a ZeroDivisionError.
    """
    total = candidates.shape[0]
    if total == 0:
        return float("nan"), 0
    accepted = candidates[candidates["Accepted?"] == 1].shape[0]
    return 100 * accepted / total, total


# Look at the influence of each interviewer's overall score on acceptance rates:
# one histogram per interviewer, followed by the acceptance rate at each score.
for overall_column in ("SWE R2 I1: Overall", "SWE R2 I2: Overall"):
    fig = px.histogram(frame, x=overall_column, color="Accepted?")
    fig.show()
    for score in range(6):
        percent_accepted, count_total = _acceptance_rate(frame[frame[overall_column] == score])
        print("percent accepted with score " + str(score) + " : " + str(percent_accepted))

# If both interviewers gave a 5, what is the chance of acceptance?
both_fives = frame[(frame["SWE R2 I1: Overall"] == 5) & (frame["SWE R2 I2: Overall"] == 5)]
percent_accepted, count_total = _acceptance_rate(both_fives)
print("with two fives you have a " + str(percent_accepted) + " percent chance of acceptance with " + str(count_total) + " total candidates")

# #look at how distribution of overall scores changes across seasons
# fig = px.histogram(frame, x="avg_overall", color="season")
# fig.show()

# fig = px.histogram(frame, x="SWE R2 I1: Overall", color="season")
# fig.show()

# fig = px.histogram(frame, x="SWE R2 I2: Overall", color="season")
# fig.show()

# for season in ["F22", "S23", "F23", "S24"]:
#     print("applicants in " + str(season) + ": " + str(frame[(frame["season"] == season)].shape[0]))

#overall scores against duration
#THIS IS NOT CURRENTLY A RELEVANT MEASURE BC ILLOGICAL COMPARISONS
# fig = px.histogram(frame[((frame["season"] == 'F22') | (frame["season"] == 'S23')) & (frame["Accepted?"] == 1)], x="avg_overall", color="duration_percent")
# fig.update_xaxes(range=[0, 5])
# fig.show()


percent accepted with score 0 : 5.882352941176471
percent accepted with score 1 : 3.5714285714285716
percent accepted with score 2 : 0.0
percent accepted with score 3 : 24.0
percent accepted with score 4 : 42.3728813559322
percent accepted with score 5 : 82.6923076923077


percent accepted with score 0 : 11.475409836065573
percent accepted with score 1 : 0.0
percent accepted with score 2 : 33.333333333333336
percent accepted with score 3 : 34.61538461538461
percent accepted with score 4 : 47.82608695652174
percent accepted with score 5 : 81.81818181818181
with two fives you have a 88.46153846153847 percent chance of acceptance with 26 total candidates
