###  Display tables of the language and gender distributions for each department at GU and Chalmers


In [49]:
# Paths to the gzipped JSON metadata dumps, one for GU and one for
# Chalmers (CTH).
info_file_gu = "data/GU-metadata-210310.json.gz"
info_file_ch = "data/CTH-metadata-210310.json.gz"

In [50]:
import pandas as pd

# Load both metadata files into DataFrames. The cells below read the
# 'inferred' and 'department' columns of both frames, and 'faculty' for GU.
df_gu = pd.read_json(info_file_gu)
df_ch = pd.read_json(info_file_ch)

In [29]:
# Pull the 'language' and 'gender' values out of every 'inferred' entry,
# producing one plain list per university and attribute.
languages_gu = [entry['language'] for entry in df_gu['inferred']]
languages_ch = [entry['language'] for entry in df_ch['inferred']]

genders_gu = [entry['gender'] for entry in df_gu['inferred']]
genders_ch = [entry['gender'] for entry in df_ch['inferred']]

In [30]:
# Attach the extracted values as flat 'lang' and 'gender' columns.
df_gu['lang'], df_gu['gender'] = languages_gu, genders_gu
df_ch['lang'], df_ch['gender'] = languages_ch, genders_ch

# Discard records whose gender is 'unknown'. Records marked 'mixed' are
# kept; no filter is applied to them.
df_gu = df_gu[df_gu['gender'].ne('unknown')]
df_ch = df_ch[df_ch['gender'].ne('unknown')]

In [31]:
# Split each university's records by language code: 'sv' and 'en'.
df_gu_sv = df_gu.loc[df_gu['lang'].eq('sv')]
df_gu_eng = df_gu.loc[df_gu['lang'].eq('en')]

df_ch_sv = df_ch.loc[df_ch['lang'].eq('sv')]
df_ch_eng = df_ch.loc[df_ch['lang'].eq('en')]


In [32]:
# Getting gender statistics for departments and faculties

def gendercounts_by_col(df, gender, col):
    """Count the rows of one gender for each distinct value of ``col``.

    Args:
        df: DataFrame with a 'gender' column and the column named by ``col``.
        gender: Value of the 'gender' column to select.
        col: Name of the column whose values are tallied.

    Returns:
        A dict mapping each value of ``col`` found among the selected rows
        to the number of such rows; empty when no row has that gender.
    """
    selected = df.loc[df['gender'] == gender, col]
    tally = selected.value_counts()
    return {value: n for value, n in zip(tally.index.tolist(), tally)}

def all_gendercounts(df, col):
    """Tabulate male/female/mixed row counts for every value of ``col``.

    Args:
        df: DataFrame with a 'gender' column and the column named by ``col``.
        col: Name of the grouping column (e.g. 'department' or 'faculty').

    Returns:
        A list of ``[value, n_male, n_female, n_mixed]`` rows, one for each
        value of ``col`` that occurs with at least one of those three
        genders. A gender that has no rows for a value is reported as 0.
        Rows with any other gender (such as 'unknown') are not counted.
        The order of the returned rows follows set iteration order and is
        therefore arbitrary.
    """
    dep_counts_male = gendercounts_by_col(df, 'male', col)
    dep_counts_female = gendercounts_by_col(df, 'female', col)
    dep_counts_mixed = gendercounts_by_col(df, 'mixed', col)

    # Union of all group values seen for any of the three genders. The set
    # is built from the same concatenated key list as before so that the
    # (arbitrary) row order is not changed by this rewrite.
    all_depts = list(set([*dep_counts_male,
                          *dep_counts_female,
                          *dep_counts_mixed]))

    # dict.get supplies the 0 default for a missing group; this replaces the
    # bare ``except`` clauses, which would also have hidden unrelated errors.
    return [
        [dept,
         dep_counts_male.get(dept, 0),
         dep_counts_female.get(dept, 0),
         dep_counts_mixed.get(dept, 0)]
        for dept in all_depts
    ]

In [33]:
# Department-level count tables, in the order:
# GU Swedish, GU English, Chalmers Swedish, Chalmers English.
all_gcounts = [
    all_gendercounts(subset, 'department')
    for subset in (df_gu_sv, df_gu_eng, df_ch_sv, df_ch_eng)
]
# Faculty-level count tables for GU only: Swedish first, then English.
all_gcounts_faculty = [
    all_gendercounts(subset, 'faculty')
    for subset in (df_gu_sv, df_gu_eng)
]

In [34]:
# Join the Swedish and English rows that share a department name into one
# row: [department, Swedish counts..., English counts...]. A department that
# appears in only one of the two tables produces no row.
gcounts_gu_dept = [
    sv_row + en_row[1:]
    for sv_row in all_gcounts[0]
    for en_row in all_gcounts[1]
    if sv_row[0] == en_row[0]
]
gcounts_ch_dept = [
    sv_row + en_row[1:]
    for sv_row in all_gcounts[2]
    for en_row in all_gcounts[3]
    if sv_row[0] == en_row[0]
]

In [35]:
# Same join as for departments, keyed on the faculty name (GU only). A
# faculty present in only one of the two tables produces no row.
gcounts_gu_fac = [
    sv_row + en_row[1:]
    for sv_row in all_gcounts_faculty[0]
    for en_row in all_gcounts_faculty[1]
    if sv_row[0] == en_row[0]
]

In [36]:
def add_summary_row(gc, typ):
    """Return a copy of ``gc`` with a trailing 'Total:' row of column sums.

    Args:
        gc: DataFrame holding the six count columns 'male Swe',
            'female Swe', 'mixed Swe', 'male Eng', 'female Eng' and
            'mixed Eng'. It is not modified.
        typ: Name of the label column (e.g. 'Department' or 'Faculty'); the
            new row gets the value 'Total:' in that column.

    Returns:
        A new DataFrame with a fresh 0..n index whose last row holds the
        label 'Total:' and the sum of each count column.

    Raises:
        KeyError: If one of the six count columns is missing from ``gc``.
    """
    count_cols = ['male Swe', 'female Swe', 'mixed Swe',
                  'male Eng', 'female Eng', 'mixed Eng']
    new_row = {typ: 'Total:'}
    new_row.update({col: gc[col].sum() for col in count_cols})
    # DataFrame.append is deprecated and was removed in pandas 2.0;
    # concatenating a one-row frame gives the same result.
    return pd.concat([gc, pd.DataFrame([new_row])], ignore_index=True)

# Turn the joined count rows into DataFrames. At this point the columns are
# still positional (0..6); the headers are assigned further down.
df_gcounts_gu = pd.DataFrame(gcounts_gu_dept)
df_gcounts_ch = pd.DataFrame(gcounts_ch_dept)

df_gcounts_gu_fac = pd.DataFrame(gcounts_gu_fac)

# Set the first row of each department table aside ...
# NOTE(review): presumably this row holds the records that have no
# department. The selection is purely positional and the row order comes
# from all_gendercounts(), so confirm that this row is always first.
gcounts_no_dept_gu  = df_gcounts_gu[0:1]
gcounts_no_dept_ch  = df_gcounts_ch[0:1]

# ... and remove it from the department tables (the faculty table is kept whole).
df_gcounts_gu = df_gcounts_gu[1:]
df_gcounts_ch = df_gcounts_ch[1:]

# Column headers: one label column followed by male/female/mixed counts for
# Swedish and then for English.
header  = ['Department','male Swe','female Swe','mixed Swe','male Eng','female Eng','mixed Eng']
header_fac = ['Faculty','male Swe','female Swe','mixed Swe','male Eng','female Eng','mixed Eng']
df_gcounts_ch.columns = header
df_gcounts_gu.columns = header

df_gcounts_gu_fac.columns = header_fac


# Order every table by the female Swedish count, then the female English
# count, largest first, and renumber the rows.
df_gcounts_gu = df_gcounts_gu.sort_values(by=['female Swe','female Eng'], ascending=False).reset_index(drop=True)
df_gcounts_ch = df_gcounts_ch.sort_values(by=['female Swe','female Eng'], ascending=False).reset_index(drop=True)


df_gcounts_gu_fac = df_gcounts_gu_fac.sort_values(by=['female Swe','female Eng'], ascending=False).reset_index(drop=True)

# Append the 'Total:' row. This rebinds the gcounts_* names, which
# previously held the raw joined lists, to the finished DataFrames.
gcounts_gu_fac = add_summary_row(df_gcounts_gu_fac,'Faculty')
gcounts_gu_dept = add_summary_row(df_gcounts_gu,'Department')
gcounts_ch_dept = add_summary_row(df_gcounts_ch,'Department')

  gc = gc.append(new_row,ignore_index=True)
  gc = gc.append(new_row,ignore_index=True)
  gc = gc.append(new_row,ignore_index=True)


In [37]:
gcounts_gu_fac

Unnamed: 0,Faculty,male Swe,female Swe,mixed Swe,male Eng,female Eng,mixed Eng
0,Samhällsvetenskapliga fakulteten,1233,3779,508,316,559,12
1,Sahlgrenska akademin,199,1826,216,105,176,8
2,Utbildningsvetenskapliga fakulteten,264,1689,118,54,92,1
3,Handelshögskolan,1221,1631,541,1246,1103,447
4,Humanistiska fakulteten,422,956,10,193,305,1
5,Konstnärliga fakulteten,227,432,4,80,119,3
6,Naturvetenskapliga fakulteten,183,363,37,58,60,6
7,IT-fakulteten,420,265,104,494,237,129
8,Total:,4169,10941,1538,2546,2651,607


In [38]:
gcounts_gu_dept

Unnamed: 0,Department,male Swe,female Swe,mixed Swe,male Eng,female Eng,mixed Eng
0,Institutionen för sociologi och arbetsvetenskap,438,1689,235,60,118,4
1,Institutionen för vårdvetenskap och hälsa,135,1391,165,3,19,5
2,Företagsekonomiska institutionen,814,950,481,311,339,146
3,Institutionen för socialt arbete,135,913,113,28,102,6
4,Institutionen för pedagogik och specialpedagogik,92,741,43,20,43,0
5,Juridiska institutionen,296,539,6,46,88,4
6,Institutionen för journalistik och masskommuni...,206,441,119,13,19,2
7,Institutionen för kost- och idrottsvetenskap,90,429,54,17,14,1
8,Institutionen för didaktik och pedagogisk prof...,60,364,13,5,8,0
9,Institutionen för kulturvård,133,337,3,6,33,0


In [39]:
gcounts_ch_dept

Unnamed: 0,Department,male Swe,female Swe,mixed Swe,male Eng,female Eng,mixed Eng
0,Institutionen för arkitektur och samhällsbyggn...,527,408,202,1026,847,164
1,Institutionen för teknikens ekonomi och organi...,80,63,124,498,253,190
2,Institutionen för rymd- och geovetenskap,50,21,44,567,235,79
3,Institutionen för biologi och bioteknik,7,19,28,218,178,5
4,Institutionen för mekanik och maritima vetensk...,178,19,58,755,119,80
5,Institutionen för elektroteknik,163,16,73,858,160,102
6,Institutionen för data och informationsteknik,183,9,62,1045,158,174
7,Institutionen för kemi och kemiteknik,12,7,35,120,98,9
8,Institutionen för fysik,13,3,19,390,126,21
9,COMESA,1,3,0,3,2,0


In [40]:
# Department names that make up each subject area. 'humaniora' and 'ekonomi'
# are looked up in the GU data, 'arkitektur' and 'naturvetenskap' in the
# Chalmers data.
humaniora = [
    'Institutionen för filosofi, lingvistik och vetenskapsteori',
    'Institutionen för litteratur, idéhistoria och religion',
    'Institutionen för språk och litteraturer',
    'Institutionen för svenska språket',
]
arkitektur = [
    'Institutionen för arkitektur och samhällsbyggnadsteknik',
]
ekonomi = [
    'Företagsekonomiska institutionen',
]
naturvetenskap = [
    'Institutionen för biologi och bioteknik',
    'Institutionen för fysik',
    'Institutionen för kemi och kemiteknik',
    'Institutionen för rymd- och geovetenskap',
]

# Gender counts per subject area over all languages.
counts_human = df_gu.loc[df_gu['department'].isin(humaniora), 'gender'].value_counts()
counts_ark = df_ch.loc[df_ch['department'].isin(arkitektur), 'gender'].value_counts()
counts_eko = df_gu.loc[df_gu['department'].isin(ekonomi), 'gender'].value_counts()
counts_natur = df_ch.loc[df_ch['department'].isin(naturvetenskap), 'gender'].value_counts()

# The same counts restricted to the Swedish-language records.
counts_human_sv = df_gu_sv.loc[df_gu_sv['department'].isin(humaniora), 'gender'].value_counts()
counts_ark_sv = df_ch_sv.loc[df_ch_sv['department'].isin(arkitektur), 'gender'].value_counts()
counts_eko_sv = df_gu_sv.loc[df_gu_sv['department'].isin(ekonomi), 'gender'].value_counts()
counts_natur_sv = df_ch_sv.loc[df_ch_sv['department'].isin(naturvetenskap), 'gender'].value_counts()

# The same counts restricted to the English-language records.
counts_human_en = df_gu_eng.loc[df_gu_eng['department'].isin(humaniora), 'gender'].value_counts()
counts_ark_en = df_ch_eng.loc[df_ch_eng['department'].isin(arkitektur), 'gender'].value_counts()
counts_eko_en = df_gu_eng.loc[df_gu_eng['department'].isin(ekonomi), 'gender'].value_counts()
counts_natur_en = df_ch_eng.loc[df_ch_eng['department'].isin(naturvetenskap), 'gender'].value_counts()


In [41]:
# Convert each value_counts Series into a plain {gender: count} dict and
# key the result by subject area: all languages, English only, Swedish only.
all_counts = {
    area: dict(zip(tally.index.tolist(), tally))
    for area, tally in zip(
        ["humaniora", "arkitektur", "ekonomi", "naturvetenskap"],
        [counts_human, counts_ark, counts_eko, counts_natur],
    )
}

all_counts_eng = {
    area: dict(zip(tally.index.tolist(), tally))
    for area, tally in zip(
        ["humaniora", "arkitektur", "ekonomi", "naturvetenskap"],
        [counts_human_en, counts_ark_en, counts_eko_en, counts_natur_en],
    )
}

all_counts_sv = {
    area: dict(zip(tally.index.tolist(), tally))
    for area, tally in zip(
        ["humaniora", "arkitektur", "ekonomi", "naturvetenskap"],
        [counts_human_sv, counts_ark_sv, counts_eko_sv, counts_natur_sv],
    )
}
              


In [42]:
# Per subject area, the total is the sum of the 'male' and 'female' counts
# only; the 'mixed' count is not part of it.
total_hum = sum(all_counts['humaniora'][g] for g in ('male', 'female'))
total_ark = sum(all_counts['arkitektur'][g] for g in ('male', 'female'))
total_eko = sum(all_counts['ekonomi'][g] for g in ('male', 'female'))
total_natur = sum(all_counts['naturvetenskap'][g] for g in ('male', 'female'))

# Store each total next to the gender counts under the key 'total'.
all_counts['humaniora'].update({'total': total_hum})
all_counts['arkitektur'].update({'total': total_ark})
all_counts['ekonomi'].update({'total': total_eko})
all_counts['naturvetenskap'].update({'total': total_natur})


In [43]:
all_counts

{'humaniora': {'female': 1286, 'male': 612, 'mixed': 7, 'total': 1898},
 'arkitektur': {'male': 2010, 'female': 1594, 'mixed': 420, 'total': 3604},
 'ekonomi': {'female': 1300, 'male': 1129, 'mixed': 633, 'total': 2429},
 'naturvetenskap': {'male': 2055, 'female': 1053, 'mixed': 288, 'total': 3108}}

In [44]:
all_counts_sv

{'humaniora': {'female': 766, 'male': 346, 'mixed': 6},
 'arkitektur': {'male': 527, 'female': 408, 'mixed': 202},
 'ekonomi': {'female': 950, 'male': 814, 'mixed': 481},
 'naturvetenskap': {'mixed': 126, 'male': 82, 'female': 50}}

In [45]:
all_counts_eng

{'humaniora': {'female': 271, 'male': 184, 'mixed': 1},
 'arkitektur': {'male': 1026, 'female': 847, 'mixed': 164},
 'ekonomi': {'female': 339, 'male': 311, 'mixed': 146},
 'naturvetenskap': {'male': 1295, 'female': 637, 'mixed': 114}}

In [46]:
pd.DataFrame.from_dict(all_counts,orient='index')

Unnamed: 0,female,male,mixed,total
humaniora,1286,612,7,1898
arkitektur,1594,2010,420,3604
ekonomi,1300,1129,633,2429
naturvetenskap,1053,2055,288,3108


In [47]:
pd.DataFrame.from_dict(all_counts_sv,orient='index')

Unnamed: 0,female,male,mixed
humaniora,766,346,6
arkitektur,408,527,202
ekonomi,950,814,481
naturvetenskap,50,82,126


In [48]:
pd.DataFrame.from_dict(all_counts_eng,orient='index')

Unnamed: 0,female,male,mixed
humaniora,271,184,1
arkitektur,847,1026,164
ekonomi,339,311,146
naturvetenskap,637,1295,114
