In [4]:
import json
import pandas as pd

info_file_gu = "/Users/annl/Desktop/GenieProjekt/Exjobb/GU-metadata-210310.json"
info_file_ch = "/Users/annl/Desktop/GenieProjekt/Exjobb/CTH-metadata-210310.json"

def get_df(json_file) : 
    with open(json_file) as datafile:
        data = json.load(datafile)
        df = pd.DataFrame(data)
        return df

df_gu = get_df(info_file_gu)
df_ch = get_df(info_file_ch)


In [10]:
df_gu['department'].unique()

array(['Institutionen för vårdvetenskap och hälsa',
       'Institutionen för sociologi och arbetsvetenskap', None,
       'Institutionen för tillämpad informationsteknologi',
       'Företagsekonomiska institutionen', 'Juridiska institutionen',
       'Graduate Business School',
       'Institutionen för ekonomi och samhälle',
       'Institutionen för socialt arbete', 'Institutionen för medicin',
       'Institutionen för språk och litteraturer',
       'Institutionen för journalistik och masskommunikation',
       'Statsvetenskapliga institutionen',
       'HDK-Valand, Högskolan för konst och design',
       'Institutionen för neurovetenskap och fysiologi',
       'Institutionen för didaktik och pedagogisk profession',
       'Institutionen för kulturvård',
       'Institutionen för svenska språket',
       'Institutionen för litteratur, idéhistoria och religion',
       'Institutionen för data- och informationsteknik',
       'Institutionen för kulturvetenskaper',
       'Instituti

In [6]:
# Fetching the full texts for the theses
import os 

text_dir_ch = "/Users/annl/Desktop/GenieProjekt/Exjobb/texts-CTH-ODR-nolayout/"
text_dir_gu = "/Users/annl/Desktop/GenieProjekt/Exjobb/texts-GUPEA-nolayout/"

# for old folder structure (dir/id/id.txt)
def fetch_text_(item_id,directory) :
    walk = os.walk(directory+str(item_id))
    data = []
    lines = []
    for root, _, files in walk :
        
        for filename in files:
            file_path = os.path.join(root, filename)
            with open(file_path) as f:
                lines.append(f.readlines())
    return lines

import filecmp

def fetch_text(item_id,directory) :
    files = []
    for file in os.listdir(directory) : 
         # could this be done more efficiently?
        if file.startswith(str(item_id) + "--"):
            files.append(os.path.join(directory,file))
                
    if len(files) == 2 : # Keep just one in case of duplicates
        if filecmp.cmp(files[0], files[1]) : 
            files = [files[0]]
    
    lines = []
    
    for filename in files:
       
        with open(filename) as f:
                lines_ = f.readlines()
                lines_ = ''.join([l1 for l in lines_ for l1 in l])
                lines.append(lines_)
    
    lines = ''.join(lines)
    return lines


df_ch_text = df_ch['id'].apply(fetch_text,args=[text_dir_ch])
df_gu_text = df_gu['item_id'].apply(fetch_text,args=[text_dir_gu])

In [19]:
df_ch['text'] = df_ch_text
df_gu['text'] = df_gu_text

In [8]:
#indices of gu theses that are empty
empties_gu = []

for i in range(len(df_gu_text)) : 
    if len(df_gu_text[i]) == 0 :
        empties_gu.append(i)
  
# indices of chalmers theses that are empty
empties_ch = []
for i in range(len(df_ch_text)) :
    if len(df_ch_text[i]) == 0 : 
           empties_ch.append(i)
           

In [10]:
print(len(empties_gu))
print(len(empties_ch))
print(len(df_ch))

469
4559
17578


In [11]:
# Fixing the department labels

import ast
import numpy as np
import re


def removeStrs(st1,sts) :
    if sts == [] : return st1
    else : 
            newst = st1.replace(sts.pop(0),' ')
            return removeStrs(newst,sts)

depts_file = open("departments.txt", "r")
contents = depts_file.read()
dictionary = ast.literal_eval(contents)
depts_file.close()

facs_file = open("faculties.txt", "r")
contents2 = facs_file.read()
facs_dictionary = ast.literal_eval(contents2)
facs_file.close()


def fix_dept_name(name,uni) :
    if pd.isna(name) : return name
    if uni == "gu" : 
        name = name.lower()
        name = removeStrs(name,[";",",",".","/"])
        name = removeStrs(name,["göteborgs universitet","university of gothenburg","göteborg university","gothenburg university"])
    name = (re.sub(' +', ' ', name.strip()))
    if name in dictionary : 
        name = dictionary[name]
    else : () 
    rmstr = "Chalmers tekniska högskola / "
    name = name.replace(rmstr,'')
    return name

# Only for GU!
def get_faculty_name(name) : 
    if pd.isna(name) or name == '' : return name
    return facs_dictionary[name]
    



In [13]:
# Setting the correct department and faculty labels

dept_names_gu = df_gu['department'].apply(fix_dept_name,args=["gu"])
dept_names_ch = df_ch['department'].apply(fix_dept_name,args=["ch"])

df_gu['department'] = dept_names_gu
df_ch['department'] = dept_names_ch

fac_names_gu = df_gu['department'].apply(get_faculty_name)
df_gu['faculty'] = fac_names_gu

In [14]:
# Empty departments
nodepts_gu = df_gu[df_gu['department'] == ""]
nodepts_ch = df_ch[df_ch['department'] == ""]

In [15]:
# Getting the authors' first names

# In the GU info file all authors are in a single string
def getFirstNames_GU(st) :
    st = st.replace('\n', ' ')
    if " ; " in st : # multiple authors separated by semi-colon
                     # either "lastname, firstname" or "firstname lastname"
        return [ name.split(", ")[1].split(" ")[0] if "," in name else name.split(" ")[0]
                for name in (st.split(" ; "))]
    
    elif ", " in st : # Could be multiple authors separated by comma OR a single author with "lastname, firstname"
        names = st.split(", ")
        if len(names) == 2 : 
            return [names[1].split(" ")[0]]
        else : 
                return list(set([ name.split(" ")[0] if len(name.split(" ")) > 1 else st.split(", ")[1].split(" ")[0]
                        for name in (st.split(", "))]))    
    elif " " in st  : [st.split(" ")[0]] # Single author "firstname lastname"
    else : [st]
        
        
# In the Chalmers info file, author names are in a list of strings        
def getFirstNames_ch(sts) : return [ nam.split(", ")[1].split()[0] if ", " in nam else nam.split()[0] for nam in sts ]


# listing first names by thesis
firstnames_gu = [getFirstNames_GU(df_gu['creator'][i]) for i in range(len(df_gu))]
firstnames_ch = [getFirstNames_ch(df_ch['authors'][i]) for i in range(len(df_ch))]

In [16]:
# Getting the gender composition for the authors

from ast import literal_eval

# Precomposed file with a name-gender mapping
genderize_file = open('/Users/annl/Desktop/GenieProjekt/Exjobb/name_gender', 'r')
mapping = genderize_file.readlines()

name_gender_dict = {}
for name_gender in mapping :
    name   = literal_eval(name_gender)['name']
    gender = literal_eval(name_gender)['gender']
    name_gender_dict[name] = gender
    
    
def authors2gender(authorlists) :
  return  [[  name_gender_dict[nam.lower()] if nam.lower() in name_gender_dict else  None for nam in nams] 
           if nams is not None else [None] for nams in authorlists ]
    

genders_gu = authors2gender(firstnames_gu)
genders_ch = authors2gender(firstnames_ch)

def gender_composition(genderlist) :
    fem = 'female' in genderlist
    masc = 'male' in genderlist
    non  = None in genderlist

    if non : composition = 'unknown'
    else : 
        if fem : 
            if not masc : composition = 'female'
            else : composition = 'mixed'        
        else : composition = 'male'
    return composition
            
gender_comp_gu    = [  gender_composition(gends) for gends in genders_gu]
gender_comp_ch    = [  gender_composition(gends) for gends in genders_ch]

In [20]:
# Check if language label is correct
from langdetect import detect

# Get the mid part of the thesis, since the abstract and references are sometimes in a different language
def detect_(text) : 
    try: lang = detect(mid_chunk_of_text(text))
    except : lang = None
    return lang


def mid_chunk_of_text(text) :
    l = len(text)
    l1 = l*0.25
    l2 = l*0.75
    return text[int(l1):int(l2)]


langs_gu = [detect_(text) for text in df_gu['text']]
langs_ch = [detect_(text) for text in df_ch['text']]

In [22]:
df_gu['actual_language'] = langs_gu
df_ch['actual_language'] = langs_ch


In [24]:
df_ch[0:2]

Unnamed: 0,id,title,year,authors,department,language,type,abstract,keywords,handle,links,gender_composition,text,actual_language
0,300813,Using agile capabilities to increase flexibili...,2020,"[Andersson, Filip, Björklund, Anton]",Institutionen för teknikens ekonomi och organi...,eng,Examensarbete för masterexamen,,[],https://hdl.handle.net/20.500.12380/300813,[https://odr.chalmers.se/bitstream/20.500.1238...,male,Using agile capabilities to\nincrease flexibil...,en
1,161892,Urinseparerande avloppssystem - En utvärdering...,1998,"[Yonan, S.E.]",Institutionen för arkitektur och samhällsbyggn...,swe,Examensarbete för masterexamen,,"[Vattenteknik, Water Engineering]",https://hdl.handle.net/20.500.12380/161892,[https://odr.chalmers.se/bitstream/20.500.1238...,unknown,CHALMERS TEKNISKA HOGSKOLA\nInstitutionen for ...,sv


In [25]:
df_gu[0:2]

Unnamed: 0,item_id,handle,titles,creator,department,subject,date_issued,language,abstract,degree,links,faculty,text,actual_language
0,26152,http://hdl.handle.net/2077/24256,Lekens betydelse i den dagliga undervisningen,"Thorn, Therese",Institutionen för sociologi och arbetsvetenskap,,,sv,Syfte: Syftet med arbetet är att undersöka hur...,C,https://gupea.ub.gu.se/bitstream/2077/24256/1/...,Samhällsvetenskapliga fakulteten,Lekens betydelse i den dagliga undervisningen\...,sv
1,30364,http://hdl.handle.net/2077/28177,En studie ur ett genusperspektiv av mötet mell...,"Ahlberg, Helene ; Forsdahl, Eva ; Mansouri, Shila",Institutionen för sociologi och arbetsvetenskap,,,sv,Föreliggande examensarbete är en undersökning ...,M2,https://gupea.ub.gu.se/bitstream/2077/28177/1/...,Samhällsvetenskapliga fakulteten,En studie ur ett genusperspektiv av mötet mell...,sv


In [77]:
# Getting gender statistic for departments and faculties

def gendercounts_by_col(df,gender,col) : 
    counts = df[df['gender_composition'] == gender][col].value_counts()
    return dict(zip(counts.index.tolist(),counts))

def all_gendercounts(df,col) : 
    
    dep_counts_male = gendercounts_by_col(df,'male',col) 

    dep_counts_female = gendercounts_by_col(df,'female',col)

    dep_counts_mixed = gendercounts_by_col(df,'mixed',col) 

    dep_counts_unknown = gendercounts_by_col(df,'unknown',col)

    all_depts = list(set(list(dep_counts_male.keys()) + 
                         list(dep_counts_female.keys()) +  
                         list(dep_counts_mixed.keys()))) 
                  
    rows = []
    for dept in all_depts :
        try : a = dep_counts_male[dept]
        except : a = 0
        try : b = dep_counts_female[dept]
        except : b = 0
        try : c = dep_counts_mixed[dept]
        except : c = 0
        try : d = (dep_counts_unknown[dept]) 
        except : d = 0
        rows.append([dept,a,b,c,d]) 
    return rows

In [78]:
df_gu['gender_composition'] = gender_comp_gu
df_ch['gender_composition'] = gender_comp_ch

In [79]:
df_gu_eng = df_gu[df_gu['actual_language'] == 'en']
df_gu_swe = df_gu[df_gu['actual_language'] == 'sv']
df_ch_eng = df_ch[df_ch['actual_language'] == 'en']
df_ch_swe = df_ch[df_ch['actual_language'] == 'sv']

In [80]:
all_gcounts = [all_gendercounts(df,'department') for df in [df_gu_swe, df_gu_eng,df_ch_swe,df_ch_eng]]

In [98]:
all_gcounts_faculty = [all_gendercounts(df,'faculty') for df in [df_gu_swe, df_gu_eng]]

gcounts_gu_faculty = [(a[:-1]+b[1:]) for a in all_gcounts_faculty[0] for b in all_gcounts_faculty[1] if a[0] == b[0]]
df_gcounts_gu_faculty = pd.DataFrame(gcounts_gu_faculty)
df_gcounts_gu_faculty.columns = (['Faculty'] + header[1:])


In [99]:
gcounts_gu_dept = [(a[:-1]+b[1:]) for a in all_gcounts[0] for b in all_gcounts[1] if a[0] == b[0]]
gcounts_ch_dept = [(a[:-1]+b[1:]) for a in all_gcounts[2] for b in all_gcounts[3] if a[0] == b[0]]


In [100]:
df_gcounts_gu_faculty = df_gcounts_gu_faculty[1:] # Remove row of unknown faculty
df_gcounts_gu_faculty = df_gcounts_gu_faculty.sort_values(by=['female Swe','female Eng'], ascending=False).reset_index(drop=True)
df_gcounts_gu_faculty = add_summary_row(df_gcounts_gu_faculty,'Faculty')
df_gcounts_gu_faculty

Unnamed: 0,Faculty,male Swe,female Swe,mixed Swe,male Eng,female Eng,mixed Eng,unknown
0,Samhällsvetenskapliga fakulteten,1233,3786,509,317,562,14,21
1,Sahlgrenska akademin,199,1828,216,105,176,8,4
2,Utbildningsvetenskapliga fakulteten,264,1689,119,55,92,1,1
3,Handelshögskolan,1225,1631,541,1249,1107,450,38
4,Humanistiska fakulteten,423,958,10,195,307,1,4
5,Konstnärliga fakulteten,227,433,4,83,124,3,7
6,Naturvetenskapliga fakulteten,182,364,38,62,61,6,3
7,IT-fakulteten,420,265,104,497,238,129,26
8,Total:,4173,10954,1541,2563,2667,612,104


In [102]:
def add_summary_row(gc,typ) : 
    new_row = {typ : 'Total:', 'male Swe':gc['male Swe'].sum(),
           'female Swe':gc['female Swe'].sum(),
           'mixed Swe':gc['mixed Swe'].sum(),
           'mixed Eng':gc['mixed Eng'].sum(),
           'female Eng':gc['female Eng'].sum(),
           'male Eng':gc['male Eng'].sum(),
           'unknown' : gc['unknown'].sum()}
    gc = gc.append(new_row,ignore_index=True)  
    return gc

df_gcounts_gu = pd.DataFrame(gcounts_gu_dept)
df_gcounts_ch = pd.DataFrame(gcounts_ch_dept)

gcounts_no_dept_gu  = df_gcounts_gu[0:1]
gcounts_no_dept_ch  = df_gcounts_ch[0:1]

df_gcounts_gu = df_gcounts_gu[1:]
df_gcounts_ch = df_gcounts_ch[1:]

header  = ['Department','male Swe','female Swe','mixed Swe','male Eng','female Eng','mixed Eng','unknown']
df_gcounts_ch.columns = header
df_gcounts_gu.columns = header


df_gcounts_gu = df_gcounts_gu.sort_values(by=['female Swe','female Eng'], ascending=False).reset_index(drop=True)
df_gcounts_ch = df_gcounts_ch.sort_values(by=['female Swe','female Eng'], ascending=False).reset_index(drop=True)



gcounts_gu_dept = add_summary_row(df_gcounts_gu,'Department')
gcounts_ch_dept = add_summary_row(df_gcounts_ch,'Department')

In [83]:
gcounts_gu_dept

Unnamed: 0,Department,male Swe,female Swe,mixed Swe,male Eng,female Eng,mixed Eng,unknown
0,Institutionen för sociologi och arbetsvetenskap,438,1694,235,61,120,5,8
1,Institutionen för vårdvetenskap och hälsa,135,1391,165,3,19,5,0
2,Företagsekonomiska institutionen,815,950,481,310,341,147,7
3,Institutionen för socialt arbete,135,914,113,28,102,6,3
4,Institutionen för pedagogik och specialpedagogik,92,741,43,20,43,0,0
5,Juridiska institutionen,299,539,6,48,91,5,2
6,Institutionen för journalistik och masskommuni...,206,442,120,13,19,3,0
7,Institutionen för kost- och idrottsvetenskap,90,429,55,17,14,1,0
8,Institutionen för svenska språket,107,370,3,1,1,0,0
9,Institutionen för didaktik och pedagogisk prof...,60,364,13,5,8,0,1


In [107]:
gcounts_ch_dept

Unnamed: 0,Department,male Swe,female Swe,mixed Swe,male Eng,female Eng,mixed Eng,unknown
0,Institutionen för arkitektur och samhällsbyggn...,529,415,202,1044,860,166,55
1,Institutionen för industri- och materialvetenskap,227,81,129,550,208,118,23
2,Institutionen för rymd- och geovetenskap,50,21,44,567,235,79,24
3,Institutionen för biologi och bioteknik,7,19,28,218,178,5,9
4,Institutionen för mekanik och maritima vetensk...,178,19,58,756,119,80,42
5,Institutionen för elektroteknik,163,16,73,861,159,102,21
6,Institutionen för data och informationsteknik,183,9,62,1047,158,174,48
7,Institutionen för kemi och kemiteknik,12,7,35,120,98,9,3
8,Institutionen för fysik,13,3,19,390,126,21,9
9,COMESA,1,3,0,3,2,0,0


In [110]:
import dataframe_image as dfi
import imgkit

dfi.export(gcounts_ch_dept,"gendercounts_ch_department.png")
dfi.export(gcounts_gu_dept,"gendercounts_gu_department.png")
dfi.export(df_gcounts_gu_faculty,"gendercounts_gu_faculty.png")

In [112]:
df_gu.to_csv("gu.csv")
df_ch.to_csv("ch.csv")

In [113]:
df_gu.to_json("/Users/annl/Desktop/GenieProjekt/cleaned_GUPEA-studentuppsatser-210305.json",orient='records',lines=True)
df_ch.to_json("/Users/annl/Desktop/GenieProjekt/cleaned_CTH-ODR-studentuppsatser-210305.json",orient='records',lines=True)