In [97]:
#Importing required packages for gender prediction
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from csv import writer
from sklearn.tree import DecisionTreeClassifier


#Importing required packages for creating gender bias visualisation
import matplotlib
import openpyxl
import seaborn as sns

#Importing required packages for generating geographical visualisation
import geonamescache
import folium
from folium.plugins import MarkerCluster



In [121]:
# Training data for the machine-learning model that classifies names by gender.
# Inspired by: https://github.com/Jcharis/Python-Machine-Learning/blob/master/Gender%20Classification%20With%20%20Machine%20Learning/Gender%20Classification%20of%20Names%20With%20Machine%20Learning.ipynb
df = pd.read_csv('names_dataset.csv') #dataset with names and their attached gender
df_names = df  # alias: df_names and df refer to the same DataFrame object

# Encode the labels numerically (F -> 0, M -> 1). The result is assigned back to
# the column instead of calling replace(..., inplace=True) on `df_names.sex`:
# an in-place call on a column accessor is a chained assignment, which newer
# pandas versions warn about and may not propagate to the frame.
df_names['sex'] = df_names['sex'].replace({'F': 0, 'M': 1})

# Bag-of-words representation of the raw names and the matching label vector.
Xfeatures = df_names['name']
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)
y = df_names.sex

def features(name):
    """Return the prefix/suffix features of a name used by the gender classifier.

    The name is lower-cased and its first 1-3 and last 1-3 characters are
    returned in a dict. Slices are used for every entry, so names shorter than
    three characters (including the empty string) yield shorter (or empty)
    feature values instead of raising IndexError.
    """
    name = name.lower()
    return {
        'first-letter': name[:1], # First letter
        'first2-letters': name[:2], # First 2 letters
        'first3-letters': name[:3], # First 3 letters
        'last-letter': name[-1:], # Last letter
        'last2-letters': name[-2:], # Last 2 letters
        'last3-letters': name[-3:], # Last 3 letters
    }


# Wrap the per-name feature extractor so it can be applied to a whole
# list/Series of names. otypes=[object] states the output type explicitly
# (each element is a dict), so numpy does not need a trial call on the first
# element and empty inputs are handled.
features = np.vectorize(features, otypes=[object])
df_X = features(df_names['name'])
df_y = df_names['sex']

# Hold out part of the data so the model can be scored on names it has not
# seen. The fixed seed keeps the split (and the fitted tree) reproducible.
RANDOM_SEED = 42
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(
    df_X, df_y, test_size=0.33, random_state=RANDOM_SEED
)

# Fit the vectorizer on the training features so its vocabulary covers every
# prefix/suffix value seen in training. Fitting it only on the two example
# names below would drop every feature value those two names do not contain.
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(dfX_train)
transformed = dv.transform(corpus)  # example encoding of the two sample names

dclf = DecisionTreeClassifier(random_state=RANDOM_SEED)
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

def genderpredictor1(a):
    """Predict the gender of a single first name.

    The name is turned into prefix/suffix features, encoded with the fitted
    DictVectorizer `dv` and classified by the fitted decision tree `dclf`.
    Returns "Female" when the predicted label equals 0, otherwise "Male".
    """
    encoded_name = dv.transform(features([a])).toarray()
    predicted_label = dclf.predict(encoded_name)
    return "Female" if predicted_label == 0 else "Male"

In [None]:
#testing accuracy of ML model on the test data (dfX_test / dfy_test)
print(dclf.score(dv.transform(dfX_test), dfy_test))

In [None]:
#applying ML model to data, preliminary steps
def createList(n):
    """Return the list of integers 0, 1, ..., n (empty when n is negative)."""
    return list(range(n + 1))

filename = "C://Users/Laurens/Desktop/ODISSEI/Dataset_Version_2.csv" #Change to location where dataset is downloaded

# The former date_parser="Publication Year" / infer_datetime_format=True
# arguments are dropped: date_parser expects a callable (not a column name) and
# is only used together with parse_dates, so they had no effect on the loaded
# data, and both keywords are deprecated in pandas 2.0.
dataset = pd.read_csv(filename, sep=';')

# Running index (0, 1, 2, ...) used as dictionary keys for the researches.
# It always covers at least 0..1000 as before, and grows with the dataset so
# that zip()-based loops over it do not silently drop rows beyond the 1001st.
list_entries = createList(max(1000, len(dataset) - 1))

In [None]:
#Creating dictionary with list of first names per research
first_names_per_research_dict = {}
final_list_per_research = []

for authors in dataset["Author Full Names"]:
    list_first_names_per_research = []
    improved_list_first_names_per_research = []

    # An empty author cell is read by pandas as NaN (a float), which has no
    # .split(); treat it as a research without authors so that the positions
    # in final_list_per_research stay aligned with the dataset rows.
    split_per_research = authors.split(";") if isinstance(authors, str) else []

    for x in split_per_research:
        # Take the text after the first comma; when the entry contains no
        # comma, fall back to the whole entry.
        name_parts = x.split(",")
        list_first_names_per_research.append(
            name_parts[1] if len(name_parts) > 1 else name_parts[0]
        )

    for f in list_first_names_per_research:
        # Keep only the first word; blank entries (e.g. produced by a trailing
        # ';') contain no word and are skipped instead of raising IndexError.
        words = f.split()
        if words:
            improved_list_first_names_per_research.append(words[0])

    final_list_per_research.append(improved_list_first_names_per_research)

# Key every research by its position (0, 1, 2, ...). enumerate() yields the
# same keys as zipping with a 0..1000 index list, without stopping after
# 1001 researches.
for b, f in enumerate(final_list_per_research):
    first_names_per_research_dict[b] = f

In [None]:
#calling names from dictionary, generate list of predicted genders per research
dictionary_genders = {}
final_list_genders_per_research = []
list_male_female = []

# One pass over the researches: predict a gender for every contributor and
# store the result under the same key the names are stored under. Filling the
# dictionary here, rather than re-zipping the whole result list inside the
# loop, avoids rebuilding the dictionary on every iteration.
for b, persons in first_names_per_research_dict.items(): #loop over different reports in dictionary
    list_of_genders = [genderpredictor1(person) for person in persons] #individuals that contributed to 1 report
    final_list_genders_per_research.append(list_of_genders)
    dictionary_genders[b] = list_of_genders

In [None]:
#exporting results
# The file is opened in append mode, so every run of this cell adds another
# row to CSVFILE.csv.
# NOTE(review): writerow() emits ONE csv row in which each field is the text
# form of one research's list of names. If one row per research is intended,
# writerows() would be the call to use -- confirm the desired layout.
with open('CSVFILE.csv', 'a', newline='') as f_object:
    writer_object = writer(f_object)
    writer_object.writerow(final_list_per_research)
    # No explicit close(): the with-block closes the file on exit.

In [None]:
#Function for generating longitude and latitude based on city name
_geonames_cache = None  # created on first use, then shared by all geolocate() calls


def geolocate(city):
    """Look up the coordinates of a city by name.

    Returns a (latitude, longitude) tuple taken from the first match of
    GeonamesCache.search_cities(city), or the string "error city not found"
    when there is no match.
    """
    global _geonames_cache
    if _geonames_cache is None:
        # Reuse one GeonamesCache instance instead of constructing a new one
        # for every lookup.
        _geonames_cache = geonamescache.GeonamesCache()
    city_info = _geonames_cache.search_cities(city)

    try:
        return (city_info[0]["latitude"], city_info[0]["longitude"])
    except (IndexError, KeyError):
        # Only "no match" / "no coordinates in the record" count as not found;
        # a bare except would also swallow KeyboardInterrupt and real bugs.
        return "error city not found"

In [118]:
#Part of tool that creates visualisation from data

def bias_discovery(filepath_geo, filepath_gender, type_of_visualisation):
    """Create a geographical or gender bias visualisation from prepared data.

    Parameters
    ----------
    filepath_geo : path of a ';'-delimited CSV file. The columns 'Latitude',
        'Longitude', 'City' and 'Percentage_Total' are read from it.
    filepath_gender : path of an Excel file (first column used as index). The
        columns "Ratio female contributors", "Count female" and "Count male"
        are read from it.
    type_of_visualisation : "geographical" or "gender".

    Returns
    -------
    "geographical": a folium.Map with one clustered circle marker per row.
    "gender": the tuple (hist, hist_2) returned by the two DataFrame.hist
        calls; the total female/male counts are printed as well.
    Any other value: None.

    Only the file required by the requested visualisation is read, so the
    other path does not need to point to an existing file.
    """
    if type_of_visualisation == "geographical":
        df = pd.read_csv(filepath_geo, delimiter=";")

        #Code inspired by https://towardsdatascience.com/using-python-to-create-a-world-map-from-a-list-of-country-names-cd7480d03b10
        world_map = folium.Map(tiles="cartodbpositron")

        marker_cluster = MarkerCluster().add_to(world_map) #for each coordinate, create circlemarker of user percent

        radius = 5
        popup_template = """City : {}<br>
                        % of Researches in search from area : {}<br>"""

        for i in range(len(df)):
            row = df.iloc[i]  # fetch the row once instead of once per column

            lat = float(row['Latitude'])
            long = float(row['Longitude'])

            popup_text = popup_template.format(row['City'], row['Percentage_Total'])
            folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster)

        return(world_map)

    elif type_of_visualisation == "gender":
        df_2 = pd.read_excel(filepath_gender, index_col=0)

        hist = df_2.hist(column = "Ratio female contributors")
        hist_2 = df_2.hist(column = "Count female", bins = 150)
        total_females = df_2["Count female"].sum()
        total_males = df_2["Count male"].sum()

        print("Total amount of females in your search is:" + " " + str(total_females) + " " + "Total amount of males in your search is:" + " " +  str(total_males))
        return(hist, hist_2)
        
        

In [120]:
#Example of finding geographical bias
filepath_geo_data = "C:/Users/laure/Desktop/ODDISEI/percentage_of_cities.csv" #change to location where dataset was downloaded
# bias_discovery takes both paths as arguments, so the gender dataset path has
# to be defined before this call as well; otherwise this cell raises NameError
# on a fresh kernel.
filepath_gender_data = "C:/Users/laure/Desktop/ODDISEI/Dataset_ODISSEI.xlsx" #change to location where dataset was downloaded

bias_discovery(filepath_geo=filepath_geo_data, filepath_gender=filepath_gender_data, type_of_visualisation="geographical")

In [None]:
# Example: visualise the gender bias in the search results
filepath_gender_data = "C:/Users/laure/Desktop/ODDISEI/Dataset_ODISSEI.xlsx"  # change to location where dataset was downloaded

bias_discovery(
    filepath_geo=filepath_geo_data,
    filepath_gender=filepath_gender_data,
    type_of_visualisation="gender",
)