In [13]:
import pandas as pd
import sys
import numpy as np

path_to_data = "/Users/annl/Desktop/GenieProjekt/jobads/"

# Remove "tillkännagivanden" and duplicate ads
forbidden_strings = 'tillkänn|tillsätt|beslut'

def remove_non_ads(df) :
    df = df[~(df['variant_name'].str.lower().str.contains(forbidden_strings,na = False))]
    df = df[~(df['prefix_text'].str.lower().str.contains(forbidden_strings,na = False))]  
    df = df.drop_duplicates(subset='text')
    df = df.reset_index(drop=True)
    return df

def prepare_dataframe(file) : 
    df = remove_non_ads(pd.read_csv(path_to_data + file))
    return df

df_ltu_en = prepare_dataframe("jobads_ltu_en.csv")
df_ch_en  = prepare_dataframe("jobads_ch_en.csv")
df_gu_en  = prepare_dataframe("jobads_gu_en.csv")
df_ltu_sv = prepare_dataframe("jobads_ltu_sv.csv")
df_ch_sv  = prepare_dataframe("jobads_ch_sv.csv")
df_gu_sv  = prepare_dataframe("jobads_gu_sv.csv")



In [14]:

# pair Swedish and English ads based on project identifier

cols = ['project_id','id','text']

def merge_on_id(df1,df2) :
  
    df = pd.merge(df1[cols],df2[cols], on = "project_id")
    df = df.drop_duplicates(subset='text_x')
    df = df.drop_duplicates(subset='id_y').reset_index()
    return df

ch_merged       = merge_on_id(df_ch_sv,df_ch_en)
ltu_merged      = merge_on_id(df_ltu_sv,df_ltu_en)
gu_merged       = merge_on_id(df_gu_sv,df_gu_en)

all_merged = ltu_merged.append(ch_merged.append(gu_merged))

# All english ads with a Swedish equivalent
df_en_all = all_merged['text_y'].reset_index()

# All Swedish ads with an english equivalent
df_sv_all = all_merged['text_x'].reset_index()

In [15]:
from nltk.translate import gale_church
from nltk import sent_tokenize

# Splits the swedish and english texts into sentences and 
# aligns them according to the gale church algorithm, to get pairs of 
# sentences that are translations of each other.
def align_sentences(text_swe,text_eng) : 
    
    sents_swe = sent_tokenize(text_swe)
    sents_eng = sent_tokenize(text_eng)
    
    # splitting into paragraphs first makes the splits more fine-grained
    paragraphs1 = [p for p in text_swe.split('\n') if p]
    paragraphs2 = [p for p in text_eng.split('\n') if p]

    sents1 = []
    sents2 = []
    for p in paragraphs1 :
        sents = sent_tokenize(p)
        sents1 = sents1 + sents
    for p in paragraphs2 :
        sents = sent_tokenize(p)
        sents2 = sents2 + sents

    lengths_swe = [len(s) for s in sents1]
    lengths_eng = [len(s) for s in sents2]

    alignment_indices = gale_church.align_blocks(lengths_swe,lengths_eng)
    alignment = ([(sents1[i],sents2[j]) for (i,j) in alignment_indices])     
    return alignment


In [16]:
# Aligned sentences of each Swedish-English text pair
aligned_texts = [align_sentences(df_sv_all.iloc[i]['text_x'],
                                 df_en_all.iloc[i]['text_y']) 
                 for i in range(len(df_sv_all))]


In [17]:
import sys
sys.path.append('../')
from genderdecoder import assess


assessments_all = [ assess(eng) for eng in df_en_all['text_y'].values]
                  
# All gendered words that occur in the English texts
coded_words_ = [ 
                ass['masculine_coded_words'] + 
                ass['feminine_coded_words']  
                for ass in (assessments_all)]

coded_words = list(set([ w for ls in coded_words_ for w in ls ]))

# All aligned sentences that contain an English gendered word
filtered_aligned_texts = [(swe,eng) for texts in aligned_texts 
                          for (swe,eng) in texts 
                              if any([coded_word in eng 
                                for coded_word in coded_words])  
                         ]

In [18]:
coded_words

['collaboration/program',
 'defend',
 'persistent',
 'agreement',
 'nurturing',
 'dominated',
 'persistence',
 'logic',
 'dependability',
 'supportive',
 'co-operate',
 'shares',
 'leadership',
 'sharing',
 'cooperate',
 'courage',
 'collaboratively',
 'enthusiastically',
 'commitment',
 'understandable',
 'independently/individually',
 'compete',
 'competitiveness',
 'leading',
 'responsibility',
 'cooperating',
 'decides',
 'self-confident',
 'cooperative',
 'actively',
 'competitive',
 'superior',
 'active',
 'leaderships',
 'intellectual',
 'collaborate',
 'determined',
 'connection',
 'collaborations',
 'yield',
 'dominating',
 'children',
 'emotional',
 'sensitivity',
 'dependent',
 'challenges',
 'responsi',
 'analyse',
 'determining',
 'intellect',
 'decision',
 'enthusiastic',
 'collaborates',
 'dominates',
 'objectives',
 'ambitious',
 'kindly',
 'collaborator',
 'analyses',
 'autonomy',
 'competences',
 'cooperates',
 'kinds',
 'determination',
 'competition',
 'understandin

In [19]:
import unicodedata
import re

# All words occuring in the Swedish ads
all_swedish_words_ = [re.findall('[\wöåä\-]+', text, flags=re.UNICODE+re.IGNORECASE) 
                        for (text,eng) in filtered_aligned_texts]
all_swedish_words  = list(set([w.lower() for ws in all_swedish_words_ for w in ws]))

In [20]:
all_swedish_words

['forsknings-',
 'framförallt',
 'främst',
 'bärförmågan',
 'supportsystem',
 'bakom',
 'fysisk',
 'militära',
 'designprocesser',
 'statsvetenskap',
 'ökade',
 'frågor',
 'utmaningarna',
 'masteruppsatser',
 'företagsekonomiskt',
 'amminex',
 'beredskapskritiska',
 'värdeutbyte',
 'licentiand',
 '2013',
 'het',
 'samverkansprocesser',
 'höggenomströmningsdata',
 'önskemål',
 'ges',
 'publik',
 'kallt',
 'kursutveckling',
 'differentiering',
 'boliden',
 'energivetenskap',
 'exjobb',
 'smörjsystem',
 'distributionsnät',
 'efterbehandling',
 '-metoder',
 'sulfidmineraliseringar',
 'steg',
 'föreskrifterna',
 'fernandez-rodriguez',
 '0911-72618',
 'bildanalysalgoritmer',
 'hyperspektrala',
 'drivkraft',
 'väggar',
 'sökande',
 'dynamik',
 'motor-',
 'multiple',
 'visa',
 'djupinlärning',
 'riktar',
 'bostadssituation',
 'examinerats',
 'biokemisk',
 'fluid-struktur',
 'bortforsling',
 'värmeåtervinning',
 'fall',
 'utmana',
 'variablernas',
 'båt',
 'carlberg',
 'k

In [21]:
# Occurrence matrix for all gendered English words over all sentences
occurrence_matrix_docs      = [[text.lower().count(w) for text in ((all_merged['text_y']).tolist())] for w in coded_words]
occurrence_matrix_sentences = [[eng.lower().count(w) for (swe,eng) in filtered_aligned_texts] for w in coded_words]


# Occurrence matrix for all occurring Swedish words over all sentences
occurrence_matrix_docs_sv      = [[text.lower().count(w) for text in ((all_merged['text_x']).tolist())] for w in all_swedish_words]
occurrence_matrix_sentences_sv = [[swe.lower().count(w) for (swe,eng) in filtered_aligned_texts ] 
                                     for w in all_swedish_words]

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

paired_words = []

# For each gendered word represented in the occurrence matrix,
# find the most similar Swedish word
for i in range (len(occurrence_matrix_sentences)) :
    cos_sims = [cosine_similarity([occurrence_matrix_sentences[i]],occurrence_matrix_sentences_sv)] 
  
    cos_sims = (cos_sims[0].tolist())[0]

    max_sim = max(cos_sims)
    index_max = cos_sims.index(max_sim)
    print("(" + coded_words[i] + ", " + all_swedish_words[index_max] + ")" + " " + str(max_sim))
    paired_words.append((coded_words[i],all_swedish_words[index_max],max_sim))

(collaboration/program, sverige-kina) 1.0
(defend, försvara) 0.6975184488828856
(persistent, miljögifterna) 0.5773502691896258
(agreement, överenskommelse) 0.39593773131515986
(nurturing, byggnadsdesign) 0.4472135954999579
(dominated, växthusdrivande) 0.4999999999999999
(persistence, avmontering) 1.0
(logic, gis) 0.4144268730094697
(dependability, driftsäkerhet) 0.8362420100070908
(supportive, gäster) 0.8498365855987976
(co-operate, klass) 0.18257418583505533
(shares, vindkraften) 0.7071067811865475
(leadership, ledarskap) 0.45176266575807206
(sharing, rawmatcop) 0.32444284226152503
(cooperate, efterfrågad) 0.22633936510629632
(courage, uppmuntra) 0.5343863402176421
(collaboratively, industriparter) 0.5163977794943223
(enthusiastically, hållbarare) 1.0
(commitment, livscykelåtaganden) 0.4629100498862758
(understandable, uppenbart) 0.7071067811865475
(independently/individually, presentationsteknik) 0.7071067811865475
(compete, kompe) 0.6389532301312268
(competitiveness, it-ser

In [23]:

paired_words_docs = []

# Same as above, but using similarities over documents instead of sentences
for i in range (len(occurrence_matrix_docs)) :
    cos_sims = [cosine_similarity([occurrence_matrix_docs[i]],occurrence_matrix_docs_sv)] 
  
    cos_sims = (cos_sims[0].tolist())[0]

    max_sim = max(cos_sims)
    index_max = cos_sims.index(max_sim)
    print("(" + coded_words[i] + ", " + all_swedish_words[index_max] + ")" + " " + str(max_sim))
    paired_words_docs.append((coded_words[i],all_swedish_words[index_max],max_sim))

(collaboration/program, mpc) 1.0
(defend, försvara) 0.8641264122753545
(persistent, oföränderlig) 0.816496580927726
(agreement, överenskommelse) 0.5060243137049899
(nurturing, bostadssituation) 1.0
(dominated, växthusdrivande) 0.5345224838248487
(persistence, avmontering) 1.0
(logic, geologisk) 0.6888051864876661
(dependability, driftsäkerhet) 0.819891591749923
(supportive, gäster) 0.8823529411764706
(co-operate, spann) 0.9999999999999998
(shares, fsi) 1.0
(leadership, ledarskap) 0.5763708787148328
(sharing, utvecklingsmiljöer) 0.6396021490668313
(cooperate, planerad) 0.3373495424699933
(courage, uppmuntra) 0.645709524069209
(collaboratively, laser-icp) 0.5773502691896258
(enthusiastically, yrkesperson) 1.0
(commitment, livscykelåtaganden) 0.6201736729460423
(understandable, licentiand) 1.0
(independently/individually, hybridfordon) 1.0
(compete, kompe) 0.6700328973383107
(competitiveness, examinerats) 0.5773502691896258
(leading, ledande) 0.8178668406284002
(responsibility, a