In [1]:
import pandas as pd
import re

In [2]:
def parse_line(line):
    attributes = {}
    pairs = re.findall(r'(\w+): (\S+)', line)
    for key, value in pairs:
        attributes[key] = value
    return attributes

# Function to create DataFrame from input text
def create_dataframe(filename, col="groundtruth"):
    df = pd.read_csv(filename, sep=',')
    data = []

    for _, row in df.iterrows():
        attributes = parse_line(row[col])
        attributes["sex"] = row["sex"]
        data.append(attributes)

    return pd.DataFrame(data)

# Create DataFrame
df_gt = create_dataframe("data_raw.csv")

## Parse `groundtruth` column

In [3]:
df_gt

Unnamed: 0,surname,firstname,occupation,link,age,sex,employer,birth_date,lob,civil_status,observation
0,Chardon,Marie,idem,fille,30,femme,,,,,
1,Lhopital,Louis-Jean,sp,chef,67,homme,,,,,
2,Papin,Marie,idem,idem,15,femme,idem,,,,
3,Lavocat,Marie,,femme,,femme,,1875,Rigny,,
4,Benne,Marguerite,,,78,femme,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
236,Burlurut,Pétronille,sans,épouse,61,femme,,,,,
237,Combey,Alexandre,idem,son,39,homme,,,,,
238,Collin,Marguerite,idem,épouse,38,femme,idem,,,,
239,Dumont,Etienne,,fils,,homme,,1900,idem,,


In [4]:
cols = df_gt.columns.tolist()
cols = cols[:5] + cols[6:] + [cols[5]]
cols

['surname',
 'firstname',
 'occupation',
 'link',
 'age',
 'employer',
 'birth_date',
 'lob',
 'civil_status',
 'observation',
 'sex']

In [23]:
df_gt.to_csv("gt_processed.csv", index=False)

In [6]:
cols = ['surname',
        'firstname',
        'occupation',
        'link',
        'age',
        'sex']

In [8]:
df_gt = df_gt[cols]
df_gt

Unnamed: 0,surname,firstname,occupation,link,age,sex
0,Chardon,Marie,idem,fille,30,femme
1,Lhopital,Louis-Jean,sp,chef,67,homme
2,Papin,Marie,idem,idem,15,femme
3,Lavocat,Marie,,femme,,femme
4,Benne,Marguerite,,,78,femme
...,...,...,...,...,...,...
236,Burlurut,Pétronille,sans,épouse,61,femme
237,Combey,Alexandre,idem,son,39,homme
238,Collin,Marguerite,idem,épouse,38,femme
239,Dumont,Etienne,,fils,,homme


In [9]:
# Function to transform text in DataFrame to lowercase
def lowercase_text(df):
    for column in ["surname", "firstname", "occupation", "link"]:
        if df[column].dtype == 'object':  # Check if column contains text
            df[column] = df[column].str.lower()
    return df

# Transform text to lowercase
df_lower_gt = lowercase_text(df_gt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower()


In [10]:
df_lower_gt

Unnamed: 0,surname,firstname,occupation,link,age,sex
0,chardon,marie,idem,fille,30,femme
1,lhopital,louis-jean,sp,chef,67,homme
2,papin,marie,idem,idem,15,femme
3,lavocat,marie,,femme,,femme
4,benne,marguerite,,,78,femme
...,...,...,...,...,...,...
236,burlurut,pétronille,sans,épouse,61,femme
237,combey,alexandre,idem,son,39,homme
238,collin,marguerite,idem,épouse,38,femme
239,dumont,etienne,,fils,,homme


In [11]:
df_lower_gt.to_csv("data.csv", index=False)

## Parse `prediction` column

In [20]:
df_pred = create_dataframe("transcriptions_with_sex.csv", col="prediction")
df_pred.sample(20)

Unnamed: 0,nom,prénom,date_naissance,lieux_naissance,sex,employeur,relation,profession,état_civil,éducation
211,Thély,Jean,1892,Coulange,homme,,id,id,,
44,Dumont,Etienne,1881,Coulonge,homme,patron!18371,chef,culviciste,,
178,Vacter,Marguerite,28,P,femme,,fomme,,v,
12,Raduron,Anne,53,s,femme,,p,,s,
202,Dory,Jacques,35,d,homme,,Chef,eublinateur,,
10,Cofain,Jacques,12,P,homme,,,,,
221,Haureau,Célina,1887,Lasset,femme,,employée,,,
97,Demay,Joséphine,27,iant,femme,,,,M,
167,Bauchard,Joseph,1863,Cher,homme,,chef,naveur,,
234,Cognet,Ferd,53,sans,homme,,chef,,,


In [21]:
cols = df_pred.columns.tolist()
cols = cols[:4] + cols[5:] + [cols[4]]
cols

['nom',
 'prénom',
 'date_naissance',
 'lieux_naissance',
 'employeur',
 'relation',
 'profession',
 'état_civil',
 'éducation',
 'sex']

In [26]:
df_pred.to_csv("preds_processed.csv", index=False)

In [28]:
set(df_pred["éducation"])

{'m', nan}

In [29]:
set(df_pred["état_civil"])

{"'",
 "''",
 'C',
 'Ch',
 'Chef',
 'Cheus)',
 'D',
 'M',
 'S',
 'V',
 'Vitant',
 'id',
 'm',
 'marns',
 'marnt',
 'me',
 'meunicien',
 'mère',
 nan,
 's',
 'sse',
 'v'}

In [31]:
for s in set(df_pred["relation"]):
    print(s)

camprof
''
petite
cuisinière
C
nan
Gonde
belle
cultinière
enfant
ouvrier
d°
noraise
coiffeur
Chef
Par
Rouher
bend
chausne
domest
Empl
a
id
io
Emp
argentière
Ch
fimme
pard
f
D
bent
coms
patissier
Enfant
sa
d
Ep
P
fins
'h
emllisat
couturière
noant
fille
cullinière
sans
femme
employée
fils
B
par
epre
p
frère
F
chef
ep
domestique
ch
néant
se
ép
"
patron
s
inf
chemin
prof
fomme
nevur
éprest
enf
Chemin
chacon


In [None]:
cols= ['nom',
 'prénom',
 'date_naissance',
 'lieux_naissance',
 'employeur',
 'relation',
 'profession',
 'état_civil',
 'éducation',
 'sex']

## Processing `firstname_with_sex`

In [32]:
firstname_with_sex = pd.read_csv("firstname_with_sex.csv", sep=";")
firstname_with_sex.head()

Unnamed: 0,firstname,male,female
0,marie,10145,2390322
1,jean,1869615,6476
2,pierre,1475841,5047
3,jeanne,1765,1097397
4,françois,1089009,5951


In [33]:
firstname_with_sex["frequency"] = firstname_with_sex["female"] / (firstname_with_sex["female"] + firstname_with_sex["male"])


In [34]:
firstname_with_sex.to_csv("firstname_with_frequency.csv")