# Data and Libraries

In [1]:
import pandas as pd
import pickle
import time
import random
import matplotlib.pyplot as plt
import numpy as np
DATA_PATH = "data defi ia/"
categories_label = pd.read_csv(DATA_PATH+'/categories_string.csv')

In [9]:
df = pd.read_json(DATA_PATH+"/train.json")
test_df = pd.read_json(DATA_PATH+"/test.json")
df_label = pd.read_csv(DATA_PATH+"/train_label.csv")

In [10]:
df = df.merge(df_label,on='Id')
df['job'] = categories_label.loc[df['Category'].values,'0'].values
df["description_lower"] = [x.lower() for x in df.description]
df.head()

Unnamed: 0,Id,description,gender,Category,job,description_lower
0,0,She is also a Ronald D. Asmus Policy Entrepre...,F,19,professor,she is also a ronald d. asmus policy entrepre...
1,1,He is a member of the AICPA and WICPA. Brent ...,M,9,accountant,he is a member of the aicpa and wicpa. brent ...
2,2,Dr. Aster has held teaching and research posi...,M,19,professor,dr. aster has held teaching and research posi...
3,3,He runs a boutique design studio attending cl...,M,24,architect,he runs a boutique design studio attending cl...
4,4,"He focuses on cloud security, identity and ac...",M,24,architect,"he focuses on cloud security, identity and ac..."


In [11]:
test_df["description_lower"] = [x.lower() for x in test_df.description]
test_df.head()

Unnamed: 0,Id,description,gender,description_lower
3,0,She currently works on CNN’s newest primetime...,F,she currently works on cnn’s newest primetime...
6,1,Lavalette’s photographs have been shown widel...,M,lavalette’s photographs have been shown widel...
11,2,Along with his academic and professional deve...,M,along with his academic and professional deve...
17,3,She obtained her Ph.D. in Islamic Studies at ...,F,she obtained her ph.d. in islamic studies at ...
18,4,She studies issues of women and Islam and has...,F,she studies issues of women and islam and has...


## Text stemming :

In [34]:
import nltk
t0 = time.time()
ps = nltk.stem.PorterStemmer()

test_df['description_lower_stemmed'] = test_df['description_lower'].apply(
            lambda text : ' '.join(ps.stem(word) for word in text.split()))
print('test dataset took',time.time() - t0 ,'seconds')
test_df.head()

test dataset took 1603889092.0931044 seconds


Unnamed: 0,Id,description,gender,description_lower,description_lower_stemmed
3,0,She currently works on CNN’s newest primetime...,F,she currently works on cnn’s newest primetime...,"she current work on cnn’ newest primetim show,..."
6,1,Lavalette’s photographs have been shown widel...,M,lavalette’s photographs have been shown widel...,lavalette’ photograph have been shown wide and...
11,2,Along with his academic and professional deve...,M,along with his academic and professional deve...,along with hi academ and profession developmen...
17,3,She obtained her Ph.D. in Islamic Studies at ...,F,she obtained her ph.d. in islamic studies at ...,she obtain her ph.d. in islam studi at duke un...
18,4,She studies issues of women and Islam and has...,F,she studies issues of women and islam and has...,she studi issu of women and islam and ha writt...


In [35]:
t0 = time.time()
ps = nltk.stem.PorterStemmer()

df['description_lower_stemmed'] = df['description_lower'].apply(
            lambda text : ' '.join(ps.stem(word) for word in text.split()))
print('train dataset took',time.time() - t0 ,'seconds')
df.head()

train dataset took 1603889354.955367 seconds


Unnamed: 0,Id,description,gender,description_lower,description_lower_stemmed
3,0,She currently works on CNN’s newest primetime...,F,she currently works on cnn’s newest primetime...,"she current work on cnn’ newest primetim show,..."
6,1,Lavalette’s photographs have been shown widel...,M,lavalette’s photographs have been shown widel...,lavalette’ photograph have been shown wide and...
11,2,Along with his academic and professional deve...,M,along with his academic and professional deve...,along with hi academ and profession developmen...
17,3,She obtained her Ph.D. in Islamic Studies at ...,F,she obtained her ph.d. in islamic studies at ...,she obtain her ph.d. in islam studi at duke un...
18,4,She studies issues of women and Islam and has...,F,she studies issues of women and islam and has...,she studi issu of women and islam and ha writt...


## Indifferenciation :

Firstly, we will replace all words like he-she-her-his by their male equivalent : "he-he-his-his".
By doing this we will conserve the meaning of the sentences and decorrelate the job from the gender (partially).
It will also help decorrelate first names from gender, but not from proffessions.
In order to do this, we created an excel file that will make the correspondance between the word to be replaced and its target word. A sort of "converter".

This is why in a second time, we will replace every first name we found by an unique first or word, it does not matter.
We are using this file to detect every first name and replace it :


https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/vital-events/names/babies-first-names/full-lists-of-babies-first-names-2010-to-2014




### Removing first names :

In [36]:
english_names = pd.read_excel("list_of_english_names.xlsx")
male_names = pd.Series.dropna(english_names.loc[6:450,"Unnamed: 1"]).values
female_names = pd.Series.dropna(english_names.loc[6:450,"Unnamed: 5"]).values
names = list(male_names) + list(female_names)
names = [name.lower() for name in names]
for name in names :
    if len(name)<3 :
        names.remove(name)
random.shuffle(names)
print('5 first names : ', names[:5],f'\n{len(names)} first names in total.')

5 first names :  ['billy', 'ibrahim', 'albie', 'belle', 'scott'] 
890 first names in total.


In [38]:
t0 = time.time()
test_df['description final'] = test_df['description_lower_stemmed'].apply(lambda text : ' '.join(word for word in text.split() if not word in names)
).values
print("done in {} seconds".format(int(time.time() - t0)))
test_df.head()

done in 35 seconds


Unnamed: 0,Id,description,gender,description_lower,description_lower_stemmed,description final
3,0,She currently works on CNN’s newest primetime...,F,she currently works on cnn’s newest primetime...,"she current work on cnn’ newest primetim show,...","she current work on cnn’ newest primetim show,..."
6,1,Lavalette’s photographs have been shown widel...,M,lavalette’s photographs have been shown widel...,lavalette’ photograph have been shown wide and...,lavalette’ photograph have been shown wide and...
11,2,Along with his academic and professional deve...,M,along with his academic and professional deve...,along with hi academ and profession developmen...,along with hi academ and profession developmen...
17,3,She obtained her Ph.D. in Islamic Studies at ...,F,she obtained her ph.d. in islamic studies at ...,she obtain her ph.d. in islam studi at duke un...,she obtain her ph.d. in islam studi at duke un...
18,4,She studies issues of women and Islam and has...,F,she studies issues of women and islam and has...,she studi issu of women and islam and ha writt...,she studi issu of women and islam and ha writt...


In [40]:
t0 = time.time()
df['description final'] = df['description_lower_stemmed'].apply(lambda text : ' '.join(word for word in text.split() if not word in names)).values
print("done in {} seconds".format(int(time.time() - t0)))
df.head()

done in 144 seconds


Unnamed: 0,Id,description,gender,Category,job,description_lower,description_lower_stemmed,description final
0,0,She is also a Ronald D. Asmus Policy Entrepre...,F,19,professor,she is also a ronald d. asmus policy entrepre...,she is also a ronald d. asmu polici entreprene...,she is also a ronald d. asmu polici entreprene...
1,1,He is a member of the AICPA and WICPA. Brent ...,M,9,accountant,he is a member of the aicpa and wicpa. brent ...,he is a member of the aicpa and wicpa. brent g...,he is a member of the aicpa and wicpa. brent g...
2,2,Dr. Aster has held teaching and research posi...,M,19,professor,dr. aster has held teaching and research posi...,dr. aster ha held teach and research posit at ...,dr. aster ha held teach and research posit at ...
3,3,He runs a boutique design studio attending cl...,M,24,architect,he runs a boutique design studio attending cl...,he run a boutiqu design studio attend client i...,he run a boutiqu design studio attend client i...
4,4,"He focuses on cloud security, identity and ac...",M,24,architect,"he focuses on cloud security, identity and ac...","he focus on cloud security, ident and access m...","he focus on cloud security, ident and access m..."


## Learning vocabularies specific to professions :

In this cell, for each profession we obtain the 75 words which are the most specific to that very profession.
We add theses words to a global vocabulary.

Next, we will use this global vocabulary to filter out words that does not belong to it.

By doing this we hope to catch the specific vocabulary for each profession with 

In [74]:
GLOBAL_VOCABULARY = []
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
jobs = categories_label['0'].values
for job in tqdm(jobs) :
    current_data = df.loc[df['job'] == job,]
    current_transformer = CountVectorizer(stop_words='english', max_features=150)
    current_transformer.fit(current_data['description final']);
    GLOBAL_VOCABULARY += current_transformer.get_feature_names()
GLOBAL_VOCABULARY = np.unique(GLOBAL_VOCABULARY)
print(len(GLOBAL_VOCABULARY))

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:20<00:00,  1.38it/s]


1169


In [75]:
test_df['final_description'] = test_df['description_lower'].apply(lambda text : ' '.join(word for word in text.split() if word in GLOBAL_VOCABULARY))
print("test df done")
df['final_description'] = df['description_lower'].apply(lambda text : ' '.join(word for word in text.split() if word in GLOBAL_VOCABULARY))

test df done


In [48]:
test_df.to_json("preprocessed/preprocessed_test2.json")
df.to_json("preprocessed/preprocessed_train2.json")

## Loading pre-processed data :

In [49]:
test_df = pd.read_json("preprocessed/preprocessed_test2.json")
df = pd.read_json("preprocessed/preprocessed_train2.json")

Here we are splitting the train_dataset in 90% training and 10% validation

In [106]:
from sklearn.model_selection import train_test_split
df_train, df_validation = train_test_split(df, test_size=.1,shuffle=True,random_state= 42)

### Text vectorization :

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from tensorflow.keras.utils import to_categorical

transformer = TfidfVectorizer(stop_words=None)

X_train = transformer.fit_transform(df_train['final_description'].values).toarray()
Y_train = df_train['Category'].values#to_categorical(df_train['Category'].values)

X_validation  = transformer.transform(df_validation['final_description'].values).toarray()
Y_validation = df_validation['Category'].values#to_categorical(df_validation['Category'].values)

X_train.shape

(195477, 906)

KeyboardInterrupt: 

## Deep neural network :

In [79]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout

In [84]:
NUM_CATEGORIES = 28
activation_function = 'tanh'
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1],activation="exponential"))
model.add(Dense(128, activation=activation_function))
model.add(Dropout(.25))
model.add(Dense(128, activation=activation_function))


model.add(Dense(NUM_CATEGORIES,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer="adam",metrics=['CategoricalAccuracy'])

In [86]:
model.fit(X_train,Y_train, epochs = 5,validation_data = (X_validation, Y_validation),batch_size= 64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x175938f4f98>

In [None]:
model.score(X_validation,Y_validation)

## Macro F1 Score : on 20% of the training set (our validation set)

In [33]:
from sklearn.metrics import f1_score

y_validation = df_validation['Category'].values
y_pred = model.predict_classes(X_validation);
print(f1_score(y_validation, y_pred, average='macro'))


0.6713323622815818


## Measuring the Fairness score :

In [34]:
X_test  = transformer.transform(test_df['final_description'].values).toarray()
test_df['Category'] =  model.predict_classes(X_test)
test_df['job'] = categories_label.loc[test_df['Category'],'0'].values
test_df.head()

Unnamed: 0,Id,description,gender,description_lower,description final,final_description,Category,job
3,0,She currently works on CNN’s newest primetime...,F,she currently works on cnn’s newest primetime...,currently works cnns newest primetime parker s...,currently works guest culture previously produ...,6,journalist
6,1,Lavalette’s photographs have been shown widel...,M,lavalette’s photographs have been shown widel...,lavalettes photographs shown widely editorial ...,photographs shown widely editorial work publis...,20,photographer
11,2,Along with his academic and professional deve...,M,along with his academic and professional deve...,academic professional development gabriel live...,academic professional studied worked new exper...,19,professor
17,3,She obtained her Ph.D. in Islamic Studies at ...,F,she obtained her ph.d. in islamic studies at ...,obtained phd islamic studies duke university s...,obtained studies university specializes women ...,19,professor
18,4,She studies issues of women and Islam and has...,F,she studies issues of women and islam and has...,studies issues women islam written polygamy af...,studies issues women written science,19,professor


## Fairness metric

Goal : be as close as the minimum value which is 1.

In [35]:
# function used to calculate the fairness score :

def macro_disparate_impact(people):
    counts = people.groupby(['job', 'gender']).size().unstack('gender')
    counts['disparate_impact'] = counts[['M', 'F']].max(axis='columns') / counts[['M', 'F']].min(axis='columns')
    return counts['disparate_impact'].mean()

In [36]:
print('\nFairness metric for the training set :',macro_disparate_impact(df))
print('\nFairness metric for the test set     :',macro_disparate_impact(test_df))


Fairness metric for the training set : 3.898171170378378

Fairness metric for the test set     : 3.065824823528619


# File Generation

In [None]:
test_df["Category"] = predictions
baseline_file = test_df[["Id","Category"]]
baseline_file.to_csv("/kaggle/working/baseline.csv", index=False)

In [113]:
df['description_lower'][13]

" she earned her bachelor of arts degree from hofstra university and holds a master's degree in teaching from montclair state university. vicky has been performing for over 35 years and has directed productions for the past 25 years, both in the community and professional venues. her teaching credits include instructing students at northwestern university and the roundabout theatre company in new york city."