In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

from sentence_transformers import SentenceTransformer

In [2]:
# Load the passenger table from the repository-relative CSV.
titanic_csv_path = 'data/titanic.csv'
titanic = pd.read_csv(titanic_csv_path)

In [3]:
# Peek at the first two passengers.
titanic.iloc[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
# Column that will hold the deliberately corrupted copy of `Sex`.
uncleaned_name = "uncleaned"

In [5]:
# Start the noisy column as an exact copy of the clean labels.
titanic[uncleaned_name] = titanic['Sex'].copy()

In [6]:
# Noisy stand-ins for each clean `Sex` label: other-language words,
# misspellings and a title. Keys are the clean labels; values are the
# candidate replacements drawn from when corrupting the column.
messing_up_dict = dict(
    female=['kobieta', 'mujer', 'kobiet', 'dziewczyna', 'femme', 'famale'],
    male=['facet', 'Mr.', 'monsieur', 'm ale'],
)

In [7]:
# Corrupt a reproducible 20% of each sex's rows with a random noisy variant.
# The seed fixes both which rows are picked and which variant each one receives,
# so re-running this cell (or the whole notebook) produces the same noisy column
# instead of a fresh draw every time.
noise_seed = 42
noise_rng = np.random.default_rng(noise_seed)
for k, v in messing_up_dict.items():
    # Rows are selected on the untouched `Sex` column, not on the noisy one.
    sampled_indexes = titanic[titanic['Sex'] == k].sample(frac=0.2, random_state=noise_seed).index
    # One vectorised assignment replaces the per-row `.loc` loop.
    titanic.loc[sampled_indexes, uncleaned_name] = noise_rng.choice(v, size=len(sampled_indexes))

In [8]:
# Frequency of every label (clean and noisy) in the corrupted column.
titanic.loc[:, uncleaned_name].value_counts()

male          462
female        251
m ale          39
monsieur       36
facet          24
Mr.            16
kobieta        13
femme          12
dziewczyna     12
mujer          11
famale          8
kobiet          7
Name: uncleaned, dtype: int64

In [9]:
# One row per distinct noisy label, with the number of passengers carrying it.
uncleaned = (
    titanic
    .groupby(uncleaned_name)['PassengerId']
    .count()
    .reset_index()
)

In [10]:
# Distinct labels found in the noisy column (one entry per group);
# these are the strings that get embedded in the next step.
sex = uncleaned.loc[:, uncleaned_name]

In [11]:
# Embed each distinct label with a pretrained multilingual sentence-transformer.
sbert_model_name = 'distiluse-base-multilingual-cased'
sbert_model = SentenceTransformer(sbert_model_name)
sentence_embeddings = sbert_model.encode(sex)

In [12]:
# Project the label embeddings onto their first principal component, giving one
# scalar per distinct label. Keeping a single component is a modelling choice
# (the original note attributes it to business knowledge about the groups).
embedding_frame = pd.DataFrame(sentence_embeddings)
pca = PCA(n_components=1)
pca_comp = pca.fit_transform(embedding_frame)

In [13]:
# Attach each label's 1-D PCA score back onto every passenger row.
# The join key is named explicitly instead of relying on "all shared columns",
# and `validate` makes pandas raise if a label appears more than once in the
# lookup table, which would otherwise silently duplicate passenger rows.
label_scores = pd.DataFrame({uncleaned_name: sex, 'pca': [row[0] for row in pca_comp]})
titanic_ext = titanic.merge(label_scores, on=uncleaned_name, validate='many_to_one')

In [14]:
# sanity check
# titanic_ext.groupby('Sex').agg({'pca': 'mean'})

In [15]:
# Fixed seed so the forest (bootstrap samples and split choices) is identical
# on every run and the reported accuracies can be reproduced.
rf = RandomForestClassifier(random_state=42)

In [16]:
# Target vector: the survival flag of every merged passenger row.
y = titanic_ext.loc[:, 'Survived']

In [21]:
# Model A: the single PCA-derived feature built from the noisy labels.
# Predictions are made on the same rows used for fitting, so the value shown
# is training accuracy.
X = titanic_ext[['pca']]
y_pred_pca = rf.fit(X, y).predict(X)
accuracy_score(y_true=y, y_pred=y_pred_pca)

0.7867564534231201

In [22]:
# Model B (baseline): the clean `Sex` column, one-hot encoded with the first
# level dropped. As above, predictions are made on the fitting rows, so the
# value shown is training accuracy.
sex_only = titanic_ext[['Sex']]
X = pd.get_dummies(sex_only, drop_first=True)
y_pred_dummy = rf.fit(X, y).predict(X)
accuracy_score(y_true=y, y_pred=y_pred_dummy)

0.7867564534231201

In [23]:
# Number of passengers on which the two models' predictions disagree.
(y_pred_dummy != y_pred_pca).sum()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,