In [197]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

from sentence_transformers import SentenceTransformer

In [198]:
titanic = pd.read_csv('https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv')

In [199]:
titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [200]:
uncleaned_name = 'uncleaned'

In [201]:
titanic[uncleaned_name] = titanic['Sex']

In [217]:
messing_up_dict = {
    'female': ['kobieta', 'mujer', 'kobiet', 'dziewczyna', 'femme', 'famale'],
    'male': ['facet',  'Mr.', 'monsieur', 'm ale'],
}

In [218]:
for k, v in messing_up_dict.items():
    sampled_indexes = titanic[titanic['Sex']==k].sample(frac=0.2).index
    for i in sampled_indexes:
        titanic.loc[i, uncleaned_name] = np.random.choice(v)

In [219]:
titanic[uncleaned_name].value_counts()

male          375
female        200
Mr.            68
facet          63
monsieur       51
femme          27
kobiet         25
m ale          20
mujer          19
kobieta        18
dziewczyna     16
famale          9
Name: uncleaned, dtype: int64

In [220]:
uncleaned = titanic.groupby(uncleaned_name).agg({'PassengerId': 'count'}).reset_index()

In [221]:
# introduce mess in the data
sex = uncleaned[uncleaned_name]

In [222]:
# pretrained model for multi language problems
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased')
sentence_embeddings = sbert_model.encode(sex)

In [223]:
# business knowledge about number of instances in a group
pca = PCA(n_components=1)
pca_comp = pca.fit_transform(pd.DataFrame(sentence_embeddings))

In [224]:
# mapping to new vectors
titanic_ext = titanic.merge(pd.DataFrame({uncleaned_name: sex, 'pca': [el[0] for el in pca_comp]}))

In [225]:
# sanity check
# titanic_ext.groupby('Sex').agg({'pca': 'mean'})

In [226]:
rf = RandomForestClassifier()

In [227]:
y = titanic_ext['Survived']

In [228]:
X = titanic_ext[['pca']]
rf.fit(X, y)
accuracy_score(y, rf.predict(X))

0.7867564534231201

In [229]:
X = pd.get_dummies(titanic_ext[['Sex']], drop_first=True)
rf.fit(X, y)
accuracy_score(y, rf.predict(X))

0.7867564534231201