In [1]:
from category_encoders import TargetEncoder
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import murmurhash3_32

In [2]:
# No separate test set is assumed to exist, so this single CSV is the only data source.
df = pd.read_csv(filepath_or_buffer="datasets/titanic_train.csv")

In [3]:
# Seed NumPy's global RNG, then hold out 30% of the rows as a validation set.
np.random.seed(10)
raw_train_data, raw_validation_data = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
)

In [4]:
# Object-dtype columns are treated as categorical; everything else counts as numeric.
df_categories = df.select_dtypes(include='object').astype('category')
category_column_names = set(df_categories.columns)

all_column_names = set(df.columns)
number_column_names = all_column_names.difference(category_column_names)

# Show both groups, one set per line.
print(number_column_names, category_column_names, sep="\n")

{'Passenger_Id', 'Number_of_Parents_or_Children', 'Passenger_Fare', 'Age', 'Survived', 'Number_of_Siblings_or_Spouses'}
{'Name', 'Port_of_Embarkation', 'Passenger_Class', 'Sex', 'Cabin', 'Ticket_Number'}


In [5]:
target_column_name = 'Survived'
# Columns that are never fed to the model: Name, Passenger_Id and the label itself.
fields_to_ignore = ["Name", "Passenger_Id"] + [target_column_name]
# Iterating a set of strings has no stable order across interpreter sessions
# (hash randomization), so sort the names to keep the feature order -- and any
# positional column index derived from it -- reproducible between runs.
numeric_features = sorted(number_column_names - set(fields_to_ignore))
categorical_features = sorted(category_column_names - set(fields_to_ignore))

In [6]:
# Baseline model: one-hot encode every categorical column.
# Missing categories are replaced by the literal 'missing' before encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)
# Numeric columns: impute (SimpleImputer defaults), then standardize.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

# Pipeline.fit returns the fitted pipeline itself, so build and fit in one expression.
classifier = make_pipeline(preprocessor, LogisticRegression()).fit(
    raw_train_data.drop(columns=fields_to_ignore),
    raw_train_data[target_column_name],
)
score = classifier.score(
    raw_validation_data.drop(columns=fields_to_ignore),
    raw_validation_data[target_column_name],
)
print("model score: %.3f" % score)

model score: 0.836


## Min Hash Encoding

In [7]:
class MinHashEncoder(BaseEstimator, TransformerMixin):
    """
    Min-hash encoder applied to the character n-gram decomposition of strings.

    Every input string is split into character n-grams; each n-gram is hashed
    with ``n_components`` differently seeded ``murmurhash3_32`` calls, and the
    per-seed minimum over all n-grams, divided by ``2**32 - 1``, becomes one
    output feature.

    Parameters
    ----------
    n_components : int
        Number of hash seeds, i.e. number of output columns per input value.
    ngram_range : tuple of (int, int), default=(2, 4)
        Inclusive lower and upper bound of the n-gram sizes.

    Notes
    -----
    ``fit`` and ``transform`` iterate over ``X`` element by element, call
    ``.lower()`` on each element and use it as a dictionary key, so ``X`` must
    be a one-dimensional sequence of strings without missing values.
    """

    def __init__(self, n_components, ngram_range=(2, 4)):
        self.ngram_range = ngram_range
        self.n_components = n_components

    def get_unique_ngrams(self, string, ngram_range):
        """
        Return the distinct character n-grams of ``string``.

        The string is lower-cased, runs of whitespace are collapsed to single
        spaces, and one space is added on each side before the n-grams are
        extracted.

        Parameters
        ----------
        string : str
            Text to decompose.
        ngram_range : tuple of (int, int)
            Inclusive lower and upper bound of the n-gram sizes.

        Returns
        -------
        list of tuple of str
            One tuple of characters per distinct n-gram. Duplicates are removed
            within each n-gram size; the order is unspecified.
        """
        padded = ' ' + ' '.join(string.lower().split()) + ' '
        ngram_list = []
        for n in range(ngram_range[0], ngram_range[1] + 1):
            # Zipping n successively shifted copies yields every window of n characters.
            shifted_views = [padded[i:] for i in range(n)]
            ngram_list += list(set(zip(*shifted_views)))
        return ngram_list

    def minhash(self, string, n_components, ngram_range):
        """
        Compute the min-hash vector of ``string``.

        Parameters
        ----------
        string : str
            Text to encode.
        n_components : int
            Number of hash seeds (length of the returned vector).
        ngram_range : tuple of (int, int)
            Inclusive lower and upper bound of the n-gram sizes.

        Returns
        -------
        numpy.ndarray of shape (n_components,)
            Per-seed minimum hash over all n-grams, divided by ``2**32 - 1``.
            If neither ``string`` nor the ``' Na '`` fallback yields any n-gram
            for the given range, the entries stay at ``inf``.
        """
        # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
        min_hashes = np.full(n_components, np.inf)
        # Honour the ngram_range argument instead of silently using self.ngram_range.
        grams = self.get_unique_ngrams(string, ngram_range)
        if not grams:
            grams = self.get_unique_ngrams(' Na ', ngram_range)
        for gram in grams:
            text = ''.join(gram)
            hash_array = np.array([
                murmurhash3_32(text, seed=d, positive=True)
                for d in range(n_components)])
            min_hashes = np.minimum(min_hashes, hash_array)
        return min_hashes / (2**32 - 1)

    def _cached_minhash(self, x):
        """Return the min-hash vector of ``x``, computing and caching it on first use."""
        if x not in self.hash_dict:
            self.hash_dict[x] = self.minhash(
                x, n_components=self.n_components,
                ngram_range=self.ngram_range)
        return self.hash_dict[x]

    def fit(self, X, y=None):
        """
        Reset the cache and pre-compute the encoding of every distinct value in ``X``.

        Parameters
        ----------
        X : iterable of str
            Values to encode.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.hash_dict = {}
        for x in X:
            self._cached_minhash(x)
        return self

    def transform(self, X):
        """
        Encode every value of ``X`` as a row of min-hash features.

        Values not seen during ``fit`` are hashed on the fly and added to the
        cache, so repeated calls do not recompute them.

        Parameters
        ----------
        X : sequence of str
            Values to encode; must support ``len``.

        Returns
        -------
        numpy.ndarray of shape (len(X), n_components)
        """
        X_out = np.zeros((len(X), self.n_components))
        for i, x in enumerate(X):
            X_out[i, :] = self._cached_minhash(x)
        return X_out


In [8]:
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# True:  encode every categorical column with the same 20-component MinHashEncoder.
# False: use the hand-tuned number of components per column below.
USE_UNIFORM_MINHASH = True

if USE_UNIFORM_MINHASH:
    minhash_transformer = make_column_transformer(
        *(
            (MinHashEncoder(20), column_position)
            for column_position in range(len(categorical_features))
        )
    )
else:
    # The preceding SimpleImputer hands over a plain array, so columns must be
    # addressed by position. Look each position up by name instead of
    # hard-coding it, so the mapping stays correct whatever order
    # categorical_features happens to be in.
    # NOTE(review): Passenger_Class is not listed here, so this branch drops it
    # (make_column_transformer defaults to remainder='drop') -- confirm intended.
    minhash_transformer = make_column_transformer(
        (MinHashEncoder(28), categorical_features.index('Ticket_Number')),
        (MinHashEncoder(2), categorical_features.index('Port_of_Embarkation')),
        (MinHashEncoder(1), categorical_features.index('Sex')),
        (MinHashEncoder(20), categorical_features.index('Cabin')),
    )
# Missing categories become the literal '_' so that MinHashEncoder only ever sees strings.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='_'),
    minhash_transformer,
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

classifier = make_pipeline(preprocessor, LogisticRegression())
classifier.fit(raw_train_data.drop(columns=fields_to_ignore), raw_train_data[target_column_name])
score = classifier.score(raw_validation_data.drop(columns=fields_to_ignore), raw_validation_data[target_column_name])
print("model score: %.3f" % score)

model score: 0.836


## Target Encoding

In [9]:
# Target-encoding variant: categorical columns go straight into TargetEncoder,
# numeric columns are imputed (SimpleImputer defaults) and standardized.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (TargetEncoder(), categorical_features),
)

# Pipeline.fit returns the fitted pipeline itself, so build and fit in one expression.
classifier = make_pipeline(preprocessor, LogisticRegression()).fit(
    raw_train_data.drop(columns=fields_to_ignore),
    raw_train_data[target_column_name],
)
score = classifier.score(
    raw_validation_data.drop(columns=fields_to_ignore),
    raw_validation_data[target_column_name],
)
print("model score: %.3f" % score)

model score: 0.802
