In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from random import shuffle
from sklearn import preprocessing
import gc, copy
from sklearn.feature_extraction import FeatureHasher

In [2]:
# Load the train/test CSVs and stack their categorical columns into one frame
# that the encoding methods in this notebook operate on.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
cols = ["MSZoning", "Street", "LotShape", "LandContour", "SaleType"]
# Train rows come first, then test rows; the original row labels are kept.
data = pd.concat([frame[cols] for frame in (df_train, df_test)])

#### Method 1: Encoding to ordinal variables

In [3]:
def encode_to_ordinal_var():
    """Overwrite every column listed in ``cols`` of the global ``data`` frame
    with integer labels.

    A fresh ``LabelEncoder`` is fitted per column on that column's own values,
    and the column is then replaced in place by the encoded labels.
    Returns nothing; the result is the mutated ``data`` frame.
    """
    for column in cols:
        raw_values = list(data[column].values)
        # fit + transform on the same values, done in a single call.
        data[column] = preprocessing.LabelEncoder().fit_transform(raw_values)
# encode_to_ordinal_var()
# data

#### Method 2: One hot encoding (or dummy variables)

In [4]:
# data = pd.get_dummies(data)
# data

#### Method 3: Feature hashing (a.k.a the hashing trick)

In [5]:
def feature_hashing(n_features=5):
    """Replace every column in ``cols`` of the global ``data`` frame with
    ``n_features`` hashed columns.

    For a column ``c`` the new columns are named ``c1`` ... ``c<n_features>``
    and the original column ``c`` is removed from ``data`` in place.

    Args:
        n_features: number of hash buckets (output columns) per input column.
            Defaults to 5, the value that was previously hard-coded.

    Note:
        Because the source columns are dropped, the function cannot be run
        twice on the same frame, and code that reads ``data[cols]`` must run
        before it.
    """
    for c in cols:
        data[c] = data[c].astype('str')
        h = FeatureHasher(n_features=n_features, input_type='string')
        # With input_type='string' every sample must be an iterable of string
        # tokens. Passing the bare string makes the hasher iterate over its
        # characters (and newer scikit-learn versions reject it), so wrap each
        # category value in a one-element list to hash the whole value.
        samples = [[value] for value in data[c].values]
        # Densify once instead of once per output column.
        hashed = h.transform(samples).toarray()
        for i in range(n_features):
            data[c + str(i + 1)] = hashed[:, i]
        # The previous `data.drop(columns=[c], index=1)` discarded its result
        # (so nothing was dropped) and `index=1` targeted row label 1.
        data.drop(columns=[c], inplace=True)
# feature_hashing()

#### Method 4: Cat2Vec

In [6]:
def apply_w2v(sentences, model, num_features):
    """Embed each sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentences: iterable of token sequences (one sequence per row).
        model: trained word2vec model; vectors and vocabulary are read from
            ``model.wv``. An object without a ``wv`` attribute is treated as
            the vector store itself.
        num_features: dimensionality of the word vectors; also the length of
            the zero vector returned for a sentence with no known token.

    Returns:
        ``np.ndarray`` with one row per sentence. Sentences whose tokens are
        all out of vocabulary (or that are empty) map to a zero vector. An
        empty ``sentences`` input yields an array of shape
        ``(0, num_features)``.
    """
    # Look vectors up on the keyed-vectors object rather than on the model:
    # indexing the model directly (`model[word]`) is the deprecated path and
    # is inconsistent with the vocabulary lookup, which already used `.wv`.
    keyed_vectors = getattr(model, "wv", model)

    def _average_word_vectors(words, vectors, vocabulary, num_features):
        # Mean of the vectors of the tokens present in `vocabulary`.
        feature_vector = np.zeros((num_features,), dtype="float64")
        n_words = 0.
        for word in words:
            if word in vocabulary:
                n_words = n_words + 1.
                feature_vector = np.add(feature_vector, vectors[word])

        if n_words:
            feature_vector = np.divide(feature_vector, n_words)
        return feature_vector

    # gensim 4 renamed `index2word` to `index_to_key`; prefer the new name and
    # fall back to the old one so both API generations work.
    vocab_keys = getattr(keyed_vectors, "index_to_key", None)
    if vocab_keys is None:
        vocab_keys = keyed_vectors.index2word
    vocab = set(vocab_keys)

    feats = [_average_word_vectors(s, keyed_vectors, vocab, num_features) for s in sentences]
    if not feats:
        # Keep the result two-dimensional even when there is nothing to embed.
        return np.empty((0, num_features), dtype="float64")
    return np.array(feats)

In [7]:
def gen_cat2vec_sentences(data2):
    """Turn each row of a categorical frame into a "sentence" of tokens.

    Every cell becomes the token ``"<column name> <value>"`` so that equal
    values in different columns stay distinct; missing values are replaced by
    the placeholder ``'unknow'`` first.

    Args:
        data2: DataFrame whose columns are all treated as categorical.
            It is not modified; the work happens on a copy.

    Returns:
        A list with one entry per row, each a list of token strings in
        column order.
    """
    X_w2v = data2.copy()
    for c in list(X_w2v.columns.values):
        as_category = X_w2v[c].fillna('unknow').astype('category')
        prefixed = ["%s %s" % (c, g) for g in as_category.cat.categories]
        # Assigning to `.cat.categories` is not supported by current pandas
        # (and operated on a temporary column lookup); rename_categories
        # returns a new Series that is written back explicitly.
        X_w2v[c] = as_category.cat.rename_categories(prefixed)
    return X_w2v.values.tolist()

In [8]:
# Cat2Vec hyper-parameters, both derived from the number of categorical columns.
n_cat2vec_feature = len(cols)             # dimensionality of each Cat2Vec embedding
n_cat2vec_window = 2 * n_cat2vec_feature  # Word2Vec context window size

In [9]:
def fit_cat2vec_model(frac=0.6, seed=None):
    """Train a Word2Vec model on shuffled category "sentences".

    A random fraction of the rows of ``data[cols]`` is converted to token
    sentences with ``gen_cat2vec_sentences``. The tokens of every sentence are
    shuffled, so the context window does not depend on column order. A
    ``Word2Vec`` model is then trained with ``n_cat2vec_feature`` dimensions
    and a window of ``n_cat2vec_window``.

    Args:
        frac: fraction of rows sampled for training. Defaults to 0.6, the
            value that was previously hard-coded.
        seed: optional integer seed. When given, it seeds the row sampling,
            the token shuffling, and Word2Vec itself, and training is limited
            to one worker thread (gensim documents a single worker as a
            requirement for reproducible runs). ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        The trained ``Word2Vec`` model.
    """
    import inspect
    from random import Random

    sampled = data.loc[:, cols].sample(frac=frac, random_state=seed)
    X_w2v = gen_cat2vec_sentences(sampled)

    shuffle_tokens = shuffle if seed is None else Random(seed).shuffle
    for sentence in X_w2v:
        shuffle_tokens(sentence)

    # gensim 4 renamed the `size` keyword to `vector_size`; use whichever name
    # the installed Word2Vec constructor accepts.
    ctor_params = inspect.signature(Word2Vec.__init__).parameters
    size_kw = "vector_size" if "vector_size" in ctor_params else "size"
    w2v_kwargs = {size_kw: n_cat2vec_feature, "window": n_cat2vec_window}
    if seed is not None:
        w2v_kwargs.update(seed=seed, workers=1)

    model = Word2Vec(X_w2v, **w2v_kwargs)
    return model

# Train the Cat2Vec model when this cell runs and keep it in a notebook-level variable.
c2v_model = fit_cat2vec_model()

In [10]:
# Embed every row of the full categorical frame (not only the sampled
# training rows) by averaging the fitted model's vectors for its tokens.
c2v_matrix = apply_w2v(gen_cat2vec_sentences(data.loc[:,cols]), c2v_model, n_cat2vec_feature)
# Bare expression so the array is shown as the cell output.
c2v_matrix

  


array([[-0.34983618, -0.10281767,  0.99092593, -0.12472551, -0.73835599],
       [-0.34983618, -0.10281767,  0.99092593, -0.12472551, -0.73835599],
       [-0.34105515, -0.07095265,  0.96772071, -0.13078062, -0.7385518 ],
       ...,
       [-0.34983618, -0.10281767,  0.99092593, -0.12472551, -0.73835599],
       [-0.34983618, -0.10281767,  0.99092593, -0.12472551, -0.73835599],
       [-0.34983618, -0.10281767,  0.99092593, -0.12472551, -0.73835599]])