We one-hot encode the full data and then fit <b>TruncatedSVD</b> from scikit-learn on
the sparse matrix built from the training + validation data. In this way, we reduce the
high-dimensional sparse matrix to 120 features and then fit a random forest classifier.

In [12]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from scipy import sparse
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn import decomposition

In [10]:
# Load the training CSV (it already carries a `kfold` column, as the preview
# below shows) so the columns can be inspected before modelling.
df_ = pd.read_csv("input/cat_train_folds.csv")
# Last expression of the cell: displays the first five rows in the notebook.
df_.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,kfold
0,436027,0.0,0.0,0.0,T,Y,Blue,Trapezoid,Lion,Russia,...,1.0,Contributor,Boiling Hot,n,S,lS,3.0,8.0,1,0
1,498631,0.0,0.0,0.0,F,Y,Red,Trapezoid,Dog,Costa Rica,...,3.0,Novice,Warm,o,X,iS,2.0,11.0,0,0
2,372327,0.0,0.0,0.0,F,N,Blue,Triangle,Dog,India,...,3.0,Master,Hot,o,N,OZ,2.0,3.0,0,0
3,570356,0.0,0.0,1.0,T,N,Blue,Polygon,Hamster,Russia,...,3.0,Contributor,Lava Hot,h,Y,aA,4.0,3.0,0,0
4,368020,0.0,0.0,0.0,F,Y,Green,Polygon,Hamster,Finland,...,1.0,Novice,Warm,i,M,zc,7.0,11.0,0,0


In [11]:
def run(fold):
    """Train and evaluate a random forest on SVD-reduced one-hot features.

    Reads ``input/cat_train_folds.csv``, treats every feature column as
    categorical, one-hot encodes the features, compresses the sparse
    encoding to 120 components with ``TruncatedSVD`` and fits a
    ``RandomForestClassifier`` on the rows whose ``kfold`` differs from
    ``fold``. The rows whose ``kfold`` equals ``fold`` are used for
    validation, and the resulting ROC AUC is printed.

    Both the encoder and the SVD are fitted on training + validation rows
    together (features only, never the target), so that categories seen
    only in the validation fold are still known to the encoder.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.

    Returns:
        None. The fold number and its AUC are printed to stdout.
    """
    df = pd.read_csv("input/cat_train_folds.csv")

    # "Unnamed: 0" appears to be a row index that was written out with the
    # CSV; being unique per row, one-hot encoding it would add one useless
    # column per sample, so it is excluded alongside the id/target/fold
    # bookkeeping columns.
    non_features = ("Unnamed: 0", "id", "target", "kfold")
    features = [f for f in df.columns if f not in non_features]

    for col in features:
        # Fill missing values BEFORE casting to str: astype(str) turns NaN
        # into the literal string "nan", after which fillna finds nothing
        # left to fill. Plain column assignment (instead of .loc[:, col])
        # replaces the column outright rather than trying to write strings
        # into the existing numeric dtype.
        df[col] = df[col].fillna("NONE").astype(str)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Fit the encoder on all rows so every category gets a column.
    ohe = preprocessing.OneHotEncoder()
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0,
    )
    ohe.fit(full_data)

    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    # Reduce the wide sparse matrix to 120 dense components; the SVD is
    # fitted on the stacked train + validation matrix.
    svd = decomposition.TruncatedSVD(n_components=120)
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    x_train = svd.transform(x_train)
    x_valid = svd.transform(x_valid)

    # n_jobs=-1 lets the forest use all available cores.
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    model.fit(x_train, df_train.target.values)

    # Column 1 of predict_proba is the probability of the positive class,
    # which is what roc_auc_score expects as the score.
    valid_preds = model.predict_proba(x_valid)[:, 1]

    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    print(f"Fold = {fold}, AUC = {auc}")
    
if __name__ == "__main__":
    # Evaluate every fold in order; each call prints its own AUC line.
    for fold_ in (0, 1, 2, 3, 4):
        run(fold_)

Fold = 0, AUC = 0.7086267182692589
Fold = 1, AUC = 0.7073063594958254
Fold = 2, AUC = 0.7061706539147932
Fold = 3, AUC = 0.7081086600985022
Fold = 4, AUC = 0.7066912838008734
