# Create train, validation and test sets for libffm


https://www.kaggle.com/scirpus/libffm-generator-lb-280

In [1]:
import math
import numpy as np
import pandas as pd
from numba import jit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    """Compute the Gini score of predictions against true targets.

    The true targets are reordered by ascending predicted value and then
    walked from the highest prediction down to the lowest.

    Arguments:
        y_true {np.array} -- True target values
            (NOTE(review): the arithmetic assumes binary 0/1 labels -- confirm)
        y_prob {np.array} -- Predicted target values

    Returns:
        gini {float} -- calculated gini score
    """

    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_prob)]
    total = len(labels)

    positives = 0       # running sum of the labels visited so far
    weighted = 0        # sum of label * (non-positives ranked above it)
    negatives_seen = 0  # running sum of (1 - label) visited so far

    pos = total - 1
    while pos >= 0:
        label = labels[pos]
        positives += label
        weighted += label * negatives_seen
        negatives_seen += 1 - label
        pos -= 1

    # The denominator is zero when every label is identical.
    return 1 - 2 * weighted / (positives * (total - positives))


def stratify_split(df, target_col, split_size, cols_to_drop=None):
    """Split a dataframe into stratified train and test parts.

    Optionally drops the given columns, separates the target column from
    the remaining columns and splits both with sklearn's
    StratifiedShuffleSplit (fixed random_state of 1001).

    Arguments:
        df {pd.DataFrame} -- dataframe object to split
        target_col {string} -- name of the target column
        split_size {float} -- passed to the splitter as ``test_size``

    Keyword Arguments:
        cols_to_drop {list} -- list of columns to drop (default: {None})

    Returns:
        tuple -- (X_train, y_train, X_test, y_test)
    """

    if cols_to_drop is not None:
        df = df.drop(cols_to_drop, axis=1)

    X = df.loc[:, df.columns != target_col]
    y = df[target_col]

    # sklearn stratify split
    sss = StratifiedShuffleSplit(test_size=split_size, random_state=1001)

    # The splitter is built without n_splits, so it may yield several
    # splits; as before, only the last one produced is used.
    train_index = test_index = None
    for train_index, test_index in sss.split(X, y):
        pass

    # The splitter yields positional indices, so both X and y must be
    # sliced with .iloc. Plain y[...] is label-based and only works when
    # the index happens to be 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    return (X_train, y_train, X_test, y_test)

In [3]:
# Paths of the train and test CSV files, relative to the working directory.
TRAIN_PATH = 'porto_data/train.csv'
TEST_PATH = 'porto_data/test.csv'

In [None]:
# Load both CSV files into DataFrames.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [None]:
# Number of training rows; remembered before `train` is rebound below.
n_train = train.shape[0]

# Give test a placeholder target of 0 and stack it under train, so the
# column drops and the binning below are applied to both at once.
test.insert(1, 'target', 0)
x = pd.concat([train, test])
x = x.reset_index(drop=True)
unwanted = x.columns[x.columns.str.startswith('ps_calc_')]
x.drop(unwanted, inplace=True, axis=1)


# Skip the first two columns (target was inserted at position 1 above;
# position 0 is presumably 'id' -- confirm against the CSV header).
features = x.columns[2:]
categories = []
for c in features:
    # Positional slices: a label slice .loc[:n_train] is end-inclusive and
    # would count the first test row as part of train.
    trainno = len(x[c].iloc[:n_train].unique())
    testno = len(x[c].iloc[n_train:].unique())
    # Prints the same text on Python 2 and Python 3.
    print("%s %s %s" % (c, trainno, testno))


# Replace each of these columns by the index of its bin (50 bins per
# column), computed over the combined train + test values.
for col in ('ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15'):
    x.loc[:, col] = pd.cut(x[col], 50, labels=False)


# Split the combined frame back into its two parts by position, so train
# holds exactly the first n_train rows and test the rest.
test = x.iloc[n_train:].copy()
train = x.iloc[:n_train].copy()


X_train, y_train, X_val, y_val = stratify_split(train, 'target', 0.2)

In [None]:
# Re-attach the target as the first column of each split.
train = pd.concat([y_train, X_train], axis=1)
val = pd.concat([y_val, X_val], axis=1)

# Remove the 'id' column from all three frames in place.
for frame in (train, val, test):
    frame.drop('id', inplace=True, axis=1)

# No column is treated as numeric; every column after the first one
# (the target) is treated as categorical.
numerics = []
categories = train.columns[1:]



In [None]:
# Feature codes are handed out from a single running counter that starts
# at the number of numeric columns; every new (column, value) pair gets the
# next integer.
currentcode = len(numerics)
catdict = {}    # column name -> 0 (numeric) or 1 (categorical)
catcodes = {}   # column name -> {value: feature code}

for col in numerics:
    catdict[col] = 0
for col in categories:
    catdict[col] = 1


def _write_ffm(frame, path):
    """Write ``frame`` to ``path`` as ``label field:feature:value`` lines.

    Each row becomes one line: the integer 'target', followed by one
    space-separated ``field:feature:value`` triple per column in
    ``catdict``. Numeric columns are written as ``i:i:<value>``;
    categorical columns as ``i:<code>:1``, where ``i`` is the column's
    position in ``catdict`` and ``<code>`` comes from ``catcodes``.

    Unseen categorical values are assigned new codes on the fly, so this
    function updates the module-level ``catcodes`` and ``currentcode``.

    Arguments:
        frame {pd.DataFrame} -- rows to write; must contain 'target' and
            every column named in ``catdict``
        path {string} -- output file path (overwritten if it exists)
    """
    global currentcode

    # The field order is fixed once per file instead of once per row.
    fields = list(catdict)

    with open(path, "w") as text_file:
        for n in range(frame.shape[0]):
            if n % 100000 == 0:
                # Progress message; prints 'Row <n>' on Python 2 and 3.
                print('Row %d' % n)

            datarow = frame.iloc[n].to_dict()
            parts = [str(int(datarow['target']))]

            for i, col in enumerate(fields):
                value = datarow[col]
                if catdict[col] == 0:
                    parts.append(str(i) + ":" + str(i) + ":" + str(value))
                else:
                    codes = catcodes.setdefault(col, {})
                    if value not in codes:
                        currentcode += 1
                        codes[value] = currentcode
                    parts.append(str(i) + ":" + str(int(codes[value])) + ":1")

            text_file.write(" ".join(parts) + "\n")


# Order matters: codes are assigned in first-seen order, so train is
# written first, then validation, then test, all sharing one code table.
_write_ffm(train, "alltrainffm_2.txt")
_write_ffm(val, "allvalffm_2.txt")
_write_ffm(test, "alltestffm_2.txt")

print("DONE")