### Instructions

To obtain the Kick dataset used for the algorithm comparison:

1) Download `training.csv` file from "Don't Get Kicked!" competition on Kaggle: https://www.kaggle.com/c/DontGetKicked/data (you can download .zip archive and extract the file from it).

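2) Put it in the same directory as this notebook, alongside `stratified_train_idx.txt` and `stratified_test_idx.txt` (both are read in the "Preparing train/test split" section below).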

3) Run all the cells of this notebook successively to produce files for training and testing.

In [1]:
# Output paths (relative to the working directory) of the pool files written below.
# The column-description file is written to resulting_train_filename + ".cd".
resulting_train_filename = "train"
resulting_test_filename = "test"

### Preparing the data

In [2]:
import pandas as pd
import re

In [4]:
# Load the raw competition file; the relative path means it must sit in the
# notebook's working directory.
data = pd.read_csv("training.csv")

In [5]:
data.head()

Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,12/7/2009,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,...,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,2,0,12/7/2009,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,...,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,3,0,12/7/2009,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,...,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,4,0,12/7/2009,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,...,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,5,0,12/7/2009,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,...,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


In [6]:
# Encode the label as +1.0 when IsBadBuy == 0 and -1.0 for every other value.
is_good_buy = data["IsBadBuy"] == 0
target = is_good_buy.map({True: 1.0, False: -1.0})

In [7]:
# Parse the purchase-date strings once and reuse the index for every derived
# feature, instead of re-parsing the whole column for each attribute.
purch_dates = pd.DatetimeIndex(data['PurchDate'])
data["PurchYear"] = purch_dates.year
data["PurchMonth"] = purch_dates.month
data["PurchDay"] = purch_dates.day
data["PurchWeekday"] = purch_dates.weekday  # pandas convention: Monday == 0

In [7]:
# Remove the row id, the label (already captured in `target`) and the raw date
# string (already expanded into the Purch* columns).
# NOTE: re-running this cell without reloading `data` raises KeyError.
data = data.drop(["RefId", "IsBadBuy", "PurchDate"], axis=1)

In [8]:
data.shape

(72983, 35)

### Preparing categorical features

In [9]:
# Positional indices into `data.columns` (as they stand after the drop above)
# of the columns that are treated as categorical. Any column whose position is
# not listed here is handled as numeric by the cells below.
categorical_features = {
    0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
    23, 24, 25, 26, 27, 29,
    31, 32, 33, 34,
}

In [10]:
def clean_string(s):
    """Return ``str(s)`` with every run of non-alphanumeric characters replaced by one underscore.

    Any value is accepted because it is stringified first; a float NaN, for
    example, becomes the literal text "nan".
    """
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub("_", str(s))

# Normalise every categorical column to underscore-separated alphanumeric
# strings; the columns are selected by position, not by name.
for col_position in categorical_features:
    col_name = data.columns[col_position]
    data[col_name] = data[col_name].apply(clean_string)

### Preparing numerical features

In [11]:
# Names of the non-categorical columns that contain at least one missing value.
columns_to_impute = [
    column
    for position, column in enumerate(data.columns)
    if position not in categorical_features and data[column].isnull().any()
]

In [12]:
columns_to_impute

['MMRAcquisitionAuctionAveragePrice',
 'MMRAcquisitionAuctionCleanPrice',
 'MMRAcquisitionRetailAveragePrice',
 'MMRAcquisitonRetailCleanPrice',
 'MMRCurrentAuctionAveragePrice',
 'MMRCurrentAuctionCleanPrice',
 'MMRCurrentRetailAveragePrice',
 'MMRCurrentRetailCleanPrice']

In [13]:
# For each numeric column with gaps: record where the gaps were in a companion
# "<name>_imputed" indicator column (1.0 = value was missing), then replace the
# missing values with 0.
for column_name in columns_to_impute:
    data[column_name + "_imputed"] = pd.isnull(data[column_name]).astype(float)
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the `data[column_name]` selection: that chained in-place call operates on
    # a temporary object and does not update `data` under pandas Copy-on-Write.
    data[column_name] = data[column_name].fillna(0)

In [14]:
# Cast every column whose position is not in `categorical_features` to float.
# This includes the "_imputed" indicator columns appended by the imputation step.
for position, name in enumerate(data.columns):
    if position in categorical_features:
        continue
    data[name] = data[name].astype(float)

In [15]:
data.shape

(72983, 43)

### Preparing train/test split

In [16]:
# Load the predefined split. Both files are read without a header row, so the
# row positions end up in column 0 of each frame.
train_idx, test_idx = (
    pd.read_csv(idx_path, header=None)
    for idx_path in ("stratified_train_idx.txt", "stratified_test_idx.txt")
)

In [17]:
# Select features and labels by row position using column 0 of the index frames.
train_rows = train_idx[0]
test_rows = test_idx[0]
Xtrain, Ytrain = data.iloc[train_rows], target.iloc[train_rows]
Xtest, Ytest = data.iloc[test_rows], target.iloc[test_rows]

In [18]:
# creating file with features
def prepare_pool(data, labels, filename):
    """Write a tab-separated pool file with one line per row of ``data``.

    Each line holds the row's label first, followed by the row's feature
    values in column order; every value is rendered with ``str``.

    Parameters:
        data: object exposing ``.values`` as a 2-D array of features
            (a pandas DataFrame in this notebook).
        labels: object exposing ``.values`` with one label per row of
            ``data``, matched by position.
        filename: path of the file to create; it is overwritten if present.
    """
    feature_rows = data.values
    label_values = labels.values
    with open(filename, "w") as fout:
        for row_number, row in enumerate(feature_rows):
            fields = "\t".join(str(value) for value in row)
            fout.write(str(label_values[row_number]) + "\t" + fields + "\n")

In [19]:
# Write the train pool and the test pool to their configured output paths.
for pool_features, pool_labels, pool_path in (
    (Xtrain, Ytrain, resulting_train_filename),
    (Xtest, Ytest, resulting_test_filename),
):
    prepare_pool(pool_features, pool_labels, pool_path)

In [20]:
# Write the column-description file for the pools. Column 0 of each pool line
# is the label (prepare_pool writes it first), so every categorical feature
# index is shifted by one to get its column number in the pool file.
cd_lines = ['0\tTarget\n']
cd_lines.extend(
    '{}\tCateg\n'.format(cat_f_id + 1)
    for cat_f_id in sorted(categorical_features)
)
with open(resulting_train_filename + '.cd', 'w') as fout:
    fout.writelines(cd_lines)