### Instruction

To obtain the KDD Cup 2009 Appetency, Churn and Upselling datasets used for the algorithm comparison:

1) Download the `orange_small_train.data.zip` file from http://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data and extract the file `orange_small_train.data`. This file contains the features shared by all three datasets.

2) Download the label files:
* `orange_small_train_appetency.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_appetency.labels  
* `orange_small_train_churn.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_churn.labels
* `orange_small_train_upselling.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_upselling.labels

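3) Put the files in the same directory as this notebook. The `appetency`, `churn` and `upselling` folders with the `stratified_train_idx_<dataset>.txt` and `stratified_test_idx_<dataset>.txt` index files must be in that directory too, because the last cell reads them via relative paths.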

4) Run all the cells of this notebook in order to produce the training and test files; they are written to the corresponding dataset folders (`appetency/`, `churn/`, `upselling/`).

In [1]:
# Base names of the train/test pool files; the export cell writes them as
# "<dataset>/<name>" and derives the column-description file from the train name.
resulting_train_filename, resulting_test_filename = "train", "test"

### Preparing the data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Tab-separated feature table, read from the notebook's working directory.
features_path = "./orange_small_train.data"
data = pd.read_csv(features_path, sep="\t")

In [4]:
data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1526.0,7,,,,...,oslk,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,
1,,,,,,525.0,0,,,,...,oslk,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,
2,,,,,,5236.0,7,,,,...,Al6ZaUT,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,
3,,,,,,,0,,,,...,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
4,,,,,,1029.0,7,,,,...,oslk,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,


In [5]:
data.shape

(50000, 230)

### Preparing categorical features

In [6]:
def to_float_str(element):
    """Normalise a value to the string form of its float, when it has one.

    Values that ``float()`` accepts (numbers, numeric strings) are returned as
    ``str(float(element))``, so e.g. ``"7"`` and ``7`` both become ``"7.0"``.
    Anything else is returned unchanged.

    Args:
        element: the value to normalise.

    Returns:
        The float-formatted string, or ``element`` itself when it cannot be
        converted to a float.
    """
    try:
        return str(float(element))
    except (TypeError, ValueError):
        # ValueError: a string that does not spell a number (e.g. "oslk", "?").
        # TypeError: a type float() cannot handle at all (e.g. None); pass it
        # through unchanged as well instead of aborting the whole column.
        return element

In [7]:
# 0-based positions (into data.columns) of the columns handled as categorical:
# every position from 190 to 228 except 208. Positions outside this set
# (including 208 and 229) go through the numerical preparation below.
categorical_features = set(range(190, 229)) - {208}

In [8]:
# Replace missing categorical values with the "?" placeholder and give
# numeric-looking categories one canonical spelling via to_float_str.
for i in categorical_features:
    column = data.columns[i]
    # Assign the result back instead of calling fillna(..., inplace=True) on
    # data[column]: the in-place call operates on an intermediate Series, which
    # pandas warns about and which leaves `data` untouched under Copy-on-Write.
    data[column] = data[column].fillna("?").apply(to_float_str)

### Preparing numerical features

In [9]:
# Names of the non-categorical columns that contain at least one missing value.
columns_to_impute = [
    name
    for position, name in enumerate(data.columns)
    if position not in categorical_features and data[name].isnull().any()
]

In [10]:
len(columns_to_impute)

189

In [11]:
# For every column with gaps: add a 0/1 "<name>_imputed" indicator column that
# marks the originally missing rows, then fill the gaps with 0.
for column_name in columns_to_impute:
    was_missing = data[column_name].isnull()
    data[column_name + "_imputed"] = was_missing.astype(float)
    # Assign back rather than fillna(0, inplace=True) on data[column_name]:
    # the chained in-place call acts on an intermediate Series, triggers a
    # pandas warning and does not modify `data` under Copy-on-Write.
    data[column_name] = data[column_name].fillna(0)

In [12]:
# Cast every column whose position is not in categorical_features to float.
for position, name in enumerate(data.columns):
    if position in categorical_features:
        continue  # categorical columns keep their string values
    data[name] = data[name].astype(float)

In [13]:
data.shape

(50000, 419)

### Preparing train/test split

In [14]:
def prepare_pool(data, labels, filename):
    """Write a tab-separated pool file with the label in the first column.

    Each row of ``data`` becomes one line: ``str()`` of the label at the same
    position in ``labels``, followed by ``str()`` of every feature value, all
    joined with tabs and terminated by a newline.

    Args:
        data: feature table; rows are taken positionally from ``data.values``.
        labels: labels; taken positionally from ``labels.values``.
        filename: path of the file to create (overwritten if it exists).
    """
    feature_rows = data.values
    targets = labels.values
    row_count = data.shape[0]

    def format_row(row_number):
        # Label first, then the features of the same positional row.
        cells = [str(targets[row_number])]
        cells.extend(str(value) for value in feature_rows[row_number])
        return "\t".join(cells) + "\n"

    with open(filename, "w") as fout:
        fout.writelines(format_row(row_number) for row_number in range(row_count))

In [15]:
# For each task: read its labels, apply the stored train/test row indices,
# write both pools and the column-description (.cd) file next to the train pool.
for dataset in ["appetency", "churn", "upselling"]:
    labels_path = "./orange_small_train_" + dataset + ".labels"
    # The unary minus flips the sign of every label read from the file.
    target = -pd.read_csv(labels_path, header=None)[0]

    # Row positions of the split, one per line, read from the dataset folder.
    train_idx = pd.read_csv(dataset + "/stratified_train_idx_" + dataset + ".txt", header=None)
    test_idx = pd.read_csv(dataset + "/stratified_test_idx_" + dataset + ".txt", header=None)

    Xtrain, Ytrain = data.iloc[train_idx[0]], target.iloc[train_idx[0]]
    Xtest, Ytest = data.iloc[test_idx[0]], target.iloc[test_idx[0]]

    train_path = dataset + "/" + resulting_train_filename
    test_path = dataset + "/" + resulting_test_filename
    prepare_pool(Xtrain, Ytrain, train_path)
    prepare_pool(Xtest, Ytest, test_path)

    with open(train_path + '.cd', 'w') as fout:
        # Column 0 of the pool is the label, so every feature position is
        # shifted by one in the column description.
        fout.write('0\tTarget\n')
        fout.writelines(
            '{}\tCateg\n'.format(cat_f_id + 1)
            for cat_f_id in sorted(categorical_features)
        )