In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017
SHIFT = 5
FOLDS = 5

In [2]:
# Get data
train = pd.read_csv("raw_data/train.csv")
test = pd.read_csv("raw_data/test.csv")

display(train.shape)
display(train.head(2))
display(test.shape)
display(test.head(2))


(4209, 378)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0


(4209, 377)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0


In [3]:
# Save test IDs for predictions
test_ids = test.ID

In [4]:
# Remove outliers?

'''# METHOD 1 : remove the one biggest outlier
oldNbRows = train.shape[0]
train = train[train.y < 250]
print("Removing the most extreme outliers : " + str(oldNbRows - train.shape[0]))
print("train : " + str(train.shape))'''

# METHOD 2 : do nothing


'# METHOD 1 : remove the one biggest outlier\noldNbRows = train.shape[0]\ntrain = train[train.y < 250]\nprint("Removing the most extreme outliers : " + str(oldNbRows - train.shape[0]))\nprint("train : " + str(train.shape))'

In [5]:
# Handle duplicate rows
cols_to_groupby = [k for k in train.columns if k not in ["ID", "y"]] 
cols_to_apply = cols_to_groupby + ["y"]

'''# METHOD 1 : take mean of duplicate rows
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).mean()
train.columns = cols_to_apply'''

'''# METHOD 2 : take median of duplicate rows
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).median()
train.columns = cols_to_apply'''

'''# METHOD 3 : keep the row with the lowest y 
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).min()
train.columns = cols_to_apply'''

# METHOD 4 : do nothing

display(train.iloc[10:15])

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
10,31,102.09,h,r,r,f,d,f,h,p,...,0,0,0,0,0,0,0,0,0,0
11,32,98.12,al,r,e,f,d,f,h,o,...,0,0,0,0,0,0,0,0,0,0
12,34,82.62,s,b,ai,c,d,f,g,m,...,0,0,1,0,0,0,0,0,0,0
13,36,94.12,al,r,e,f,d,j,h,o,...,0,0,0,0,0,0,0,0,0,0
14,37,99.15,o,s,as,e,d,j,g,m,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# Add median/mean/min of y for X0-X8

# METHOD 1 
mean_x0 = train[["X0", "y"]].groupby(["X0"], as_index = False).mean()
mean_x1 = train[["X1", "y"]].groupby(["X1"], as_index = False).mean()
mean_x2 = train[["X2", "y"]].groupby(["X2"], as_index = False).mean()
mean_x3 = train[["X3", "y"]].groupby(["X3"], as_index = False).mean()
mean_x4 = train[["X4", "y"]].groupby(["X4"], as_index = False).mean()
mean_x5 = train[["X5", "y"]].groupby(["X5"], as_index = False).mean()
mean_x6 = train[["X6", "y"]].groupby(["X6"], as_index = False).mean()
mean_x8 = train[["X8", "y"]].groupby(["X8"], as_index = False).mean()

'''# METHOD 2
mean_x0 = train[["X0", "y"]].groupby(["X0"], as_index = False).median()
mean_x1 = train[["X1", "y"]].groupby(["X1"], as_index = False).median()
mean_x2 = train[["X2", "y"]].groupby(["X2"], as_index = False).median()
mean_x3 = train[["X3", "y"]].groupby(["X3"], as_index = False).median()
mean_x4 = train[["X4", "y"]].groupby(["X4"], as_index = False).median()
mean_x5 = train[["X5", "y"]].groupby(["X5"], as_index = False).median()
mean_x6 = train[["X6", "y"]].groupby(["X6"], as_index = False).median()
mean_x8 = train[["X8", "y"]].groupby(["X8"], as_index = False).median()'''

'''# METHOD 3
mean_x0 = train[["X0", "y"]].groupby(["X0"], as_index = False).min()
mean_x1 = train[["X1", "y"]].groupby(["X1"], as_index = False).min()
mean_x2 = train[["X2", "y"]].groupby(["X2"], as_index = False).min()
mean_x3 = train[["X3", "y"]].groupby(["X3"], as_index = False).min()
mean_x4 = train[["X4", "y"]].groupby(["X4"], as_index = False).min()
mean_x5 = train[["X5", "y"]].groupby(["X5"], as_index = False).min()
mean_x6 = train[["X6", "y"]].groupby(["X6"], as_index = False).min()
mean_x8 = train[["X8", "y"]].groupby(["X8"], as_index = False).min()'''

mean_x0.columns = ["X0", "mean_x0"]
mean_x1.columns = ["X1", "mean_x1"]
mean_x2.columns = ["X2", "mean_x2"]
mean_x3.columns = ["X3", "mean_x3"]
mean_x4.columns = ["X4", "mean_x4"]
mean_x5.columns = ["X5", "mean_x5"]
mean_x6.columns = ["X6", "mean_x6"]
mean_x8.columns = ["X8", "mean_x8"]
train = pd.merge(train, mean_x0, on = "X0", how = "outer")
train = pd.merge(train, mean_x1, on = "X1", how = "outer")
train = pd.merge(train, mean_x2, on = "X2", how = "outer")
train = pd.merge(train, mean_x3, on = "X3", how = "outer")
train = pd.merge(train, mean_x4, on = "X4", how = "outer")
train = pd.merge(train, mean_x5, on = "X5", how = "outer")
train = pd.merge(train, mean_x6, on = "X6", how = "outer")
train = pd.merge(train, mean_x8, on = "X8", how = "outer")
test = pd.merge(test, mean_x0, on = "X0", how = "left")
test = pd.merge(test, mean_x1, on = "X1", how = "left")
test = pd.merge(test, mean_x2, on = "X2", how = "left")
test = pd.merge(test, mean_x3, on = "X3", how = "left")
test = pd.merge(test, mean_x4, on = "X4", how = "left")
test = pd.merge(test, mean_x5, on = "X5", how = "left")
test = pd.merge(test, mean_x6, on = "X6", how = "left")
test = pd.merge(test, mean_x8, on = "X8", how = "left")

# METHOD 1
test["mean_x0"].fillna(test["mean_x0"].dropna().mean(), inplace = True)
test["mean_x1"].fillna(test["mean_x1"].dropna().mean(), inplace = True)
test["mean_x2"].fillna(test["mean_x2"].dropna().mean(), inplace = True)
test["mean_x3"].fillna(test["mean_x3"].dropna().mean(), inplace = True)
test["mean_x4"].fillna(test["mean_x4"].dropna().mean(), inplace = True)
test["mean_x5"].fillna(test["mean_x5"].dropna().mean(), inplace = True)
test["mean_x6"].fillna(test["mean_x6"].dropna().mean(), inplace = True)
test["mean_x8"].fillna(test["mean_x8"].dropna().mean(), inplace = True)

'''# METHOD 2
test["mean_x0"].fillna(test["mean_x0"].dropna().median(), inplace = True)
test["mean_x1"].fillna(test["mean_x1"].dropna().median(), inplace = True)
test["mean_x2"].fillna(test["mean_x2"].dropna().median(), inplace = True)
test["mean_x3"].fillna(test["mean_x3"].dropna().median(), inplace = True)
test["mean_x4"].fillna(test["mean_x4"].dropna().median(), inplace = True)
test["mean_x5"].fillna(test["mean_x5"].dropna().median(), inplace = True)
test["mean_x6"].fillna(test["mean_x6"].dropna().median(), inplace = True)
test["mean_x8"].fillna(test["mean_x8"].dropna().median(), inplace = True)'''

'''# METHOD 3
test["mean_x0"].fillna(test["mean_x0"].dropna().min(), inplace = True)
test["mean_x1"].fillna(test["mean_x1"].dropna().min(), inplace = True)
test["mean_x2"].fillna(test["mean_x2"].dropna().min(), inplace = True)
test["mean_x3"].fillna(test["mean_x3"].dropna().min(), inplace = True)
test["mean_x4"].fillna(test["mean_x4"].dropna().min(), inplace = True)
test["mean_x5"].fillna(test["mean_x5"].dropna().min(), inplace = True)
test["mean_x6"].fillna(test["mean_x6"].dropna().min(), inplace = True)
test["mean_x8"].fillna(test["mean_x8"].dropna().min(), inplace = True)'''


'# METHOD 3\ntest["mean_x0"].fillna(test["mean_x0"].dropna().min(), inplace = True)\ntest["mean_x1"].fillna(test["mean_x1"].dropna().min(), inplace = True)\ntest["mean_x2"].fillna(test["mean_x2"].dropna().min(), inplace = True)\ntest["mean_x3"].fillna(test["mean_x3"].dropna().min(), inplace = True)\ntest["mean_x4"].fillna(test["mean_x4"].dropna().min(), inplace = True)\ntest["mean_x5"].fillna(test["mean_x5"].dropna().min(), inplace = True)\ntest["mean_x6"].fillna(test["mean_x6"].dropna().min(), inplace = True)\ntest["mean_x8"].fillna(test["mean_x8"].dropna().min(), inplace = True)'

In [7]:
display(test.head(10))

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X384,X385,mean_x0,mean_x1,mean_x2,mean_x3,mean_x4,mean_x5,mean_x6,mean_x8
0,1,az,v,n,f,d,t,a,w,0,...,0,0,78.026,101.413,83.37,96.565,100.658,100.634,97.803,99.134
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,93.725,99.837,94.136,102.507,100.658,100.634,100.306,101.435
2,3,az,v,as,f,d,a,j,j,0,...,0,0,78.026,101.413,103.732,96.565,100.658,100.634,101.165,100.455
3,4,az,l,n,f,d,z,l,n,0,...,0,0,78.026,100.725,83.37,96.565,100.658,100.634,98.946,102.194
4,5,w,s,as,c,d,y,i,m,0,...,0,0,112.614,101.868,103.732,101.959,100.658,88.53,101.227,100.486
5,8,y,aa,ai,e,d,x,g,s,0,...,0,0,94.045,100.203,94.136,100.033,100.658,78.44,100.306,98.916
6,10,x,b,ae,d,d,x,d,y,0,...,0,0,112.555,99.837,97.464,105.114,100.658,78.44,101.346,101.435
7,11,f,s,ae,c,d,h,d,a,0,...,0,0,93.927,101.868,97.464,101.959,100.658,78.02,101.346,100.259
8,12,ap,l,s,c,d,h,j,n,0,...,0,0,116.578,100.725,116.977,101.959,100.658,78.02,101.165,102.194
9,14,o,v,as,f,d,g,f,v,0,...,0,0,94.619,101.413,103.732,96.565,100.658,92.93,94.814,101.943


In [8]:
print("Nb of NA in train : " + str(test.mean_x5.isnull().values.sum()))

Nb of NA in train : 0


In [9]:
# Define target

# METHOD 1 : take y as is
y = pd.DataFrame({"y": train.y}, columns = ["y"])
train.drop("y", axis = 1, inplace = True)

'''# METHOD 2 : apply a transfo on y to make it more "normal"
y = pd.DataFrame({"y": np.log1p(train.y + SHIFT)}, columns = ["y"])
train.drop("y", axis = 1, inplace = True)'''

'# METHOD 2 : apply a transfo on y to make it more "normal"\ny = pd.DataFrame({"y": np.log1p(train.y + SHIFT)}, columns = ["y"])\ntrain.drop("y", axis = 1, inplace = True)'

In [10]:
# Concatenate train and test for global preprocessing
alldata = pd.concat([train, test], axis = 0)

In [11]:
# Remove ID?

'''# METHOD 1 : remove it
alldata.drop("ID", axis = 1, inplace = True)'''

# METHOD 2 : do nothing


'# METHOD 1 : remove it\nalldata.drop("ID", axis = 1, inplace = True)'

In [12]:
# Remove features with no information (unique value in train set) ?

# METHOD 1 : Remove those
constant_vars = []
for col in train.columns:
    if(train[col].nunique() == 1):
        constant_vars.append(col)
print("Removing constant variables : " + str(constant_vars))
alldata.drop(constant_vars, axis = 1, inplace = True)

# METHOD 2 : do nothing


Removing constant variables : ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [13]:
# Handle duplicate columns
def findDuplicateColumns(df) :
    total_cols = df.columns
    removed_cols = []
    for i in range(len(total_cols) - 1) :
        v = df[total_cols[i]].values
        for j in range(i + 1, len(total_cols)):
            if np.array_equal(v, df[total_cols[j]].values) :
                removed_cols.append(total_cols[j])
    return(removed_cols)

train = alldata.iloc[:train.shape[0], :]
test = alldata.iloc[train.shape[0]: , :]

# METHOD 1 : Drop columns which are duplicate in train + test
old_nb_cols = alldata.shape[1]
removed_cols = findDuplicateColumns(alldata)
alldata.drop(removed_cols, axis = 1, inplace = True)
print("Removed " + str(old_nb_cols - alldata.shape[1]) + " duplicate columns")

'''# METHOD 2 : Drop columns which are duplicate just in train
old_nb_cols = train.shape[1]
removed_cols = findDuplicateColumns(train)
alldata.drop(removed_cols, axis = 1, inplace = True)
print("Removed " + str(old_nb_cols - alldata.shape[1]) + " duplicate columns")'''

# METHOD 3 : do nothing



Removed 34 duplicate columns


'# METHOD 2 : Drop columns which are duplicate just in train\nold_nb_cols = train.shape[1]\nremoved_cols = findDuplicateColumns(train)\nalldata.drop(removed_cols, axis = 1, inplace = True)\nprint("Removed " + str(old_nb_cols - alldata.shape[1]) + " duplicate columns")'

In [14]:
# Differentiate numerical features and categorical features
cat_feats = test.select_dtypes(include = ["object"]).columns
print("Categorical features : " + str(len(cat_feats)))
num_feats = test.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(num_feats)))

Categorical features : 8
Numerical features : 365


In [15]:
# Handle cat features

'''# METHOD 1 : convert into dummy variables
alldata = pd.get_dummies(alldata).astype(int)'''

# METHOD 2 : use LabelEncoder (assumes some order in values)
le = LabelEncoder()
for col in cat_feats :
    le.fit(alldata[col]) 
    alldata[col] = le.transform(alldata[col])

'''# METHOD 3 : use LabelEncoder with other order (after a will be b, not aa)
def encodeLetters(charcode) : 
    code = 0
    length = len(str(charcode))
    for i in range(length) :
        # example : AC = 1 * 26 ^ 1 + 3 * 26 ^ 0
        code += (ord(str(charcode)[i]) - ord("a") + 1) * (26 ** (length - i - 1)) - 1
    return(code)

for col in cat_feats :
    alldata[col] = alldata[col].apply(encodeLetters)'''


'# METHOD 3 : use LabelEncoder with other order (after a will be b, not aa)\ndef encodeLetters(charcode) : \n    code = 0\n    length = len(str(charcode))\n    for i in range(length) :\n        # example : AC = 1 * 26 ^ 1 + 3 * 26 ^ 0\n        code += (ord(str(charcode)[i]) - ord("a") + 1) * (26 ** (length - i - 1)) - 1\n    return(code)\n\nfor col in cat_feats :\n    alldata[col] = alldata[col].apply(encodeLetters)'

In [16]:
# Redifferentiate train and test sets
train = alldata.iloc[:train.shape[0], :]
test = alldata.iloc[train.shape[0]: , :]
cols = train.columns

In [17]:
# Add top X PCA components
N_COMP = 8
pca = PCA(n_components = N_COMP, random_state = SEED)
pca_train = pca.fit_transform(train[cols])
pca_test = pca.transform(test[cols])
for i in range(1, N_COMP + 1) :
    train["pca_" + str(i)] = pca_train[:, i - 1]
    test["pca_" + str(i)] = pca_test[:, i - 1]


In [18]:
# Add top X ICA components
N_COMP = 8
ica = FastICA(n_components = N_COMP, random_state = SEED)
ica_train = ica.fit_transform(train[cols])
ica_test = ica.transform(test[cols])
for i in range(1, N_COMP + 1) :
    train["ica_" + str(i)] = ica_train[:, i - 1]
    test["ica_" + str(i)] = ica_test[:, i - 1]


In [19]:
'''# Add top X T-SVD components
N_COMP = 12
tsvd = TruncatedSVD(n_components = N_COMP, random_state = SEED)
tsvd_train = tsvd.fit_transform(train[cols])
tsvd_test = tsvd.transform(test[cols])
for i in range(1, N_COMP + 1) :
    train["tsvd_" + str(i)] = tsvd_train[:, i - 1]
    test["tsvd_" + str(i)] = tsvd_test[:, i - 1]'''


'# Add top X T-SVD components\nN_COMP = 12\ntsvd = TruncatedSVD(n_components = N_COMP, random_state = SEED)\ntsvd_train = tsvd.fit_transform(train[cols])\ntsvd_test = tsvd.transform(test[cols])\nfor i in range(1, N_COMP + 1) :\n    train["tsvd_" + str(i)] = tsvd_train[:, i - 1]\n    test["tsvd_" + str(i)] = tsvd_test[:, i - 1]'

In [20]:
'''# Add top X GaussianRandomProjection components
N_COMP = 12
grp = GaussianRandomProjection(n_components = N_COMP, random_state = SEED)
grp_train = grp.fit_transform(train[cols])
grp_test = grp.transform(test[cols])
for i in range(1, N_COMP + 1) :
    train["grp_" + str(i)] = grp_train[:, i - 1]
    test["grp_" + str(i)] = grp_test[:, i - 1]'''


'# Add top X GaussianRandomProjection components\nN_COMP = 12\ngrp = GaussianRandomProjection(n_components = N_COMP, random_state = SEED)\ngrp_train = grp.fit_transform(train[cols])\ngrp_test = grp.transform(test[cols])\nfor i in range(1, N_COMP + 1) :\n    train["grp_" + str(i)] = grp_train[:, i - 1]\n    test["grp_" + str(i)] = grp_test[:, i - 1]'

In [21]:
# Add top X SparseRandomProjection components
N_COMP = 12
srp = SparseRandomProjection(n_components = N_COMP, random_state = SEED)
srp_train = srp.fit_transform(train[cols])
srp_test = srp.transform(test[cols])
for i in range(1, N_COMP + 1) :
    train["srp_" + str(i)] = srp_train[:, i - 1]
    test["srp_" + str(i)] = srp_test[:, i - 1]


In [22]:
# Add AND and OR operations on top/all binary features

In [23]:
# Add simpler groups of cat features (diminish cardinality)

In [24]:
# Add columns with count of 1s for each binary col?

In [25]:
# Create Validation set for ensembling
X_test = train.sample(frac = (1 / FOLDS), random_state = (SEED * i))
X_train = train.drop(X_test.index, axis = 0)

X_test_y = y.sample(frac = (1 / FOLDS), random_state = (SEED * i))
X_train_y = y.drop(X_test_y.index, axis = 0)


In [26]:
# Write data in CSV files
train.to_csv("clean_data/train" + ".csv", index = False)
y.to_csv("clean_data/y.csv", index = False)

X_train.to_csv("clean_data/X_train" + ".csv", index = False)
X_train_y.to_csv("clean_data/X_train_y.csv", index = False)

X_test.to_csv("clean_data/X_test" + ".csv", index = False)
X_test_y.to_csv("clean_data/X_test_y.csv", index = False)

test.to_csv("clean_data/test" + ".csv", index = False)
test_ids.to_csv("clean_data/test_ids.csv", index = False)
