In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# Definitions
# Render floats in pandas output with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE: this silences every Python warning from here on, not only noisy ones.
warnings.simplefilter('ignore')
# Not referenced anywhere in this notebook — presumably a parallelism setting
# for estimators used elsewhere; TODO confirm.
N_JOBS = -1
# random_state passed to the decompositions and to the validation split.
SEED = 420
# Offset added to y before log1p in the (currently disabled) target transform.
SHIFT = 5
# The validation split holds out 1 / FOLDS of the training rows.
FOLDS = 10

In [2]:
# Load the raw train / test files
train = pd.read_csv("raw_data/train.csv")
test = pd.read_csv("raw_data/test.csv")

# Preview each frame: dimensions first, then the first two rows
display(train.shape, train.head(2))
display(test.shape, test.head(2))

# Keep the test IDs aside so they can be exported next to the cleaned data
test_ids = test["ID"]

(4209, 378)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0


(4209, 377)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0


In [3]:
# Remove outliers?

'''# METHOD 1 : remove the one biggest outlier
oldNbRows = train.shape[0]
train = train[train.y < 250]
print("Removing the most extreme outliers : " + str(oldNbRows - train.shape[0]))
print("train : " + str(train.shape))'''

# METHOD 2 : do nothing


'# METHOD 1 : remove the one biggest outlier\noldNbRows = train.shape[0]\ntrain = train[train.y < 250]\nprint("Removing the most extreme outliers : " + str(oldNbRows - train.shape[0]))\nprint("train : " + str(train.shape))'

In [4]:
# Define target

# METHOD 1 : keep the raw target, as a one-column frame sharing train's index
y = train[["y"]].copy()

'''# METHOD 2 : apply a transfo on y to make it more "normal"
y = pd.DataFrame({"y": np.log1p(train.y + SHIFT)}, columns = ["y"])'''

'# METHOD 2 : apply a transfo on y to make it more "normal"\ny = pd.DataFrame({"y": np.log1p(train.y + SHIFT)}, columns = ["y"])'

In [5]:
# Handle cat features

'''# METHOD 1 : convert into dummy variables
alldata = pd.concat([train.drop("y", axis = 1), test], axis = 0)
alldata = pd.get_dummies(alldata).astype(int)
train = alldata.iloc[:train.shape[0], :]
test = alldata.iloc[train.shape[0]: , :]'''

'''# METHOD 2 : use LabelEncoder (assumes some order in values)
for c in train.drop("y", axis = 1).columns:
    if train[c].dtype == "object" :
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))'''

# METHOD 3 : custom ordinal encoding that keeps "spreadsheet column" order (a, b, ..., z, aa, ab, ...) instead of LabelEncoder's lexicographic order (a, aa, ab, ..., b)
def encodeLetters(charcode):
    """Map a lowercase letter code to a zero-based ordinal.

    The code is read as a bijective base-26 number (a=1 ... z=26), then shifted
    down by one so that the sequence a, b, ..., z, aa, ab, ... maps to
    0, 1, ..., 25, 26, 27, ...

    The shift is applied once per code. Applying it once per character would
    make every two-letter code one too small, so that "aa" collides with "z".

    Parameters
    ----------
    charcode : any
        Value to encode; it is converted with ``str()`` first. Only lowercase
        a-z gives meaningful ordinals.

    Returns
    -------
    int
        Zero-based ordinal of the code (``-1`` for an empty string).
    """
    letters = str(charcode)
    code = 0
    for letter in letters:
        # example : ac = 1 * 26 ^ 1 + 3 * 26 ^ 0 (accumulated Horner-style)
        code = code * 26 + (ord(letter) - ord("a") + 1)
    return code - 1

# Replace every string-valued feature column with its letter-code ordinal.
# encodeLetters is a pure function of the value, so train and test get the
# same mapping without fitting anything (the LabelEncoder that used to be
# fitted here was never used).
categorical_cols = [c for c in train.columns.drop("y") if train[c].dtype == "object"]
for c in categorical_cols:
    train[c] = train[c].apply(encodeLetters)
    test[c] = test[c].apply(encodeLetters)


In [6]:
# Remove features with no information (unique value in train set) ?

# METHOD 1 : Remove those
# A feature with a single distinct value in the train set carries no signal
constant_vars = [
    col for col in train.columns.drop("y")
    if train[col].nunique() == 1
]
print("Removing constant variables : " + str(constant_vars))
for frame in (train, test):
    frame.drop(constant_vars, axis = 1, inplace = True)

'''# METHOD 2 : do nothing'''


Removing constant variables : ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


'# METHOD 2 : do nothing'

In [7]:
# Handle duplicate variables
def findDuplicateVars(df):
    """Return the names of columns that repeat an earlier column's values.

    Every column is compared with each column to its right; whenever the two
    value arrays are equal, the right-hand column's name is recorded. A name
    can therefore appear several times (e.g. three identical columns A, B, C
    yield [B, C, C]); callers should de-duplicate the result if needed.
    """
    names = list(df.columns)
    duplicates = []
    for pos, name in enumerate(names[:-1]):
        reference = df[name].values
        duplicates.extend(
            other for other in names[pos + 1:]
            if np.array_equal(reference, df[other].values)
        )
    return duplicates


# METHOD 1 : Drop variables which are duplicate in train + test
old_nb_vars = train.shape[1]
# Stack train (without the target) on top of test, so a column only counts as
# a duplicate when it matches over the rows of both sets.
alldata = pd.concat([train.drop("y", axis = 1), test], axis = 0)
removed_vars = list(set(findDuplicateVars(alldata)))
for frame in (train, test):
    frame.drop(removed_vars, axis = 1, inplace = True)
print("Removed {} duplicate variables".format(old_nb_vars - train.shape[1]))
print(sorted(removed_vars))

'''# METHOD 2 : Drop variables which are duplicate just in train
old_nb_vars = train.shape[1]
removed_vars = findDuplicateVars(train)
removed_vars = list(set(removed_vars))
train.drop(removed_vars, axis = 1, inplace = True)
test.drop(removed_vars, axis = 1, inplace = True)
print("Removed " + str(old_nb_vars - train.shape[1]) + " duplicate variables")
print(sorted(removed_vars))'''

'''# METHOD 3 : do nothing'''



Removed 34 duplicate variables
['X102', 'X113', 'X119', 'X134', 'X146', 'X147', 'X172', 'X199', 'X213', 'X214', 'X216', 'X222', 'X226', 'X227', 'X239', 'X244', 'X253', 'X254', 'X262', 'X279', 'X296', 'X299', 'X302', 'X324', 'X326', 'X35', 'X360', 'X364', 'X37', 'X382', 'X385', 'X39', 'X76', 'X84']


'# METHOD 3 : do nothing'

In [8]:
# Add one column holding, for each row, how many binary features equal 1
non_binary_cols = ["ID", "y", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]
binary_vars = list(set(train.columns.drop(non_binary_cols)))
for frame in (train, test):
    frame["bin_ones"] = (frame[binary_vars] == 1).astype(int).sum(axis = 1)

In [9]:
# Add the top N_COMP components of several dimensionality reductions.
# All reducers share one component count and are fitted on the same training
# feature matrix (target removed). Every fit happens before any component
# column is appended, so no reducer sees another reducer's output.
N_COMP = 12
decomp_input = train.drop("y", axis = 1)

# PCA
pca = PCA(n_components = N_COMP, random_state = SEED)
pca_train = pca.fit_transform(decomp_input)
pca_test = pca.transform(test)

# ICA
ica = FastICA(n_components = N_COMP, random_state = SEED)
ica_train = ica.fit_transform(decomp_input)
ica_test = ica.transform(test)

# Truncated SVD
tsvd = TruncatedSVD(n_components = N_COMP, random_state = SEED)
tsvd_train = tsvd.fit_transform(decomp_input)
tsvd_test = tsvd.transform(test)

# Gaussian random projection
grp = GaussianRandomProjection(n_components = N_COMP, random_state = SEED)
grp_train = grp.fit_transform(decomp_input)
grp_test = grp.transform(test)

# Sparse random projection
srp = SparseRandomProjection(n_components = N_COMP, random_state = SEED)
srp_train = srp.fit_transform(decomp_input)
srp_test = srp.transform(test)

# Append the components interleaved by component number
# (pca_1, ica_1, tsvd_1, grp_1, srp_1, pca_2, ...), numbered from 1.
component_sets = [
    ("pca", pca_train, pca_test),
    ("ica", ica_train, ica_test),
    ("tsvd", tsvd_train, tsvd_test),
    ("grp", grp_train, grp_test),
    ("srp", srp_train, srp_test),
]
for i in range(N_COMP):
    for prefix, train_comp, test_comp in component_sets:
        col = prefix + "_" + str(i + 1)
        train[col] = train_comp[:, i]
        test[col] = test_comp[:, i]


In [10]:
# Add median/mean/min of y for X0-X8

'''# METHOD 1 
y_x0 = train[["X0", "y"]].groupby(["X0"], as_index = False).mean()
y_x1 = train[["X1", "y"]].groupby(["X1"], as_index = False).mean()
y_x2 = train[["X2", "y"]].groupby(["X2"], as_index = False).mean()
y_x3 = train[["X3", "y"]].groupby(["X3"], as_index = False).mean()
y_x4 = train[["X4", "y"]].groupby(["X4"], as_index = False).mean()
y_x5 = train[["X5", "y"]].groupby(["X5"], as_index = False).mean()
y_x6 = train[["X6", "y"]].groupby(["X6"], as_index = False).mean()
y_x8 = train[["X8", "y"]].groupby(["X8"], as_index = False).mean()'''

'''# METHOD 2
y_x0 = train[["X0", "y"]].groupby(["X0"], as_index = False).median()
y_x1 = train[["X1", "y"]].groupby(["X1"], as_index = False).median()
y_x2 = train[["X2", "y"]].groupby(["X2"], as_index = False).median()
y_x3 = train[["X3", "y"]].groupby(["X3"], as_index = False).median()
y_x4 = train[["X4", "y"]].groupby(["X4"], as_index = False).median()
y_x5 = train[["X5", "y"]].groupby(["X5"], as_index = False).median()
y_x6 = train[["X6", "y"]].groupby(["X6"], as_index = False).median()
y_x8 = train[["X8", "y"]].groupby(["X8"], as_index = False).median()'''

'''# METHOD 3
y_x0 = train[["X0", "y"]].groupby(["X0"], as_index = False).min()
y_x1 = train[["X1", "y"]].groupby(["X1"], as_index = False).min()
y_x2 = train[["X2", "y"]].groupby(["X2"], as_index = False).min()
y_x3 = train[["X3", "y"]].groupby(["X3"], as_index = False).min()
y_x4 = train[["X4", "y"]].groupby(["X4"], as_index = False).min()
y_x5 = train[["X5", "y"]].groupby(["X5"], as_index = False).min()
y_x6 = train[["X6", "y"]].groupby(["X6"], as_index = False).min()
y_x8 = train[["X8", "y"]].groupby(["X8"], as_index = False).min()'''

'''y_x0.columns = ["X0", "y_x0"]
y_x1.columns = ["X1", "y_x1"]
y_x2.columns = ["X2", "y_x2"]
y_x3.columns = ["X3", "y_x3"]
y_x4.columns = ["X4", "y_x4"]
y_x5.columns = ["X5", "y_x5"]
y_x6.columns = ["X6", "y_x6"]
y_x8.columns = ["X8", "y_x8"]
train = pd.merge(train, y_x0, on = "X0", how = "outer")
train = pd.merge(train, y_x1, on = "X1", how = "outer")
train = pd.merge(train, y_x2, on = "X2", how = "outer")
train = pd.merge(train, y_x3, on = "X3", how = "outer")
train = pd.merge(train, y_x4, on = "X4", how = "outer")
train = pd.merge(train, y_x5, on = "X5", how = "outer")
train = pd.merge(train, y_x6, on = "X6", how = "outer")
train = pd.merge(train, y_x8, on = "X8", how = "outer")
test = pd.merge(test, y_x0, on = "X0", how = "left")
test = pd.merge(test, y_x1, on = "X1", how = "left")
test = pd.merge(test, y_x2, on = "X2", how = "left")
test = pd.merge(test, y_x3, on = "X3", how = "left")
test = pd.merge(test, y_x4, on = "X4", how = "left")
test = pd.merge(test, y_x5, on = "X5", how = "left")
test = pd.merge(test, y_x6, on = "X6", how = "left")
test = pd.merge(test, y_x8, on = "X8", how = "left")'''

'''# METHOD 1
test["y_x0"].fillna(test["y_x0"].dropna().mean(), inplace = True)
test["y_x1"].fillna(test["y_x1"].dropna().mean(), inplace = True)
test["y_x2"].fillna(test["y_x2"].dropna().mean(), inplace = True)
test["y_x3"].fillna(test["y_x3"].dropna().mean(), inplace = True)
test["y_x4"].fillna(test["y_x4"].dropna().mean(), inplace = True)
test["y_x5"].fillna(test["y_x5"].dropna().mean(), inplace = True)
test["y_x6"].fillna(test["y_x6"].dropna().mean(), inplace = True)
test["y_x8"].fillna(test["y_x8"].dropna().mean(), inplace = True)'''

'''# METHOD 2
test["y_x0"].fillna(test["y_x0"].dropna().median(), inplace = True)
test["y_x1"].fillna(test["y_x1"].dropna().median(), inplace = True)
test["y_x2"].fillna(test["y_x2"].dropna().median(), inplace = True)
test["y_x3"].fillna(test["y_x3"].dropna().median(), inplace = True)
test["y_x4"].fillna(test["y_x4"].dropna().median(), inplace = True)
test["y_x5"].fillna(test["y_x5"].dropna().median(), inplace = True)
test["y_x6"].fillna(test["y_x6"].dropna().median(), inplace = True)
test["y_x8"].fillna(test["y_x8"].dropna().median(), inplace = True)'''

'''# METHOD 3
test["y_x0"].fillna(test["y_x0"].dropna().min(), inplace = True)
test["y_x1"].fillna(test["y_x1"].dropna().min(), inplace = True)
test["y_x2"].fillna(test["y_x2"].dropna().min(), inplace = True)
test["y_x3"].fillna(test["y_x3"].dropna().min(), inplace = True)
test["y_x4"].fillna(test["y_x4"].dropna().min(), inplace = True)
test["y_x5"].fillna(test["y_x5"].dropna().min(), inplace = True)
test["y_x6"].fillna(test["y_x6"].dropna().min(), inplace = True)
test["y_x8"].fillna(test["y_x8"].dropna().min(), inplace = True)'''


'# METHOD 3\ntest["y_x0"].fillna(test["y_x0"].dropna().min(), inplace = True)\ntest["y_x1"].fillna(test["y_x1"].dropna().min(), inplace = True)\ntest["y_x2"].fillna(test["y_x2"].dropna().min(), inplace = True)\ntest["y_x3"].fillna(test["y_x3"].dropna().min(), inplace = True)\ntest["y_x4"].fillna(test["y_x4"].dropna().min(), inplace = True)\ntest["y_x5"].fillna(test["y_x5"].dropna().min(), inplace = True)\ntest["y_x6"].fillna(test["y_x6"].dropna().min(), inplace = True)\ntest["y_x8"].fillna(test["y_x8"].dropna().min(), inplace = True)'

In [11]:
# Handle duplicate rows
'''cols_to_groupby = [k for k in train.columns if k not in ["ID", "y"]] 
cols_to_apply = cols_to_groupby + ["y"]'''

'''# METHOD 1 : take mean of duplicate rows
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).mean()
train.columns = cols_to_apply'''

'''# METHOD 2 : take median of duplicate rows
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).median()
train.columns = cols_to_apply'''

'''# METHOD 3 : keep the row with the lowest y 
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).min()
train.columns = cols_to_apply'''

# METHOD 4 : do nothing


'# METHOD 3 : keep the row with the lowest y \ntrain = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).min()\ntrain.columns = cols_to_apply'

In [12]:
'''# Concatenate train and test for global preprocessing
alldata = pd.concat([train, test], axis = 0)'''

'# Concatenate train and test for global preprocessing\nalldata = pd.concat([train, test], axis = 0)'

In [13]:
'''# Differentiate numerical features and categorical features
cat_feats = test.select_dtypes(include = ["object"]).columns
print("Categorical features : " + str(len(cat_feats)))
num_feats = test.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(num_feats)))'''

'# Differentiate numerical features and categorical features\ncat_feats = test.select_dtypes(include = ["object"]).columns\nprint("Categorical features : " + str(len(cat_feats)))\nnum_feats = test.select_dtypes(exclude = ["object"]).columns\nprint("Numerical features : " + str(len(num_feats)))'

In [14]:
'''# Redifferentiate train and test sets
train = alldata.iloc[:train.shape[0], :]
test = alldata.iloc[train.shape[0]: , :]
cols = train.columns'''

'# Redifferentiate train and test sets\ntrain = alldata.iloc[:train.shape[0], :]\ntest = alldata.iloc[train.shape[0]: , :]\ncols = train.columns'

In [15]:
# Add AND and OR operations on top/all binary features

In [16]:
# Add simpler groups of cat features (diminish cardinality)

In [17]:
# Create Validation set for ensembling
# Hold out 1 / FOLDS of the training rows, reproducibly (SEED).
# 1.0 keeps this a float division whatever the interpreter version.
train_features = train.drop("y", axis = 1)
X_test = train_features.sample(frac = (1.0 / FOLDS), random_state = SEED)
X_train = train_features.drop(X_test.index, axis = 0)

# Select the targets by the sampled row labels rather than re-sampling y with
# the same seed: a second sample only lines up with X_test while y and train
# happen to have the same length and order.
assert y.index.equals(train.index), "y and train are no longer row-aligned"
X_test_y = y.loc[X_test.index]
X_train_y = y.drop(X_test.index, axis = 0)


In [18]:
# Write data in CSV files
# The target is exported on its own (y.csv), so take it out of the feature frame first
train.drop("y", axis = 1, inplace = True)

# (object to export, file stem) — written in this order under clean_data/
outputs = [
    (train, "train"),
    (y, "y"),
    (X_train, "X_train"),
    (X_train_y, "X_train_y"),
    (X_test, "X_test"),
    (X_test_y, "X_test_y"),
    (test, "test"),
    (test_ids, "test_ids"),
]
for frame, stem in outputs:
    frame.to_csv("clean_data/" + stem + ".csv", index = False)
