In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.svm

In [2]:
# Load the three CSV tables: skip the header row of each file and discard
# the first column, keeping every remaining column as float data.
X_test, X_train, Y_train = (
    np.genfromtxt(csv_name, delimiter=',', skip_header=1)[:, 1:]
    for csv_name in ('X_test.csv', 'X_train.csv', 'Y_train.csv')
)

In [35]:
# Count how often each integer label occurs in Y_train, then express the
# counts as fractions of the total so they sum to 1.
class_dist = np.bincount(Y_train.ravel().astype(int))
weights = np.true_divide(class_dist, class_dist.sum())
weights

array([0.125, 0.75 , 0.125])

In [None]:
# Train a random-forest regressor on median-imputed, robust-scaled features
# with outlier cells zeroed out.
#
# States below FINAL_STATE hold out 10% of the training rows (split seeded by
# the state) and record R^2 on the train and validation parts. FINAL_STATE
# fits on all training rows and its predictions for X_test are written to
# "Y_test_<tag>.csv", where <tag> is sys.argv[1].
#
# NOTE(review): under a Jupyter kernel sys.argv[1] is usually the launcher's
# own "-f" flag rather than a user-chosen tag -- confirm this is intended.
assert len(sys.argv) > 1, "expected a run tag in sys.argv[1] for the output file name"

OUTLIER_THRESHOLD = 2       # |value| after robust scaling above this counts as an outlier
MAX_OUTLIERS_PER_ROW = 20   # training rows with more outlier cells than this are dropped
FIRST_STATE = 8
FINAL_STATE = 9             # the last state trains on everything and predicts X_test

train_score, val_score = [], []

# Read the data once (the files do not change between iterations). Column 0
# is kept here: it is written out as the "id" column of the prediction file
# and is sliced off right before fitting / predicting.
X_test = np.genfromtxt('X_test.csv', delimiter=',', skip_header=1)
X_train_raw = np.genfromtxt('X_train.csv', delimiter=',', skip_header=1)
Y_train_raw = np.genfromtxt('Y_train.csv', delimiter=',', skip_header=1)

for state in range(FIRST_STATE, FINAL_STATE + 1):
    is_validation_run = state < FINAL_STATE

    # Train/validation split, or full training set + test set on the final state
    if is_validation_run:
        X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(
            X_train_raw, Y_train_raw, test_size=.1, random_state=state)
    else:
        X_train, Y_train = X_train_raw, Y_train_raw
        X_val = X_test.copy()
        # Placeholder targets so the slicing below works for the test set too
        Y_val = np.zeros((X_val.shape[0], 2))

    # Targets: drop column 0. The y scaler has centering and scaling both
    # disabled, so it is configured as a pass-through; it is kept so the
    # inverse_transform calls below stay valid if real scaling is enabled.
    y_scaler = sklearn.preprocessing.RobustScaler(with_scaling=False, with_centering=False)
    y_scaler.fit([[0], [0]])
    Y_train, Y_val = Y_train[:, 1:], Y_val[:, 1:]

    # Impute missing features: after subtracting the training medians the
    # missing cells are still NaN, and nan_to_num turns them into 0, i.e.
    # exactly the (shifted) median.
    x_medians = np.nanmedian(X_train, axis=0)
    X_train = np.nan_to_num(X_train - x_medians)
    X_val = np.nan_to_num(X_val - x_medians)

    # Scale features; the scaler is fitted on the training part only
    x_scaler = sklearn.preprocessing.RobustScaler()
    X_train = x_scaler.fit_transform(X_train)
    X_val = x_scaler.transform(X_val)

    # Zero out individual outlier cells
    X_train_outliers = np.abs(X_train) > OUTLIER_THRESHOLD
    X_train[X_train_outliers] = 0
    X_val[np.abs(X_val) > OUTLIER_THRESHOLD] = 0

    # Drop training rows that had too many outlier cells (mask computed once
    # and applied to features and targets alike so they stay aligned)
    keep_rows = X_train_outliers.sum(axis=1) <= MAX_OUTLIERS_PER_ROW
    X_train = X_train[keep_rows, :]
    Y_train = Y_train[keep_rows]

    assert not np.any(np.isnan(X_train)) and not np.any(np.isnan(X_val))

    # Seed the forest with the state so a re-run reproduces the same scores
    # and predictions. (The name `linreg` is historical; this is a forest.)
    linreg = sklearn.ensemble.RandomForestRegressor(min_samples_leaf=2, n_estimators=128,
                                                    random_state=state)
    linreg.fit(X_train[:, 1:], Y_train[:, 0])

    if is_validation_run:
        train_pred = linreg.predict(X_train[:, 1:])[:, np.newaxis]
        val_pred = linreg.predict(X_val[:, 1:])[:, np.newaxis]
        train_score.append(sklearn.metrics.r2_score(y_scaler.inverse_transform(Y_train),
                                                    y_scaler.inverse_transform(train_pred)))
        val_score.append(sklearn.metrics.r2_score(y_scaler.inverse_transform(Y_val),
                                                  y_scaler.inverse_transform(val_pred)))


# Report mean and standard deviation (same units as the score) over the validation runs
print('Training set:   %.3f ±%.3f' % (np.mean(train_score), np.std(train_score)))
print('Validation set: %.3f ±%.3f' % (np.mean(val_score), np.std(val_score)))

# After the loop, linreg / X_val / y_scaler come from the FINAL_STATE iteration,
# where X_val holds the preprocessed test set.
Y_test = y_scaler.inverse_transform(linreg.predict(X_val[:, 1:])[:, np.newaxis])
np.savetxt("Y_test_%s.csv" % sys.argv[1], np.concatenate((X_test[:, :1], Y_test), axis=1),
           delimiter=",", header="id,y", fmt='%.5f', comments='')