In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [2]:
import cupy, cudf, cuml

Loading the training data from the CSV file

In [3]:
# Directory holding the competition CSV files; reused later to load test.csv.
path = "../input/house-prices-advanced-regression-techniques/"
# Read the training set into a cudf DataFrame.
train_df = cudf.read_csv(path+"train.csv")

Viewing column names, non-null count and data type

In [4]:
# Print the DataFrame summary for the raw training frame.
train_df.info()

Viewing top 5 rows

In [5]:
# Display the first rows of the raw training frame.
train_df.head()

Selecting columns with float and integer data types

In [6]:
# Dtypes treated as numeric for the rest of the notebook.
req_dtypes = ['int64', 'float64', 'int32', 'float32']

# Keep, in original column order, every training column whose dtype is in req_dtypes.
req_columns = []
for col_name in train_df.columns:
    if train_df[col_name].dtype in req_dtypes:
        req_columns.append(col_name)

In [7]:
# Restrict the training frame to the numeric columns selected in req_columns.
new_train_df = train_df[req_columns]

In [8]:
# Print the DataFrame summary for the numeric-only training frame.
new_train_df.info()

Checking for null values in each column

In [9]:
# Per-column count of missing values before imputation.
new_train_df.isnull().sum()

Filling null values with the median of each column

In [10]:
# Impute missing values column by column with that column's median (in place).
column_medians = new_train_df.median()
new_train_df.fillna(column_medians, inplace=True)

In [11]:
# Re-count missing values per column after the median fill.
new_train_df.isnull().sum()

In [12]:
# Features: every column except the first and the last.
# Target: the last column (presumably SalePrice -- verify against req_columns).
X = new_train_df.iloc[:, 1:-1]
Y = new_train_df.iloc[:, -1]

In [13]:
# Show the dimensions of the feature matrix.
X.shape

In [14]:
# Show the dimensions of the target.
Y.shape

Checking for moderate to high skewness

In [15]:
# Collect the feature columns whose absolute skewness exceeds 0.5.
req_features = []
for col in X.columns:
    if abs(X[col].skew()) > 0.5:
        req_features.append(col)

In [16]:
# Display the list of skewed feature names selected above.
req_features

Applying log transformation to fix skewness

In [17]:
# Replace each skewed feature with log1p of its values.
# NOTE: this overwrites columns of X, so running the cell twice applies log1p twice.
for skewed_col in req_features:
    transformed = cupy.log1p(X[skewed_col])
    X[skewed_col] = transformed

Standardizing data

In [18]:
# Scaler instance; kept in a variable so the fitted object stays available to later cells.
std_scaler = cuml.preprocessing.StandardScaler()

# Fit the scaler on the full feature matrix and replace X with the scaled result.
# NOTE: X is overwritten, so re-running this cell rescales already-scaled data.
X = std_scaler.fit_transform(X)

Splitting dataset

In [19]:
from cuml.preprocessing import train_test_split
# Split the scaled features and target with test_size=0.2; random_state=3 is passed
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [20]:
from cuml.metrics.regression import r2_score, mean_absolute_error, mean_squared_error

def get_metrics(model, X_test, y_test):
    """Score a fitted model on a held-out split, print the scores and return them.

    Parameters:
        model: fitted estimator exposing ``predict``.
        X_test: features passed to ``model.predict``.
        y_test: true target values; cast to float64 before scoring.

    Returns:
        list: ``[r2, mae, mse]`` in that order.
    """
    predicted = model.predict(X_test)
    actual = y_test.astype('float64')
    scores = [
        r2_score(actual, predicted),
        mean_absolute_error(actual, predicted),
        mean_squared_error(actual, predicted),
    ]
    # Labels are paired positionally with the scores computed above.
    for label, value in zip(('R2:', 'MAE:', 'MSE:'), scores):
        print(label, value)
    return scores

Fitting linear regression with each of the svd, eig, qr, svd-qr and svd-jacobi algorithms

In [21]:
from cuml.linear_model import LinearRegression

# Algorithm names passed to LinearRegression(algorithm=...), one model per entry.
algorithms = ["svd", "eig", 'qr', "svd-qr", "svd-jacobi"]

# Fitted models and their [R2, MAE, MSE] scores, in the same order as `algorithms`.
models, metrics = [], []

for solver_name in algorithms:
    regressor = LinearRegression(algorithm=solver_name)
    regressor.fit(X_train, y_train)
    models.append(regressor)
    # get_metrics also prints the scores for this model.
    scores = get_metrics(regressor, X_test, y_test)
    metrics.append(scores)


In [22]:
# One row per algorithm (index) with its R2 / MAE / MSE scores as columns.
comp_table = cudf.DataFrame(metrics, columns=['R2', 'MAE', 'MSE'], index = algorithms)

In [23]:
# Display the per-algorithm comparison table.
comp_table

Applying PCA to increase R2 score

In [24]:
from cuml.decomposition import PCA

# PCA keeping 5 components; the fitted object is reused later for the test data.
pca = PCA(n_components=5)

# Fit PCA on the full scaled feature matrix (before any split) and project it.
X_pca = pca.fit_transform(X)

# Display the shape of the projected data.
X_pca.shape

In [25]:
# Re-split using the PCA-projected features. This overwrites X_train / X_test /
# y_train / y_test from the earlier split and uses test_size=0.3 instead of 0.2.
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.3, random_state=3)

Applying ridge regression

In [26]:
from cuml.linear_model import Ridge

# Solver names passed to Ridge(solver=...), one model per entry.
algos = ["svd", "eig"]

# Fitted ridge models and their [R2, MAE, MSE] scores, in the same order as `algos`.
pca_nl_models, pca_nl_metrics = [], []

for solver_name in algos:
    ridge = Ridge(alpha=1, solver=solver_name)
    ridge.fit(X_train, y_train)
    pca_nl_models.append(ridge)
    # get_metrics also prints the scores for this model.
    pca_nl_metrics.append(get_metrics(ridge, X_test, y_test))

In [27]:
# Tabulate the ridge scores. The row labels are hard-coded, so they must stay in the
# same order as the solvers that produced pca_nl_metrics.
ridge_stats = cudf.DataFrame(pca_nl_metrics, columns=['R2', 'MAE', 'MSE'], index=['ridge_svd', 'ridge_eig'])
# Display the table.
ridge_stats

Applying the same preprocessing steps to the test data

In [28]:
# Read the test set from the same directory as the training data.
t_data = cudf.read_csv(path + 'test.csv')

In [29]:
# Drop the target from the column list before selecting columns from the test frame.
# A filtering rebind is used instead of list.remove() so the cell can be re-run:
# remove() raises ValueError once 'SalePrice' is already gone.
req_columns = [col for col in req_columns if col != 'SalePrice']

In [30]:
# Select the columns listed in req_columns from the test frame, then drop the first
# of them with iloc[:, 1:] (presumably the Id column -- verify against req_columns).
data = t_data[req_columns].iloc[:, 1:]

In [31]:
# Per-column count of missing values in the selected test features.
data.isnull().sum()

In [32]:
# Impute missing test values column by column with that column's median (in place).
# NOTE(review): these medians are computed from the test set itself, not taken from
# the training data -- confirm this is intended.
test_medians = data.median()
data.fillna(test_medians, inplace=True)

In [33]:
# Reuse the skewed-feature list chosen on the training data (req_features) so the test
# set gets log1p applied to exactly the same columns the models were trained with.
# Recomputing skewness on the test set could select a different set of columns.
skew_features = list(req_features)

In [34]:
# Replace each listed test column with log1p of its values.
# NOTE: this overwrites columns of `data`, so running the cell twice applies log1p twice.
for skewed_col in skew_features:
    transformed = cupy.log1p(data[skewed_col])
    data[skewed_col] = transformed

In [35]:
# Project the test features with the scaler and the PCA that were already fitted on
# the training data. Calling fit_transform here would refit PCA on the test set, giving
# components that do not match the ones the ridge models were trained on, and would
# skip the standardization step that the training features went through.
data = pca.transform(std_scaler.transform(data))
print(data.shape)

In [36]:
# Take the first ridge model stored in pca_nl_models (the one fitted with the first
# solver in the sweep) and predict on the projected test features.
ridge_model = pca_nl_models[0]
y_pred = ridge_model.predict(data)

In [37]:
# Assemble the submission: test-set Ids (cast to int32) paired with the predictions.
predictions = {
    'Id': t_data['Id'].astype('int32'),
    'SalePrice': y_pred,
}
sub = cudf.DataFrame(predictions)
# info() writes its summary to stdout and returns None, so wrapping it in print()
# only adds a stray "None" line; call it directly.
sub.info()
sub

In [39]:
# Write the submission frame to submission.csv, omitting the index column.
sub.to_csv('submission.csv', index=False)