In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import FastICA

from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

## Facebook Comment Volume Dataset

In [11]:
df = pd.read_csv('datasets/facebook_comments.csv', index_col=0)

In [12]:
print('dataset size: {}'.format(df.shape))

dataset size: (40949, 54)


In [13]:
# encoding categorical variables
# H Local - category
# Post Promotion Status - category
# Base Time - time variable
# Page Category - category

H_Local = OneHotEncoder().fit_transform(df['H Local'].values.reshape(-1, 1)).todense()
Post_Promotion_Status = OneHotEncoder().fit_transform(df['Post Promotion Status'].values.reshape(-1, 1)).todense()
Base_Time = OneHotEncoder().fit_transform(df['Base Time'].values.reshape(-1, 1)).todense()
Page_Category = OneHotEncoder().fit_transform(df['Page Category'].values.reshape(-1, 1)).todense()

In [14]:
y = df.Target.values.reshape(-1, 1)
X = df.drop(['H Local', 'Post Promotion Status', 'Base Time', 'Page Category', 'Target'], axis=1).values
X = np.hstack([X, H_Local, Post_Promotion_Status, Base_Time, Page_Category])

In [15]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (40949, 228)


In [16]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X)

In [17]:
X, y = shuffle(X, y)

models = {
    'linear': LinearRegression(),
    'forest': LGBMRegressor()
}

stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        
        model.fit(X_train, y_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_test), y_test))
        

In [18]:
print('for pure data:')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for pure data:
linear, MSE: 804.4127647110759
forest, MSE: 674.5280983831519


In [19]:
X, y = shuffle(X_ica, y)

models = {
    'linear': LinearRegression(),
    'forest': LGBMRegressor()
}

stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        
        model.fit(X_train, y_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_test), y_test))
        

In [20]:
print('for decorrelated data using ICA')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for decorrelated data using ICA
linear, MSE: 1260.2863077124107
forest, MSE: 1265.4528301251357


## Parkinsons Telemonitoring Data Set

In [21]:
df = pd.read_csv('datasets/parkinsons_updrs.data')

In [22]:
print('dataset size: {}'.format(df.shape))

dataset size: (5875, 22)


In [23]:
subject = df['subject#'].unique().tolist()
subject_binary = np.zeros((df.shape[0], len(subject)))
for k, i in df.iterrows():
    subject_binary[k, int(i['subject#']) - 1] = 1

In [24]:
# replacing categorical features with binary values
y = df.total_UPDRS
X = df.drop(['motor_UPDRS', 'total_UPDRS', 'subject#'], axis=1).values
X = np.concatenate([subject_binary, X], axis=1)

In [25]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (5875, 61)


In [26]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X)

In [27]:
X, y = shuffle(X, y)

models = {
    'linear': LinearRegression(),
    'forest': LGBMRegressor()
}

stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        
        model.fit(X_train, y_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_test), y_test))
        

In [28]:
print('for pure data:')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for pure data:
linear, MSE: 116.36315466704814
forest, MSE: 115.42544736311554


In [29]:
X, y = shuffle(X_ica, y)

models = {
    'linear': LinearRegression(),
    'forest': LGBMRegressor()
}

stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        
        model.fit(X_train, y_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_test), y_test))
        

In [30]:
print('for decorrelated data using ICA')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for decorrelated data using ICA
linear, MSE: 114.74585618538356
forest, MSE: 115.05177890203626


## Energy efficiency Data Set

In [31]:
df = pd.read_excel('./datasets/ENB2012_data.xlsx')

In [32]:
print('dataset size: {}'.format(df.shape))

dataset size: (768, 10)


In [33]:
subject = df['X6'].unique().tolist()
subject_map = dict(zip(subject, range(len(subject))))
subject_binary = np.zeros((df.shape[0], len(subject)))
for k, i in df.iterrows():
    subject_binary[k, subject_map[i['X6']]] = 1

In [34]:
# replacing categorical features with binary values
y = df.Y1
X = df.drop(['Y1', 'Y2', 'X6'], axis=1).values
X = np.concatenate([subject_binary, X], axis=1)

In [35]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (768, 11)


In [36]:
# number of components is fixed to 4
ica = FastICA(n_components=4, tol=0.01)
X_ica = ica.fit_transform(X)

In [37]:
X, y = shuffle(X, y)

models = {
    'linear': LinearRegression(),
    'forest': LGBMRegressor()
}

stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        
        model.fit(X_train, y_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_test), y_test))
        

In [38]:
print('for pure data:')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for pure data:
linear, MSE: 104.3818930842221
forest, MSE: 108.46182680569012


In [39]:
X, y = shuffle(X_ica, y)

models = {
    'linear': LinearRegression(),
    'forest': LGBMRegressor()
}

stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        
        model.fit(X_train, y_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_test), y_test))
        

In [40]:
print('for decorrelated data using ICA')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for decorrelated data using ICA
linear, MSE: 101.70438907482556
forest, MSE: 105.52809637090914
