## ESPERANTO - Machine Learning Project

TEAM: Karolina Wojciechowska, Justyna Krygier, Karol Mularski, Łukasz Rosenkiewicz

DATASET: https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

![uci](refs/UCI.jpg)

In [17]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import mean_absolute_error, make_scorer, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22. `SimpleImputer` is its
# replacement and, constructed with default arguments, also fills NaNs with the column mean.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20 only ships the old class
    from sklearn.preprocessing import Imputer
warnings.filterwarnings("ignore")

# Dataset Features

![summary](refs/summary.png)

In [18]:
# The eight predictor columns fed to every model, and the regression target.
columns_to_model = [
    'Cement',
    'Blast Furnace Slag',
    'Fly Ash',
    'Water',
    'Superplasticizer',
    'Coarse Aggregate',
    'Fine Aggregate',
    'Age',
]

# The CSV is parsed with ',' as the decimal separator and with spaces after delimiters skipped.
dataset = pd.read_csv(
    'jdsz2-esperanto/projekt_ml/Concrete_Data.csv',
    quotechar='"',
    decimal=',',
    skipinitialspace=True,
)

X = dataset.loc[:, columns_to_model]
y = dataset.loc[:, 'Concrete compressive strength']

In [19]:
# One wide scatter panel per feature: the target on the x-axis, the feature value on the y-axis.
for feature in columns_to_model:
    fig, ax = plt.subplots(figsize=plt.figaspect(0.1))
    # `cmap` was dropped: without a `c=` array matplotlib ignores it, so the points look the same.
    ax.scatter(y, X[feature], edgecolor='c', s=4, label=feature)
    ax.set_title(feature)
    ax.set_xlabel('Concrete compressive strength')
    ax.set_ylabel(feature)  # the y-axis was unlabelled before
    plt.show()
    plt.close(fig)  # release the figure so the loop does not accumulate open figures

## Split the dataset: 75% train / 25% test

In [20]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

## Create Mean Absolute Percentage Error function

In [21]:
def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_test - y_pred| / |y_test|) * 100`` over all elements.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values, one per true value (a single scalar is broadcast).

    Returns
    -------
    float
        The MAPE in percent (10.0 means the predictions are off by 10% on average).

    Raises
    ------
    ValueError
        If ``y_test`` is empty, or (raised by NumPy) if the two inputs have
        incompatible lengths.
    """
    # Flatten both inputs so that (n,) targets and (n, 1) predictions are compared
    # element-wise instead of silently broadcasting into an (n, n) matrix.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("mean_absolute_percentage_error needs at least one sample")

    # Clip the denominator at machine epsilon so a zero target gives a finite
    # (very large, or 0 for a perfect prediction) term instead of inf/nan.
    denominator = np.maximum(np.abs(actual), np.finfo(float).eps)
    return np.mean(np.abs(actual - predicted) / denominator) * 100

## Regression Models

In [22]:
## LINEAR REGRESSION
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where it raises a
# TypeError. Ordinary least squares predictions do not depend on feature scaling, so leaving
# the flag out keeps the fitted predictions the same (up to floating-point rounding).
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred_lin = regr.predict(X_test)

## XGBoost
# Gradient-boosted regressor with the library's default hyper-parameters, fitted quietly.
clf_xgbr = XGBRegressor().fit(X_train, y_train, verbose=False)
y_pred_xgb = clf_xgbr.predict(X_test)

## SVR - 2 kernels
# NOTE(review): both SVRs are trained on the raw, unscaled feature columns; SVMs are
# scale-sensitive, which may explain their weak scores - consider a scaler, TODO confirm.

# Linear kernel, default regularisation.
svr_lin = svm.SVR(kernel='linear').fit(X_train, y_train)
y_pred_svrlin = svr_lin.predict(X_test)

# RBF kernel with hand-picked C and gamma.
svr_rbf = svm.SVR(kernel='rbf', C=1e3, gamma=0.1).fit(X_train, y_train)
y_pred_svrrbf = svr_rbf.predict(X_test)

## Decision Tree
# Fit one tree per `max_features` setting and keep its hold-out scores, so the settings can be
# compared. Previously the scores were computed and then discarded.
dtree_score_rows = []
for max_features in range(1, 9):
    # A fixed seed makes the tree's random feature selection repeatable between runs.
    dtree = DecisionTreeRegressor(max_features=max_features, random_state=101, max_depth=None,
                                  min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0)
    dtree.fit(X_train, y_train)
    y_pred_tree = dtree.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred_tree)
    r2 = metrics.r2_score(y_test, y_pred_tree)
    dtree_score_rows.append({'max_features': max_features, 'MAE': mae, 'R^2': r2})

# `dtree` and `y_pred_tree` deliberately keep the last fit (max_features=8, i.e. all features):
# that is the tree reported as "DTree" by the later cells.
dtree_scores = pd.DataFrame(dtree_score_rows).set_index('max_features')
dtree_scores


### Prediction Results

In [23]:
# Hold-out scores for every model, printed one metric group at a time.
# Each row pairs the exact label text with the predictions it describes.
r2_rows = [
    ('R^2_RL          :  ', y_pred_lin),
    ('R^2_XGb       :  ', y_pred_xgb),
    ('R^2_SVRLin   :  ', y_pred_svrlin),
    ('R^2_SVRrbf   :  ', y_pred_svrrbf),
    ('R^2_DTree     :  ', y_pred_tree),
]
mae_rows = [
    ('Mean Absolute Error_RL          :  ', y_pred_lin),
    ('Mean Absolute Error_XGb       :  ', y_pred_xgb),
    ('Mean Absolute Error_SVRlin    :  ', y_pred_svrlin),
    ('Mean Absolute Error_SVRrbf   :  ', y_pred_svrrbf),
    ('Mean Absolute Error_DTree    :  ', y_pred_tree),
]
mape_rows = [
    ('Mean Absolute % Error_RL          :  ', y_pred_lin),
    ('Mean Absolute % Error_XGb       :  ', y_pred_xgb),
    ('Mean Absolute % Error_SVRlin   :  ', y_pred_svrlin),
    ('Mean Absolute % Error_SVRrbf   :  ', y_pred_svrrbf),
    ('Mean Absolute % Error_DTree    :  ', y_pred_tree),
]

report_groups = [
    (metrics.r2_score, r2_rows),
    (metrics.mean_absolute_error, mae_rows),
    (mean_absolute_percentage_error, mape_rows),
]
for group_index, (score_fn, rows) in enumerate(report_groups):
    if group_index > 0:
        print()  # blank line between metric groups
    for label, predictions in rows:
        print(label, score_fn(y_test, predictions))

R^2_RL          :   0.6101732771570201
R^2_XGb       :   0.9063913725938589
R^2_SVRLin   :   0.5161990265446725
R^2_SVRrbf   :   0.292472290460113
R^2_DTree     :   0.8352655140924142

Mean Absolute Error_RL          :   8.655951386743029
Mean Absolute Error_XGb       :   3.9290020516861315
Mean Absolute Error_SVRlin    :   8.689916245057379
Mean Absolute Error_SVRrbf   :   10.977198973850978
Mean Absolute Error_DTree    :   4.517228682170542

Mean Absolute % Error_RL          :   31.73687243052431
Mean Absolute % Error_XGb       :   12.633529968384455
Mean Absolute % Error_SVRlin   :   30.762259295344997
Mean Absolute % Error_SVRrbf   :   47.1005207910189
Mean Absolute % Error_DTree    :   14.436364696144603


## Cross validation

### Parameters

In [24]:
# Fill missing feature values with the imputer's default strategy (the column mean). It is
# fitted on the training split only and then applied unchanged to the test split.
my_imputer = Imputer()
train_X = my_imputer.fit_transform(X_train)
test_X = my_imputer.transform(X_test)

# 20 consecutive, unshuffled folds - the same folds the original call produced.
# `random_state=11` was removed: it only takes effect together with `shuffle=True`, and recent
# scikit-learn raises a ValueError when it is set without shuffling. X_train already comes
# from a shuffled train/test split, so the rows are not in file order.
kfold = KFold(n_splits=20)

# Both scorers return the raw, positive error (lower is better), which is fine for reporting.
# NOTE: make_scorer defaults to greater_is_better=True; pass greater_is_better=False before
# using these scorers for model selection (e.g. in a grid search).
scorer_MAE = make_scorer(mean_absolute_error)
scorer_MAPE = make_scorer(mean_absolute_percentage_error)

### Cross Validation Results

##### --- Mean Absolute Error ---

In [None]:
# NOTE(review): this cell is disabled. Presumably the 20-fold runs were too slow to repeat on
# every execution and the screenshot that follows shows their output - confirm, then either
# re-enable the cell or delete it.
# When enabled, each line cross-validates one model on the training split with `kfold` and
# scores every fold with `scorer_MAE`.
# results_MAE_regr   = cross_val_score(regr, X_train, y_train, cv=kfold, scoring=scorer_MAE)
# results_MAE_XGb    = cross_val_score(clf_xgbr, X_train, y_train, cv=kfold, scoring=scorer_MAE)
# results_MAE_SVRlin = cross_val_score(svr_lin, X_train, y_train, cv=kfold, scoring=scorer_MAE)
# results_MAE_SVRrbf = cross_val_score(svr_rbf, X_train, y_train, cv=kfold, scoring=scorer_MAE)
# results_MAE_DTree  = cross_val_score(dtree, X_train, y_train, cv=kfold, scoring=scorer_MAE)

In [None]:
# Disabled along with the MAE cross-validation cell: when enabled, prints each model's
# MAE averaged over the folds.
# print('Mean_MAE_RL : {}'.format(results_MAE_regr.mean()))
# print('Mean_MAE_XGb : {}'.format(results_MAE_XGb.mean()))
# print('Mean_MAE_SVRlin : {}'.format(results_MAE_SVRlin.mean()))
# print('Mean_MAE_SVRrbf : {}'.format(results_MAE_SVRrbf.mean()))
# print('Mean_MAE_DTree : {}'.format(results_MAE_DTree.mean()))

![cross_val_mae](refs/cross_val_mae.png)

##### --- Mean Absolute Percentage Error ---

In [None]:
# NOTE(review): this cell is disabled. Presumably the 20-fold runs were too slow to repeat on
# every execution and the screenshot that follows shows their output - confirm, then either
# re-enable the cell or delete it.
# When enabled, each line cross-validates one model on the training split with `kfold` and
# scores every fold with `scorer_MAPE`.
# results_MAPE_regr = cross_val_score(regr, X_train, y_train, cv=kfold, scoring=scorer_MAPE)
# results_MAPE_XGb = cross_val_score(clf_xgbr, X_train, y_train, cv=kfold, scoring=scorer_MAPE)
# results_MAPE_SVRlin = cross_val_score(svr_lin, X_train, y_train, cv=kfold, scoring=scorer_MAPE)
# results_MAPE_SVRrbf = cross_val_score(svr_rbf, X_train, y_train, cv=kfold, scoring=scorer_MAPE)
# results_MAPE_DTree = cross_val_score(dtree, X_train, y_train, cv=kfold, scoring=scorer_MAPE)

In [None]:
# Disabled along with the MAPE cross-validation cell: when enabled, prints each model's
# MAPE averaged over the folds.
# print('Mean_MAPE_RL : {}'.format(results_MAPE_regr.mean()))
# print('Mean_MAPE_XGb    : {}'.format(results_MAPE_XGb.mean()))
# print('Mean_MAPE_SVRlin : {}'.format(results_MAPE_SVRlin.mean()))
# print('Mean_MAPE_SVRrbf : {}'.format(results_MAPE_SVRrbf.mean()))
# print('Mean_MAPE_DTree : {}'.format(results_MAPE_DTree.mean()))

![cross_val_mape](refs/cross_val_mape.png)