## ESPERANTO - Machine Learning Project

TEAM: Karolina Wojciechowska, Justyna Krygier, Karol Mularski, Łukasz Rosenkiewicz

DATASET: https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

![uci](refs/UCI.jpg)

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn import svm
import sys
import warnings

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns',10)
dataset = pd.read_csv('jdsz2-esperanto/projekt_ml/Concrete_Data.csv',quotechar='"', decimal=',', skipinitialspace=True)

# Dataset Features

![summary](refs/summary.png)

In [50]:
columns_to_model = ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
                    'Coarse Aggregate', 'Fine Aggregate', 'Age']
X = dataset[columns_to_model]
y = dataset['Concrete compressive strength']

In [51]:
for i in columns_to_model:
    fig = plt.figure(figsize=plt.figaspect(0.1))
    plt.scatter(y, X[i], cmap=plt.cm.Set1, edgecolor='c', s=4, label=i)
    plt.title(i)
    plt.xlabel('Concrete compressive strength')
    plt.show()

# Split the dataset - 75 % - 25%

In [52]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=101)

## Create cross validations parameters

In [53]:
my_imputer = Imputer()
train_X = my_imputer.fit_transform(X_train)
test_X = my_imputer.transform(X_test)

kfold = KFold(n_splits=20, random_state=11)

scorer_MAE = make_scorer(mean_absolute_error)
scorer_MSE = make_scorer(mean_squared_error)


###  1 - Linear Regression

In [55]:
regr = linear_model.LinearRegression(normalize=True)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print('----------- Prediction Results -----------')
print('Intercept:', regr.intercept_)
print('Coefficient:', regr.coef_)
print('R^2:', metrics.r2_score(y_test, y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Median Absolute Error:', metrics.median_absolute_error(y_test, y_pred))

print()
print('----------- Cross Validation -----------')
results_MAE = cross_val_score(regr, X_train, y_train, cv=kfold, scoring=scorer_MAE)
results_MSE = cross_val_score(regr, X_train, y_train, cv=kfold, scoring=scorer_MSE)

print('Mean_MAE:  {}'.format((results_MAE.mean())))
print('Mean_MSE:  {}'.format((results_MSE.mean())))

----------- Prediction Results -----------
Intercept: 0.6134260453019635
Coefficient: [ 0.11007688  0.09264964  0.07763273 -0.17116918  0.25388462  0.00775027
  0.01209442  0.12341469]
R^2: 0.6101732771570201
Mean Absolute Error: 8.655951386743029
Mean Squared Error: 123.62476413932495
Median Absolute Error: 6.489059453445069

----------- Cross Validation -----------
Mean_MAE:  8.16944236638988
Mean_MSE:  105.84415451783704


###  2 - DecisionTreeRegressor

In [56]:
for max_features in [1,2,3,4,5,6,7,8]:
    dtree = DecisionTreeRegressor(max_features=max_features, random_state=None, max_depth=None,
                                  min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0)
    dtree.fit(X_train, y_train)
    y_pred_tree = dtree.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred_tree)
    r2 = metrics.r2_score(y_test, y_pred_tree)
    print('For max features: {}    Mean Absolute Error:  {}    R^2:  {}'.format(max_features, np.around(mae,decimals=4), r2))

For max features: 1    Mean Absolute Error:  5.9309    R^2:  0.7310543598910371
For max features: 2    Mean Absolute Error:  3.994    R^2:  0.8646517920752604
For max features: 3    Mean Absolute Error:  4.5364    R^2:  0.8519554010611224
For max features: 4    Mean Absolute Error:  4.3782    R^2:  0.8425939148005963
For max features: 5    Mean Absolute Error:  4.7319    R^2:  0.8235968772020658
For max features: 6    Mean Absolute Error:  4.4114    R^2:  0.8508883222769608
For max features: 7    Mean Absolute Error:  4.4832    R^2:  0.8284219953076548
For max features: 8    Mean Absolute Error:  4.4599    R^2:  0.8369044974928705


#### Cross Validation

In [61]:
results_MAE = cross_val_score(dtree, X_train, y_train, cv=kfold, scoring=scorer_MAE)
results_MSE = cross_val_score(dtree, X_train, y_train, cv=kfold, scoring=scorer_MSE)

print('Mean_MAE: {}'.format(results_MAE.mean()))
print('Mean_MSE: {}'.format(results_MSE.mean()))

Mean_MAE: 4.311966767881242
Mean_MSE: 41.254927305161935


### 3 - XGBoost

In [63]:
clf_xgbr = XGBRegressor()
clf_xgbr.fit(X_train, y_train, verbose=False)

y_pred_xgb = clf_xgbr.predict(X_test)
print('Mean Absolute Error: ' + str(mean_absolute_error(y_pred_xgb, y_test)))

Mean Absolute Error: 3.9290020516861315


#### Cross Validation

In [64]:
results_MAE = cross_val_score(clf_xgbr, X_train, y_train, cv=kfold, scoring=scorer_MAE)
results_MSE = cross_val_score(clf_xgbr, X_train, y_train, cv=kfold, scoring=scorer_MSE)

print('Mean_MAE:  {}'.format((results_MAE.mean())))
print('Mean_MSE:  {}'.format((results_MSE.mean())))

Mean_MAE:  3.820051446937838
Mean_MSE:  26.640470626183504


### 4 - SVR

In [66]:
svr_lin = svm.SVR(kernel='linear')
svr_lin.fit(X_train, y_train)
y_lin = svr_lin.predict(X_test)

print('R^2:', metrics.r2_score(y_test, y_lin))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_lin))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_lin))
print('Median Absolute Error:', metrics.median_absolute_error(y_test, y_lin))

R^2: 0.5161990265446725
Mean Squared Error: 153.42658091164702
Mean Absolute Error: 8.689916245057379
Median Absolute Error: 5.960881981768614
