<a href="https://colab.research.google.com/github/kareem1925/Classical-and-quantum-regression-analysis-for-the-optoelectronic-performance-of-NTCDA/blob/master/TPOT%20regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install tpot==0.11.5 xgboost==0.90 -q

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score
import warnings
# Notebook-wide runtime settings.
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Report all floating-point anomalies as warnings.
np.seterr(divide='warn', over='warn', under='warn', invalid='warn')
# Fix NumPy's global random stream.
np.random.seed(seed=0)

In [0]:
# --- Load the data -----------------------------------------------------------
# Column 'I' is the regression target; every other column is an input feature.
DATA_URL = 'https://raw.githubusercontent.com/kareem1925/Classical-and-quantum-regression-analysis-for-the-optoelectronic-performance-of-NTCDA/master/data.csv'
Data = pd.read_csv(DATA_URL)
features = Data.drop(columns='I').values

# --- Hold out 15% of the rows for testing (shuffled, fixed seed) -------------
X_train, X_test, y_train, y_test = train_test_split(
    features,
    Data['I'].values,
    test_size=0.15,
    shuffle=True,
    random_state=0,
)

# --- Pipeline exported by TPOT -----------------------------------------------
# Average CV score on the training set was:-5.3755355985260195e-11
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=1.0,
    max_depth=6,
    min_child_weight=19,
    n_estimators=100,
    nthread=1,
    subsample=0.6500000000000001,
)
# The union passes the features through a copy transformer alongside the
# output of the stacked XGBoost step.
stacked_features = make_union(
    FunctionTransformer(copy),
    StackingEstimator(estimator=xgb_regressor),
)
knn_regressor = KNeighborsRegressor(
    algorithm="brute",
    metric="minkowski",
    n_neighbors=2,
    p=5,
    weights="distance",
)
exported_pipeline = make_pipeline(stacked_features, knn_regressor)

# --- Fit on the training split and predict on both splits --------------------
exported_pipeline.fit(X_train, y_train)
result = exported_pipeline.predict(X_train)
result_T = exported_pipeline.predict(X_test)

In [0]:
# Coefficient of determination of the fitted pipeline on the held-out test split.
test_r2 = r2(y_test, exported_pipeline.predict(X_test))
print('R^2 score = ', test_r2)

R^2 score =  0.9994390444178382


In [0]:
# Mean squared error of the fitted pipeline on the held-out test split.
test_mse = mse(y_test, exported_pipeline.predict(X_test))
print('mean squared error on testing data = ', test_mse)

mean squared error on testing data =  3.6579438340338025e-11


In [0]:
# Mean absolute error of the fitted pipeline on the held-out test split.
test_mae = mae(y_test, exported_pipeline.predict(X_test))
print('mean absolute error on testing data = ', test_mae)

mean absolute error on testing data =  2.674629296e-06


In [0]:
# 13-fold cross-validation of the pipeline on the training split; each entry
# of `scores` is the negated MSE of one fold.
scores = cross_val_score(
    exported_pipeline,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=13,
)

In [0]:
# The folds were scored with negated MSE, so the magnitude of their mean is
# the average cross-validated MSE.
abs(np.mean(scores))

6.434502069638828e-11

In [0]:
# Search space handed to TPOT: each key is the import path of an operator and
# each value maps that operator's parameters to the candidate values to try.

# Value grids that several operators below have in common.
UNIT_GRID = np.arange(0.0, 1.01, 0.05)       # roughly 0.00, 0.05, ..., 1.00
FRACTION_GRID = np.arange(0.05, 1.01, 0.05)  # roughly 0.05, 0.10, ..., 1.00
LEARNING_RATES = [1e-3, 1e-2, 1e-1, 0.5, 1.]
TOLERANCES = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
TREE_DEPTHS = range(1, 11)
MIN_SPLIT_SIZES = range(2, 21)
MIN_LEAF_SIZES = range(1, 21)

config = {

    # ---------------------------- Regressors ----------------------------
    "sklearn.linear_model.ElasticNetCV": {
        "l1_ratio": UNIT_GRID,
        "tol": TOLERANCES,
    },

    "sklearn.ensemble.ExtraTreesRegressor": {
        "n_estimators": [100],
        "max_features": FRACTION_GRID,
        "min_samples_split": MIN_SPLIT_SIZES,
        "min_samples_leaf": MIN_LEAF_SIZES,
        "bootstrap": [True, False],
    },

    "sklearn.ensemble.GradientBoostingRegressor": {
        "n_estimators": [100],
        "loss": ["ls", "lad", "huber", "quantile"],
        "learning_rate": LEARNING_RATES,
        "max_depth": TREE_DEPTHS,
        "min_samples_split": MIN_SPLIT_SIZES,
        "min_samples_leaf": MIN_LEAF_SIZES,
        "subsample": FRACTION_GRID,
        "max_features": FRACTION_GRID,
        "alpha": [0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
    },

    "sklearn.ensemble.AdaBoostRegressor": {
        "n_estimators": [100],
        "learning_rate": LEARNING_RATES,
        "loss": ["linear", "square", "exponential"],
    },

    "sklearn.tree.DecisionTreeRegressor": {
        "max_depth": TREE_DEPTHS,
        "min_samples_split": MIN_SPLIT_SIZES,
        "min_samples_leaf": MIN_LEAF_SIZES,
    },

    "sklearn.neighbors.KNeighborsRegressor": {
        "n_neighbors": range(1, 10),
        "weights": ["uniform", "distance"],
        "p": [1, 2, 3, 4, 5],
        "algorithm": ["kd_tree", "ball_tree", "brute"],
        "metric": ["manhattan", "chebyshev", "euclidean", "minkowski"],
    },

    "sklearn.linear_model.LassoLarsCV": {
        "normalize": [True, False],
    },

    "sklearn.svm.LinearSVR": {
        "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
        "dual": [True, False],
        "tol": TOLERANCES,
        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        "epsilon": [1e-4, 1e-3, 1e-2, 1e-1, 1.],
    },

    "sklearn.ensemble.RandomForestRegressor": {
        "n_estimators": [100],
        "max_features": FRACTION_GRID,
        "min_samples_split": MIN_SPLIT_SIZES,
        "min_samples_leaf": MIN_LEAF_SIZES,
        "bootstrap": [True, False],
    },

    "sklearn.linear_model.RidgeCV": {},

    "xgboost.XGBRegressor": {
        "objective": ["reg:squarederror"],
        "n_estimators": [100],
        "max_depth": TREE_DEPTHS,
        "learning_rate": LEARNING_RATES,
        "subsample": FRACTION_GRID,
        "min_child_weight": range(1, 21),
        "nthread": [1],
    },

    # --------------------------- Preprocessors --------------------------
    "sklearn.preprocessing.Binarizer": {
        "threshold": UNIT_GRID,
    },

    "sklearn.cluster.FeatureAgglomeration": {
        "linkage": ["ward", "complete", "average"],
        "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine"],
    },

    "sklearn.preprocessing.MaxAbsScaler": {},

    "sklearn.preprocessing.MinMaxScaler": {},

    "sklearn.preprocessing.Normalizer": {
        "norm": ["l1", "l2", "max"],
    },

    "sklearn.decomposition.PCA": {
        "svd_solver": ["randomized"],
        "iterated_power": range(1, 11),
    },

    "sklearn.preprocessing.PolynomialFeatures": {
        "degree": [2],
        "include_bias": [False],
        "interaction_only": [False],
    },

    "sklearn.preprocessing.RobustScaler": {},

    "sklearn.preprocessing.StandardScaler": {},

    "tpot.builtins.ZeroCount": {},

    "tpot.builtins.OneHotEncoder": {
        "minimum_fraction": [0.05, 0.1, 0.15, 0.2, 0.25],
        "sparse": [False],
        "threshold": [10],
    },

    # ----------------------------- Selectors ----------------------------
    "sklearn.feature_selection.SelectFwe": {
        "alpha": np.arange(0, 0.05, 0.001),
        "score_func": {
            "sklearn.feature_selection.f_regression": None,
        },
    },

    "sklearn.feature_selection.SelectPercentile": {
        "percentile": range(1, 100),
        "score_func": {
            "sklearn.feature_selection.f_regression": None,
        },
    },

    "sklearn.feature_selection.VarianceThreshold": {
        "threshold": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    },

    "sklearn.feature_selection.SelectFromModel": {
        "threshold": UNIT_GRID,
        "estimator": {
            "sklearn.ensemble.ExtraTreesRegressor": {
                "n_estimators": [100],
                "max_features": FRACTION_GRID,
            },
        },
    },

}

In [0]:
from tpot import TPOTRegressor

In [0]:
# Evolutionary pipeline search restricted to the operators in `config`.
# 50 generations of 100 individuals, scored by 13-fold negated MSE on the
# training split.
tpot = TPOTRegressor(
    generations=50,
    population_size=100,
    cv=13,
    config_dict=config,
    scoring='neg_mean_squared_error',
    crossover_rate=.15,
    mutation_rate=.85,
    n_jobs=20,
    verbosity=2,
    warm_start=False,
    # Seed the search itself: TPOT documents `random_state` as the way to get
    # the same result on repeated runs, and the notebook otherwise only seeds
    # NumPy's global generator.
    random_state=0,
)
tpot.fit(X_train, y_train)

# Score of the best pipeline found, evaluated on the held-out test split with
# the same scoring function used during the search.
print(tpot.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=5100.0, style=ProgressStyle(d…


Generation 1 - Current best internal CV score: -5.576363431328948e-11
Generation 2 - Current best internal CV score: -5.576363431328948e-11
Generation 3 - Current best internal CV score: -5.576363431328948e-11
Generation 4 - Current best internal CV score: -5.576363431328948e-11
Generation 5 - Current best internal CV score: -5.576363431328948e-11
Generation 6 - Current best internal CV score: -5.576363431328948e-11
Generation 7 - Current best internal CV score: -5.3627948189870196e-11
Generation 8 - Current best internal CV score: -5.24033230531384e-11
Generation 9 - Current best internal CV score: -5.24033230531384e-11
Generation 10 - Current best internal CV score: -5.24033230531384e-11
Generation 11 - Current best internal CV score: -5.24033230531384e-11
Generation 12 - Current best internal CV score: -5.24033230531384e-11
Generation 13 - Current best internal CV score: -5.24033230531384e-11
Generation 14 - Current best internal CV score: -5.24033230531384e-11
Generation 15 - Curr

In [0]:
# Write the best pipeline found by the search to a standalone Python script.
export_path = 'tpot_regressor.py'
tpot.export(export_path)