In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, MinMaxScaler
from sklearn.metrics import confusion_matrix,explained_variance_score,r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, validation_curve

In [2]:
diabetis_data = load_diabetes()
X = diabetis_data.data
y = diabetis_data.target

In [3]:
y = y.reshape(len(y),1)

In [4]:
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118


In [5]:
pd.DataFrame(X).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
0    442 non-null float64
1    442 non-null float64
2    442 non-null float64
3    442 non-null float64
4    442 non-null float64
5    442 non-null float64
6    442 non-null float64
7    442 non-null float64
8    442 non-null float64
9    442 non-null float64
dtypes: float64(10)
memory usage: 34.6 KB


In [6]:
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


In [62]:
def analyseData(regressor,X,y):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1, random_state=10)
    
    #Min Max scale data
    scalar = MinMaxScaler(feature_range=(0,1), copy=False)
    scalar.fit(X_train)
    scalar.fit(X_test)
    scalar.fit(y_test)
    scalar.fit(y_train)
    regressor.fit(X_train,y_train)
    
    #Predicting and Plotting
    y_pred = regressor.predict(X_train)
    plt.figure()
    plt.plot(y_train,y_pred,'ro')
    plt.plot(y_train,y_train)
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    plt.title('Comparing ACtual and Predicted Outputs')
    plt.show()
    
    #Printing Scores and Accuracy
    print("\n\nRegression : ",regressor)
    print("Accuracy by mean_squared_error : ",mean_squared_error(y_train,y_pred))
    print("Accuracy by variance_eror :",explained_variance_score(y_train,y_pred))
    print("Accuracy by r2 Score : ",r2_score(y_train,y_pred))
    

In [69]:
def plot_feature_importances(feature_importance,title,feature_names):
    # NOrmalize importace Values
    feature_importance = 100*((feature_importance)/max(feature_importance))
    # Sort the index values and flip them so that they are arranged in decreasing order of importance
    index_sorted = np.flipud(np.argsort(feature_importance))
    
    # Center the location of the labels on the X-axis (for displaypurposes only)
    pos = np.arange(index_sorted.shape[0]) + 0.5
    
    #plot Bar Graph
    plt.figure()
    plt.bar(pos,feature_importance[index_sorted], align='center')
    plt.xticks(pos,feature_names[index_sorted])
    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()

In [63]:
linear_reg = LinearRegression()
analyseData(linear_reg,X,y)

<IPython.core.display.Javascript object>



Regression :  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
Accuracy by mean_squared_error :  2941.220761106941
Accuracy by variance_eror : 0.5024545125933497
Accuracy by r2 Score :  0.5024545125933497


In [32]:
ridge_reg = Ridge(alpha=0.2)
analyseData(ridge_reg,X,y)

<IPython.core.display.Javascript object>



Regression :  Ridge(alpha=0.2, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Accuracy by mean_squared_error :  3026.667734128127
Accuracy by variance_eror : 0.409793294691321


In [72]:
des_tree = DecisionTreeRegressor(max_depth=4)
analyseData(des_tree,X,y)
plot_feature_importances(des_tree.feature_importances_,'Descision Tree',diabetis_data.feature_names)

<IPython.core.display.Javascript object>



Regression :  DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
Accuracy by mean_squared_error :  2507.858788464921
Accuracy by variance_eror : 0.5757632885794586
Accuracy by r2 Score :  0.5757632885794586


<IPython.core.display.Javascript object>

TypeError: only integer scalar arrays can be converted to a scalar index

In [65]:
ran_forest = RandomForestRegressor(max_depth=4,max_features=10)
analyseData(ran_forest,X,y)

  # Remove the CWD from sys.path while we load stuff.


<IPython.core.display.Javascript object>



Regression :  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
Accuracy by mean_squared_error :  2332.2656305185515
Accuracy by variance_eror : 0.6056630041375487
Accuracy by r2 Score :  0.6054671396167466


In [66]:
ada_boost= AdaBoostRegressor(des_tree)
analyseData(ada_boost,X,y)

  y = column_or_1d(y, warn=True)


<IPython.core.display.Javascript object>



Regression :  AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=50,
         random_state=None)
Accuracy by mean_squared_error :  1792.4419662693315
Accuracy by variance_eror : 0.7019875597455194
Accuracy by r2 Score :  0.6967852860456594


In [67]:
grad_boost = GradientBoostingRegressor(alpha=0.4,learning_rate=0.01)
analyseData(grad_boost,X,y)

  y = column_or_1d(y, warn=True)


<IPython.core.display.Javascript object>



Regression :  GradientBoostingRegressor(alpha=0.4, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Accuracy by mean_squared_error :  3068.092624632469
Accuracy by variance_eror : 0.4809924979051895
Accuracy by r2 Score :  0.4809924979051896


In [68]:
extra_tree = ExtraTreeRegressor()
analyseData(extra_tree,X,y)

<IPython.core.display.Javascript object>



Regression :  ExtraTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
          max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, random_state=None,
          splitter='random')
Accuracy by mean_squared_error :  0.0
Accuracy by variance_eror : 1.0
Accuracy by r2 Score :  1.0
