# Linear Regression

In [None]:
#import

from sklearn import datasets;
import pandas as pd;
import numpy as np;

In [None]:
# Load the scikit-learn diabetes dataset; later cells read the feature
# matrix from `.data` and the response values from `.target`.
diabetes_data = datasets.load_diabetes()

In [None]:
# Inspect the container type of the feature matrix.
type(diabetes_data.data)

In [None]:
# Inspect the container type of the target values.
type(diabetes_data.target)

In [None]:
# Wrap the raw feature array in a DataFrame; no column names are supplied,
# so pandas assigns its default labels.
df_diabetes = pd.DataFrame(data=diabetes_data.data)
# Show the column labels as a plain list.
[label for label in df_diabetes.columns.values]

In [None]:
# Append the response values as a trailing 'target' column, then preview
# the first five rows.
df_diabetes = df_diabetes.assign(target=diabetes_data.target)
df_diabetes.head(n=5)

In [None]:
# (rows, columns) of the combined features + target frame.
df_diabetes.shape

In [None]:
# Display the frame through the notebook's rich (HTML) representation
# rather than print(), which falls back to a plain-text dump.
df_diabetes

In [None]:
from sklearn import linear_model;

In [None]:
# Convert the frame to a NumPy array. DataFrame.as_matrix() was removed in
# pandas 1.0; to_numpy() is its replacement.
diabetes_mat = df_diabetes.to_numpy()

# Split the feature matrix and target variable: the last column is the
# target, everything before it is a feature. The `-1:` slice keeps the
# target two-dimensional (one column) rather than flattening it.
diabetes_X = diabetes_mat[:, :-1]
diabetes_y = diabetes_mat[:, -1:]

# Hold out the final N_TEST rows for testing; train on the rows before them.
N_TEST = 50
diabetes_X_train, diabetes_X_test = diabetes_X[:-N_TEST], diabetes_X[-N_TEST:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-N_TEST], diabetes_y[-N_TEST:]

In [None]:
from sklearn import linear_model
from sklearn.metrics import *
import matplotlib.pyplot as plt
#Linear Regression Example
#This example fits ordinary least squares on all features of the diabetes dataset. Linear regression chooses the coefficients that best minimize the residual sum of squares between the observed responses in the dataset and the responses predicted by the linear approximation. Because the model uses several features, the fit is visualized below as predicted versus observed responses rather than as a single straight line through a two-dimensional scatter.

#The coefficients, the residual sum of squares and the variance score are also calculated.
#mean_squared_error, r2_score

In [None]:
# Ordinary least-squares estimator.
regr = linear_model.LinearRegression()

# Fit on the training rows. fit() hands back the estimator it was called on,
# so the "model" name is bound to that same fitted object.
diabetes_regr_model = regr.fit(X=diabetes_X_train, y=diabetes_y_train)

# Predict the targets of the held-out rows.
diabetes_y_pred = diabetes_regr_model.predict(X=diabetes_X_test)



In [None]:
# Score the held-out predictions first, then report everything together.
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)

# Fitted coefficients of the linear model.
print('Coefficients: \n', regr.coef_)
# Mean squared error on the test rows.
print("Mean squared error: %.2f" % test_mse)
# r2_score: 1 is perfect prediction.
print('Variance score: %.2f' % test_r2)


In [None]:
# Plot predicted against observed targets for the held-out rows.
fig1 = plt.figure()
plt.scatter(diabetes_y_test, diabetes_y_pred, color='black')

# Identity line: points on it are perfect predictions. The colour is given
# exactly once; combining a 'k--' format string with color='blue' defines the
# colour twice and makes matplotlib emit a warning.
y_lo, y_hi = diabetes_y_test.min(), diabetes_y_test.max()
plt.plot([y_lo, y_hi], [y_lo, y_hi], linestyle='--', lw=3, color='blue')

# Hide tick marks on both axes.
plt.xticks(())
plt.yticks(())
plt.xlabel("Observed")
plt.ylabel("Predicted")

# Save before show() so the file is written while the figure is certainly
# still populated, then display and release it.
fig1.savefig('regression_ind_test_diabetes.png')
plt.show()
plt.close(fig1)

# Cross Validation

### cross_val_predict returns an array of the same size as `y` where each entry
### is a prediction obtained by cross validation:
```
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn import metrics

# Both helpers get the same estimator, features, targets and fold count:
# cross_val_score yields one score per fold, cross_val_predict yields one
# out-of-fold prediction per sample.
cv_inputs = (regr, diabetes_data.data, diabetes_data.target)
scores = cross_val_score(*cv_inputs, cv=10)
diabetes_y_cv_pred = cross_val_predict(*cv_inputs, cv=10)
scores
```

```
# Observed targets to compare against the cross-validated predictions.
diabetes_y_cv_test = diabetes_data.target
import matplotlib.pyplot as plt

fig2 = plt.figure()
plt.scatter(diabetes_y_cv_test, diabetes_y_cv_pred, color='black')

# Identity line: points on it are perfect predictions. The colour is given
# exactly once; combining a 'k--' format string with color='blue' defines the
# colour twice and makes matplotlib emit a warning.
cv_lo, cv_hi = diabetes_y_cv_test.min(), diabetes_y_cv_test.max()
plt.plot([cv_lo, cv_hi], [cv_lo, cv_hi], linestyle='--', lw=3, color='blue')

# Hide tick marks on both axes.
plt.xticks(())
plt.yticks(())
plt.xlabel("Observed")
plt.ylabel("Predicted")

# Save before show() so the file is written while the figure is certainly
# still populated, then display and release it.
fig2.savefig('regression_cv_diabetes.png')
plt.show()
plt.close(fig2)
```

# Classification

In [None]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import itertools

In [None]:
# Fixed seed so that the synthetic dataset and the train/test split -- and
# therefore every score and curve computed from them -- are reproducible
# across kernel restarts.
SEED = 42
X, y = make_classification(n_samples=5000, n_features=500, random_state=SEED)
# Hold out 25% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED)

# Support Vector Classification

In [None]:
# Linear SVM with a deliberately small C (i.e. too much regularization) so
# that the effect on the results is visible.
clf_svc = SVC(C=0.01, kernel='linear')
# Fit on the training split, then label the held-out split.
clf_svc.fit(X_train, y_train)
y_pred_svc = clf_svc.predict(X_test)

In [None]:
# Tabulate true vs. predicted labels for the SVC on the held-out split,
# with NumPy's printed precision limited to two decimals.
from sklearn.metrics import confusion_matrix
np.set_printoptions(precision=2)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_svc)
cm

In [None]:
# Draw the confusion matrix as a heat map with each cell's count printed on it.
class_names = ["0", "1"]
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion matrix")
plt.colorbar()

# One tick per class; labels are kept upright (rotation=180 would draw them
# upside down).
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=0)
plt.yticks(tick_marks, class_names)

# White text on cells above half the maximum count, black text otherwise,
# so the numbers stay legible against the colour map.
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
# tight_layout() runs after the axis labels are set so that it can make
# room for them.
plt.tight_layout()
plt.show()

# Multiple Classifiers

In [None]:
# Compare several classifiers on the binary classification data.
# The L1 penalty needs a solver that supports it: scikit-learn's default
# solver (lbfgs) rejects penalty='l1', so liblinear is requested explicitly.

classifiers = {'L1 logistic': LogisticRegression(C=1.0, penalty='l1',
                                                 solver='liblinear'),
               'RDF': RandomForestClassifier(max_depth=3, n_estimators=100),
               'GBC': GradientBoostingClassifier(n_estimators=100)
               }

n_classifiers = len(classifiers)

# Fit on the training split and score on the held-out split; scoring on the
# samples a model was fitted on would overstate its F1.
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print("F1 Score for %s : %f " % (name, f1))

    

In [None]:
# Fit the three models whose ROC curves are compared below. They are trained
# on the training split only: fitting on all of X would include the X_test
# rows that the ROC curves are evaluated on.
gbc = GradientBoostingClassifier(n_estimators=100)
gbc_model = gbc.fit(X_train, y_train)
rdf = RandomForestClassifier(n_estimators=100, max_depth=3)
rdf_model = rdf.fit(X_train, y_train)
# penalty='l1' requires a solver that supports it; the default lbfgs does not.
logit = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
logit_model = logit.fit(X_train, y_train)

In [None]:
def _roc_for(model):
    """Return (scores, fpr, tpr) for `model` on the held-out split.

    `scores` is the second column of predict_proba(X_test); `fpr` and `tpr`
    are the false/true positive rates that roc_curve derives from those
    scores and y_test. The thresholds returned by roc_curve are discarded.
    """
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    return scores, fpr, tpr


# The logistic regression model by itself
y_pred_logit, fpr_logit, tpr_logit = _roc_for(logit_model)

# The gradient boosted model by itself
y_pred_gbc, fpr_gbc, tpr_gbc = _roc_for(gbc_model)

# The random forest model by itself
y_pred_rf, fpr_rf, tpr_rf = _roc_for(rdf_model)

In [None]:
# ROC curves of the three models over the full [0, 1] range.
fig3 = plt.figure()
roc_ax = fig3.gca()
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
roc_ax.plot([0, 1], [0, 1], 'k--')
for fpr, tpr, model_label in ((fpr_logit, tpr_logit, 'Logit'),
                              (fpr_gbc, tpr_gbc, 'GBC'),
                              (fpr_rf, tpr_rf, 'RDF')):
    roc_ax.plot(fpr, tpr, label=model_label)
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc='best')
plt.show()

In [None]:
# Same ROC curves, with the view restricted to the top-left corner.
fig4 = plt.figure()
zoom_ax = fig4.gca()
zoom_ax.set_xlim(0, 0.4)
zoom_ax.set_ylim(0.6, 1)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
zoom_ax.plot([0, 1], [0, 1], 'k--')
for fpr, tpr, model_label in ((fpr_logit, tpr_logit, 'Logit'),
                              (fpr_gbc, tpr_gbc, 'GBC'),
                              (fpr_rf, tpr_rf, 'RDF')):
    zoom_ax.plot(fpr, tpr, label=model_label)
zoom_ax.set_xlabel('False positive rate')
zoom_ax.set_ylabel('True positive rate')
zoom_ax.set_title('ROC curve')
zoom_ax.legend(loc='best')
plt.show()


# Clustering

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Though the following import is not directly being used, it is required
# for 3D projection to work
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn import datasets

# Seed NumPy's global RNG before generating the data and fitting KMeans;
# neither call below is given its own random_state.
np.random.seed(5)
X, y = make_classification(n_samples=2000, n_features=5, n_classes=3,
                           n_informative=3, n_clusters_per_class=1)

estimators = [('k_means_3', KMeans(n_clusters=3))]
titles = ['3 clusters']

# Feature columns drawn on the x, y and z axes respectively; the axis labels
# below are derived from these so they always match what is plotted.
x_col, y_col, z_col = 3, 0, 2

for fignum, ((name, est), title) in enumerate(zip(estimators, titles), start=1):
    fig = plt.figure(fignum, figsize=(4, 3))
    # Create the 3-D axes through the figure: constructing Axes3D(fig, ...)
    # directly no longer attaches the axes to the figure in current
    # matplotlib, which leaves the plot blank.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
    est.fit(X)
    labels = est.labels_

    # Colour each point by its assigned cluster. The builtin float replaces
    # the np.float alias, which was removed from NumPy.
    ax.scatter(X[:, x_col], X[:, y_col], X[:, z_col],
               c=labels.astype(float), edgecolor='k')

    # xaxis/yaxis/zaxis replace the deprecated w_xaxis/w_yaxis/w_zaxis aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('feature %d' % x_col)
    ax.set_ylabel('feature %d' % y_col)
    ax.set_zlabel('feature %d' % z_col)
    ax.set_title(title)
    # Replaces the deprecated `ax.dist = 12`; zoom < 1 pulls the camera back.
    # NOTE(review): 10/12 is intended to approximate dist=12 -- confirm visually.
    ax.set_box_aspect(None, zoom=10 / 12)

plt.show()