## Cross Validation

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets, linear_model, model_selection
from sklearn.model_selection import train_test_split

### train/test split

In [None]:
# Load the scikit-learn diabetes data and give its ten feature columns short names.
diabetes = datasets.load_diabetes()
columns = ['age', 'sex', 'bmi', 'map', 'tc', 'ldl', 'hdl', 'tch', 'ltg', 'glu']

# Features as a labelled DataFrame; the target is used exactly as the loader returns it.
X = pd.DataFrame(data=diabetes.data, columns=columns)
y = diabetes.target

In [None]:
# Hold out 30% of the rows for testing. The split is seeded (random_state=1, the
# same seed the other seeded splits in this notebook use) so that a
# Restart & Run All reproduces the same partition and the same downstream scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Fit a linear regression on the training split, then predict the held-out rows.
# `fit` returns the estimator itself, so `model` and `lm` are the same fitted object.
lm = linear_model.LinearRegression()
model = lm.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)

In [None]:
# Predicted against true targets for the held-out split.
plt.scatter(x=y_test, y=predictions)
plt.xlabel("True values")
plt.ylabel("Predictions")

In [None]:
# Score of the fitted model on the held-out split; the next cell recomputes
# the same quantity by hand as 1 - u/v.
model.score(X_test,y_test)

In [None]:
# Manual R^2 = 1 - SS_res / SS_tot on the held-out split.
residuals = predictions - y_test
centered = y_test - y_test.mean()
u = sum(residuals**2)  # residual sum of squares
v = sum(centered**2)   # total sum of squares about the test-set mean
1 - u/v

### K-Folds CV and Leave One Out CV (LOOCV)

In [None]:
from sklearn.model_selection import KFold # import KFold

### Simple example -- `KFold` object

In [None]:
# Toy data: four samples with two features each, and one target value per sample.
# NOTE: this rebinds X, y and the train/test names used by the diabetes cells.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

kf = KFold(n_splits=2)     # split into 2 folds
print(kf.get_n_splits(X))  # number of splitting iterations in the cross-validator
print(kf)

# Each iteration yields index arrays for the training part and the test part.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

### `LeaveOneOut` object

In [None]:
from sklearn.model_selection import LeaveOneOut 

In [None]:
# Leave-one-out: every split holds out a single sample as the test set.
loo = LeaveOneOut()
print(loo)
print(loo.get_n_splits(X))

for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

In [None]:
# Restore the diabetes features/target: X and y were rebound to toy arrays by
# the KFold / LeaveOneOut examples above.
X = pd.DataFrame(diabetes.data, columns=columns)
y = diabetes.target

# Seeded 70/30 split (random_state=1, matching the other seeded splits in this
# notebook) so the cross-validation cells below are reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

In [None]:
# 10-fold split of the diabetes DataFrame.
kf = KFold(n_splits=10)
for train_id, test_id in kf.split(X):
    print("TRAIN:", train_id, "TEST:", test_id)
    # kf.split yields *positional* indices, so rows are selected with .iloc.
    # Label-based .loc only gives the same rows while the index happens to be
    # the default 0..n-1 range, and breaks after any filtering or shuffling.
    X_train, X_test = X.iloc[train_id], X.iloc[test_id]
    y_train, y_test = y[train_id], y[test_id]
# After the loop, X_train/X_test/y_train/y_test hold the LAST fold.

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import metrics

In [None]:
# Score the estimator with 5-fold cross-validation on the current training data.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print("Cross-validated scores:", scores)

In [None]:
# Cross-validated predictions for the training rows, plotted against the true targets.
predictions = cross_val_predict(estimator=model, X=X_train, y=y_train, cv=5)
plt.scatter(x=y_train, y=predictions)

In [None]:
# The value stored in `accuracy` is an R^2 score of the cross-validated predictions.
accuracy = metrics.r2_score(y_true=y_train, y_pred=predictions)
print("Cross-Predicted Accuracy:", accuracy)

In [None]:
# Five random splits, each holding out 30% of the rows; seeded for repeatability.
cv = model_selection.ShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Same scoring as before, but using the ShuffleSplit splitter instead of 5 folds.
scores2 = cross_val_score(estimator=model, X=X_train, y=y_train, cv=cv)
scores2

### Note: contrary to other cross-validation strategies, random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets.

## Machine learning models -- LDA and PCA

In [None]:
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Display configuration: 4-digit precision for printed arrays, seaborn defaults for plots.
np.set_printoptions(precision=4)
sns.set()

In [None]:
# Wine data: features as a DataFrame, class labels as a named categorical.
wine = load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)
y = pd.Categorical.from_codes(codes=wine.target, categories=wine.target_names)

# One frame holding the features plus a 'class' column, joined on the row index.
class_column = pd.Series(y, name='class')
df = X.join(class_column)

In [None]:
# Print the description text bundled with the dataset.
print(wine.DESCR)

In [None]:
# Class names; these become the categories of `y` and the columns of class_feature_means.
wine.target_names

In [None]:
# Peek at the first three rows of the features + 'class' frame.
df.head(3)

In [None]:
# Empty frame with one column per wine class; filled with per-class feature means below.
class_feature_means = pd.DataFrame(columns=wine.target_names)

In [None]:
# Per-class mean of every numeric feature: one column per wine class, one row
# per feature.
# numeric_only=True keeps the categorical 'class' column out of the reduction.
# Without it, recent pandas raises a TypeError for the categorical column
# instead of silently dropping it as older releases did.
for c, rows in df.groupby('class'):
    class_feature_means[c] = rows.mean(numeric_only=True)
class_feature_means

### Within Class Scatter Matrix

within_class_scatter_matrix = np.zeros((13,13))

In [None]:
# Within-class scatter matrix: the sum over classes of
#   sum_i (x_i - m_c)(x_i - m_c)^T
# where m_c is the class mean vector stored in class_feature_means[c].
#
# The accumulator is (re)initialised here so the cell is self-contained and
# re-running it does not add the class contributions a second time. Its size
# comes from the data rather than a hard-coded 13.
n_features = df.drop(columns='class').shape[1]
within_class_scatter_matrix = np.zeros((n_features, n_features))

for c, rows in df.groupby('class'):
    # (n_c, n_features) block of feature values for this class
    features = rows.drop(columns='class').to_numpy(dtype=float)
    # Subtract the class mean from every row (broadcast over rows); this relies
    # on the mean vector being in the same feature order as the columns.
    centered = features - class_feature_means[c].to_numpy(dtype=float)
    # centered.T @ centered equals the sum of the per-row outer products.
    within_class_scatter_matrix += centered.T @ centered


### Between Class Scatter Matrix

In [None]:
# Overall mean of each numeric feature across all classes.
# numeric_only=True excludes the categorical 'class' column, which recent
# pandas refuses to average (TypeError) instead of silently dropping.
feature_means = df.mean(numeric_only=True)

# Square accumulator sized from the number of features rather than a literal 13.
between_class_scatter_matrix = np.zeros((len(feature_means), len(feature_means)))

In [None]:
# Between-class scatter matrix: sum over classes of n_c (m_c - m)(m_c - m)^T,
# with m_c the class mean, m the overall mean, and n_c the class size.

# Overall mean as a column vector; it is the same for every class, so it is
# built once outside the loop.
m = feature_means.values.reshape(-1, 1)

# Start from zero inside this cell so re-running it does not double-count.
between_class_scatter_matrix = np.zeros((m.shape[0], m.shape[0]))

for c in class_feature_means:
    n = int((df['class'] == c).sum())  # number of rows in class c
    mc = class_feature_means[c].values.reshape(-1, 1)
    between_class_scatter_matrix += n * (mc - m).dot((mc - m).T)

### Sort the eigenvalues of $W^{-1}B$

In [None]:
# Eigen-decomposition of W^{-1} B (within-class inverse times between-class scatter).
inverse_within = np.linalg.inv(within_class_scatter_matrix)
eigen_values, eigen_vectors = np.linalg.eig(inverse_within @ between_class_scatter_matrix)

In [None]:
# Pair each eigenvalue magnitude with its eigenvector (a column of eigen_vectors,
# i.e. a row of the transpose) and order the pairs from largest to smallest magnitude.
pairs = sorted(
    zip(np.abs(eigen_values), eigen_vectors.T),
    key=lambda pair: pair[0],
    reverse=True,
)
for magnitude, vector in pairs:
    print(magnitude)

In [None]:
# Share of each eigenvalue in the total.
# NOTE(review): the numerators are eigenvalue magnitudes (np.abs) while the
# denominator sums the raw eigenvalues, so the shares need not add up to
# exactly 1 if any eigenvalue is negative or complex.
eigen_value_sums = sum(eigen_values)
print('Explained Variance')
for i, pair in enumerate(pairs):
    share = (pair[0] / eigen_value_sums).real
    print('Eigenvector {}: {}'.format(i, share))

### Projection matrix consisting of the first two eigenvectors

In [None]:
# Projection matrix: the two leading eigenvectors side by side as columns (real part only).
leading_vectors = (pairs[0][1], pairs[1][1])
w_matrix = np.column_stack(leading_vectors).real

In [None]:
# Project the feature frame onto the two discriminant directions and keep a plain array.
X_lda = X.dot(w_matrix).to_numpy()
X_lda

In [None]:
# Encode the class names as integer codes (used below to colour the scatter plots).
le = LabelEncoder()
le.fit(df['class'])
y = le.transform(df['class'])

In [None]:
# Samples in the plane of the two hand-computed discriminants, coloured by class code.
plt.scatter(X_lda[:, 0], X_lda[:, 1],
            c=y, cmap='rainbow', alpha=0.7, edgecolors='b')
plt.xlabel('LD1')
plt.ylabel('LD2')

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Same projection via scikit-learn. This rebinds X_lda, replacing the
# hand-computed projection from the cells above.
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit(X, y).transform(X)
lda.explained_variance_ratio_

In [None]:
# scikit-learn LDA projection, coloured by class code.
plt.scatter(X_lda[:, 0], X_lda[:, 1],
            c=y, cmap='rainbow', alpha=0.7, edgecolors='b')
plt.xlabel('LD1')
plt.ylabel('LD2')

In [None]:
from sklearn.decomposition import PCA
# Project the wine features onto their first two principal components.
# NOTE(review): y is passed here, but PCA is unsupervised; per the scikit-learn
# docs the argument is ignored — confirm and consider dropping it.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X, y)

In [None]:
# Explained-variance ratios reported by the fitted PCA for its two components.
pca.explained_variance_ratio_

In [None]:
# PCA projection, coloured by class code, for comparison with the LDA plots.
plt.scatter(X_pca[:, 0], X_pca[:, 1],
            c=y, cmap='rainbow', alpha=0.7, edgecolors='b')
plt.xlabel('PC1')
plt.ylabel('PC2')

### Classification using DTC and LDA

In [None]:
# Seeded 70/30 split of the LDA-projected features.
# NOTE(review): X_lda was fitted on the full dataset before this split, so the
# test rows already influenced the projection.
X_train, X_test, y_train, y_test = train_test_split(
    X_lda, y, test_size=0.3, random_state=1)

In [None]:
# Decision tree on the two LDA components; confusion matrix on the held-out rows.
dt = DecisionTreeClassifier()
y_pred = dt.fit(X_train, y_train).predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

## Pipelines

### Sequentially apply a list of transforms and a final estimator. Intermediate steps of pipeline must implement fit and transform methods and the final estimator only needs to implement fit.

### Using Analytics Vidhya's loan prediction dataset

In [246]:
# Read the loan data from the working directory and preview the raw training rows.
train = pd.read_csv('train.csv')
print(train.head(5))
test = pd.read_csv('test.csv')

# Loan_ID is removed from the training frame before modelling.
train = train.drop(columns='Loan_ID')
train.dtypes

    Loan_ID Gender Married Dependents     Education Self_Employed  ApplicantIncome  \
0  LP001002   Male      No          0      Graduate            No             5849   
1  LP001003   Male     Yes          1      Graduate            No             4583   
2  LP001005   Male     Yes          0      Graduate           Yes             3000   
3  LP001006   Male     Yes          0  Not Graduate            No             2583   
4  LP001008   Male      No          0      Graduate            No             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History Property_Area Loan_Status  
0                0.0         NaN             360.0             1.0         Urban           Y  
1             1508.0       128.0             360.0             1.0         Rural           N  
2                0.0        66.0             360.0             1.0         Urban           Y  
3             2358.0       120.0             360.0             1.0         Urban           Y  
4       

In [237]:
# Target is the Loan_Status column; every other column is a feature.
y = train['Loan_Status']
X = train.drop(columns='Loan_Status')

In [238]:
# 80/20 split of the loan data. Seeded (random_state=1, like the other seeded
# splits in this notebook) so the model scores below are reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

In [239]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### The first step in building the pipeline is to define each transformer type. The convention here is generally to create transformers for the different variable types.

In [240]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through transform.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [249]:
# Column names grouped by dtype, taken from the training frame.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns

# Object-typed columns, minus the Loan_Status target.
object_columns = train.select_dtypes(include=['object']).columns
categorical_features = object_columns.drop('Loan_Status')

In [251]:
from sklearn.compose import ColumnTransformer

In [252]:
# Route each group of columns through its matching transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [253]:
from sklearn.ensemble import RandomForestClassifier

  return f(*args, **kwds)


In [254]:
# Full pipeline: preprocessing followed by a random-forest classifier.
rf_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
]
rf = Pipeline(steps=rf_steps)

In [255]:
# Fit the preprocessing + random-forest pipeline on the training split.
rf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [256]:
# Predict Loan_Status for the held-out rows with the fitted pipeline.
y_pred = rf.predict(X_test)

## Model selection

In [258]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Candidate classifiers, each evaluated behind the same preprocessing step.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

# Fit each candidate on the training split and report its score on the held-out split.
# Every pipeline is built around the same `preprocessor` object, which is therefore
# refitted on each pass.
for classifier in classifiers:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    pipe.fit(X_train, y_train)
    print(classifier)
    test_score = pipe.score(X_test, y_test)
    print("model score: %.3f" % test_score)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
model score: 0.764
SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.691
NuSVC(cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=None,
      shrinking=True, tol=0.001, verbose=False)
model score: 0.797
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_sa



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
model score: 0.797
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
model score: 0.764


## Parameter tuning

In [259]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'classifier' step of the `rf` pipeline; the
# `classifier__` prefix routes each key to that step.
# 'auto' is intentionally not listed under max_features: for a random-forest
# classifier it was just an alias of 'sqrt' (so it only duplicated work), and
# current scikit-learn releases reject it with an invalid-parameter error.
param_grid = {
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
    'classifier__criterion' :['gini', 'entropy']}

# Exhaustive search over the grid with GridSearchCV's default cross-validation,
# run in a single process (n_jobs=1).
CV = GridSearchCV(rf, param_grid, n_jobs= 1)

CV.fit(X_train, y_train)
print(CV.best_params_)
print(CV.best_score_)



{'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 200}
0.8126272912423625
