
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Classifier training and performance assessment. </h2>	

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import KFold
np.random.seed(42)

#### Retrieving and preprocessing of the training data

In [3]:
from ipynb.fs.defs.GradProject_NB2 import preprocess_part_one, preprocess_part_two

In [None]:
training_data = preprocess_part_one(None, True, True)
print('-'*50)
train_part, val_part = train_test_split(training_data, test_size=0.1)
k = 10
train, val = preprocess_part_two(train_part, val_part, k)

train_x, train_y = train.drop(columns=['Label']), train['Label']
val_x, val_y = val.drop(columns=['Label']), val['Label']


#### Scaling the data

In [None]:
scaler = MinMaxScaler()
train_x_scaled = scaler.fit_transform(train_x)
val_x_scaled = scaler.transform(val_x)

In [None]:
# Cross-validation
def rmse(actual_y, predicted_y):
    """
    The root mean square error between the prediction and the ground truth
    """
    return np.sqrt(np.sum((actual_y - predicted_y)**2)/len(predicted_y))

def compute_CV_rmse_and_acc(model, X_train, Y_train):
    '''
    Split the training data into 5 subsets.
    For each subset, 
        fit a model holding out that subset
        compute the MSE on that subset (the validation set)
    You should be fitting 5 models total.
    Return the average MSE of these 5 folds.

    Args:
        model: an sklearn model with fit and predict functions 
        X_train (data_frame): Training data
        Y_train (data_frame): Label 

    Return:
        the average validation error and accuracy for the 5 splits.
    '''
    kf = KFold(n_splits=5)
    validation_errors = []
    validation_accuracies = []
    
    for train_idx, valid_idx in kf.split(X_train):
        
        # Split the data
        split_X_train, split_X_valid = np.take(X_train, train_idx, axis=0), np.take(X_train, valid_idx, axis=0)
        split_Y_train, split_Y_valid = np.take(Y_train, train_idx, axis=0), np.take(Y_train, valid_idx, axis=0)
        
        # Fit the model on the training split
        model.fit(split_X_train, split_Y_train)
        
        # Compute the RMSE on the validation split
        preds = model.predict(split_X_valid)
        error = rmse(split_Y_valid, preds)
        acc = accuracy_score(split_Y_valid, preds)
        
        validation_errors.append(error)
        validation_accuracies.append(acc)
        
    return np.mean(validation_errors), np.mean(validation_accuracies)

#### Performing 5-fold cross validation to find optimal preprocessing parameters

In [5]:
import sys, os

# Disable
def blockPrint():
    sys.stdout = open(os.devnull, 'w')

# Restore
def enablePrint():
    sys.stdout = sys.__stdout__

for k in [10, 100, 200, 500]:
    for down_sample in [True, False]:
        for descriptor_limit in [6000, 10000, 30000]:
            #blockPrint()
            training_data = preprocess_part_one(None, True, down_sample)
            print('-'*50)
            train_part, val_part = train_test_split(training_data, test_size=0.1)
            train, val = preprocess_part_two(train_part, val_part, k, descriptor_limit)

            train_x, train_y = train.drop(columns=['Label']), train['Label']
            val_x, val_y = val.drop(columns=['Label']), val['Label']
            
            scaler = MinMaxScaler()
            train_x_scaled = scaler.fit_transform(train_x)
            val_x_scaled = scaler.transform(val_x)
            
            #enablePrint()
            print(f'**** k={k}, down_sample={down_sample}, decriptor_limit={descriptor_limit}****')
            
            model = LogisticRegression(max_iter=500)
            model.fit(train_x_scaled, train_y)
            preds = model.predict(val_x_scaled)
            print(type(model))
            error, acc = compute_CV_rmse_and_acc(model, train_x, train_y)
            print(f"Cross validation mean error: {error}")
            print(f"Cross validation mean accuracy: {acc}")
            print(f"Test accuracy: {accuracy_score(val_y, preds)}\n")
            
            
            model = KNeighborsClassifier(10, weights='distance')
            model.fit(train_x_scaled, train_y)
            preds = model.predict(val_x_scaled)
            print(type(model))
            error, acc = compute_CV_rmse_and_acc(model, train_x, train_y)
            print(f"Cross validation mean error: {error}")
            print(f"Cross validation mean accuracy: {acc}")
            print(f"Test accuracy: {accuracy_score(val_y, preds)}\n")
            
            model = DecisionTreeClassifier()
            model.fit(train_x_scaled, train_y)
            preds = model.predict(val_x_scaled)
            print(type(model))
            error, acc = compute_CV_rmse_and_acc(model, train_x, train_y)
            print(f"Cross validation mean error: {error}")
            print(f"Cross validation mean accuracy: {acc}")
            print(f"Test accuracy: {accuracy_score(val_y, preds)}\n")
            
            model = RandomForestClassifier(n_estimators=800)
            model.fit(train_x_scaled, train_y)
            preds = model.predict(val_x_scaled)
            print(type(model))
            error, acc = compute_CV_rmse_and_acc(model, train_x, train_y)
            print(f"Cross validation mean error: {error}")
            print(f"Cross validation mean accuracy: {acc}")
            print(f"Test accuracy: {accuracy_score(val_y, preds)}\n")
            
            model = SVC(kernel='rbf',C=10, gamma=0.01)
            model.fit(train_x_scaled, train_y)
            preds = model.predict(val_x_scaled)
            print(type(model))
            error, acc = compute_CV_rmse_and_acc(model, train_x, train_y)
            print(f"Cross validation mean error: {error}")
            print(f"Cross validation mean accuracy: {acc}")
            print(f"Test accuracy: {accuracy_score(val_y, preds)}\n")

            

--------------------------------------------------
**** k=10, down_sample=True, decriptor_limit=6000****
<class 'sklearn.linear_model.logistic.LogisticRegression'>




Cross validation mean error: 7.564900453192868
Cross validation mean accuracy: 0.19339662046705905
Test accuracy: 0.3515625

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 7.408910433907434
Cross validation mean accuracy: 0.20821720144294664
Test accuracy: 0.28125

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 7.086136365302795
Cross validation mean accuracy: 0.24131763812416937
Test accuracy: 0.3203125

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.491639674291127
Cross validation mean accuracy: 0.3868426048984241
Test accuracy: 0.5078125

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 7.724318422916485
Cross validation mean accuracy: 0.05052591608126068
Test accuracy: 0.359375

--------------------------------------------------
**** k=10, down_sample=True, decriptor_limit=10000****
<class 'sklearn.linear_model.logistic.LogisticRegression'>




Cross validation mean error: 7.865535726399997
Cross validation mean accuracy: 0.19428137459654454
Test accuracy: 0.375

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 7.504420722867789
Cross validation mean accuracy: 0.2020201253085248
Test accuracy: 0.3359375

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.979720912750972
Cross validation mean accuracy: 0.26389975318017844
Test accuracy: 0.2421875

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.551981585916178
Cross validation mean accuracy: 0.40157205240174676
Test accuracy: 0.359375

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 9.892869916011657
Cross validation mean accuracy: 0.05574330738560851
Test accuracy: 0.3046875

--------------------------------------------------
**** k=10, down_sample=True, decriptor_limit=30000****
<class 'sklearn.linear_model.logistic.LogisticRegression'>




Cross validation mean error: 7.613266701998924
Cross validation mean accuracy: 0.1899259540535409
Test accuracy: 0.34375

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 7.628791921141217
Cross validation mean accuracy: 0.19513575090184165
Test accuracy: 0.1796875

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 7.054126826741064
Cross validation mean accuracy: 0.2656806531232201
Test accuracy: 0.265625

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.445753448907732
Cross validation mean accuracy: 0.3937497626732485
Test accuracy: 0.3828125

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 9.335420279283289
Cross validation mean accuracy: 0.05053351053730777
Test accuracy: 0.296875

--------------------------------------------------
**** k=10, down_sample=False, decriptor_limit=6000****
<class 'sklearn.linear_model.logistic.LogisticRegression'>




Cross validation mean error: 7.280924006374994
Cross validation mean accuracy: 0.2437037037037037
Test accuracy: 0.32450331125827814

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.644857283502295
Cross validation mean accuracy: 0.27555555555555555
Test accuracy: 0.24503311258278146

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.560789672808758
Cross validation mean accuracy: 0.30148148148148146
Test accuracy: 0.26490066225165565

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 5.837587140889077
Cross validation mean accuracy: 0.417037037037037
Test accuracy: 0.3973509933774834

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.549208995406965
Cross validation mean accuracy: 0.0962962962962963
Test accuracy: 0.3576158940397351

--------------------------------------------------
**** k=10, down_sample=False, decriptor_limit=10000****
<class 'sklearn.li



Cross validation mean error: 7.2008511032262374
Cross validation mean accuracy: 0.22962962962962963
Test accuracy: 0.3973509933774834

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.773797535748554
Cross validation mean accuracy: 0.24
Test accuracy: 0.31788079470198677

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.413069889944714
Cross validation mean accuracy: 0.30074074074074075
Test accuracy: 0.33112582781456956

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.159404615675863
Cross validation mean accuracy: 0.397037037037037
Test accuracy: 0.5099337748344371

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.632687332546181
Cross validation mean accuracy: 0.08148148148148149
Test accuracy: 0.4503311258278146

--------------------------------------------------
**** k=10, down_sample=False, decriptor_limit=30000****
<class 'sklearn.linear_model.lo



Cross validation mean error: 7.230799234657584
Cross validation mean accuracy: 0.23555555555555557
Test accuracy: 0.41721854304635764

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.884281930944598
Cross validation mean accuracy: 0.24074074074074076
Test accuracy: 0.2913907284768212

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.434260466155029
Cross validation mean accuracy: 0.30444444444444446
Test accuracy: 0.2781456953642384

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.059073881220181
Cross validation mean accuracy: 0.4022222222222222
Test accuracy: 0.44370860927152317

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.705335369574389
Cross validation mean accuracy: 0.08518518518518518
Test accuracy: 0.3576158940397351

--------------------------------------------------
**** k=100, down_sample=True, decriptor_limit=6000****
<class 'sklearn.l



Cross validation mean error: 7.109885049328733
Cross validation mean accuracy: 0.24740740740740744
Test accuracy: 0.39072847682119205

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.700158780273317
Cross validation mean accuracy: 0.25037037037037035
Test accuracy: 0.31125827814569534

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.661424110104913
Cross validation mean accuracy: 0.2681481481481482
Test accuracy: 0.31788079470198677

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 5.824437063079424
Cross validation mean accuracy: 0.42666666666666664
Test accuracy: 0.4105960264900662

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.7384535234863865
Cross validation mean accuracy: 0.08222222222222222
Test accuracy: 0.3509933774834437

--------------------------------------------------
**** k=100, down_sample=False, decriptor_limit=10000****
<class 'sklea



Cross validation mean error: 7.31311460349192
Cross validation mean accuracy: 0.23407407407407405
Test accuracy: 0.423841059602649

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.748210341254004
Cross validation mean accuracy: 0.24888888888888888
Test accuracy: 0.31125827814569534

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.587746070021751
Cross validation mean accuracy: 0.2896296296296296
Test accuracy: 0.2847682119205298

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 5.881787718094186
Cross validation mean accuracy: 0.42592592592592593
Test accuracy: 0.3973509933774834

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.577845691100394
Cross validation mean accuracy: 0.09555555555555556
Test accuracy: 0.4304635761589404

--------------------------------------------------
**** k=100, down_sample=False, decriptor_limit=30000****
<class 'sklearn.li



Cross validation mean error: 7.624042288794277
Cross validation mean accuracy: 0.1985988228593127
Test accuracy: 0.3125

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 7.468905633475191
Cross validation mean accuracy: 0.1881488513385229
Test accuracy: 0.2109375

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 7.020792899246162
Cross validation mean accuracy: 0.25172204290867667
Test accuracy: 0.2421875

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.455882089081625
Cross validation mean accuracy: 0.41283842794759823
Test accuracy: 0.421875

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 8.663346095791729
Cross validation mean accuracy: 0.04614011771406872
Test accuracy: 0.3515625

--------------------------------------------------
**** k=200, down_sample=True, decriptor_limit=30000****
<class 'sklearn.linear_model.logistic.LogisticRegression'>




Cross validation mean error: 7.789096276310393
Cross validation mean accuracy: 0.19682931460034175
Test accuracy: 0.390625

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 7.690587139624661
Cross validation mean accuracy: 0.19249288019745586
Test accuracy: 0.25

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 7.083369797602562
Cross validation mean accuracy: 0.24564268084298463
Test accuracy: 0.2265625

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 6.288491934847082
Cross validation mean accuracy: 0.41372697930510727
Test accuracy: 0.3984375

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 8.49087102998101
Cross validation mean accuracy: 0.05662426428707044
Test accuracy: 0.40625

--------------------------------------------------
**** k=200, down_sample=False, decriptor_limit=6000****
<class 'sklearn.linear_model.logistic.LogisticRegression'>
Cross valida



Cross validation mean error: 7.305144037913399
Cross validation mean accuracy: 0.2185185185185185
Test accuracy: 0.3708609271523179

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.8496967977309975
Cross validation mean accuracy: 0.2511111111111111
Test accuracy: 0.18543046357615894

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.742369707142902
Cross validation mean accuracy: 0.2785185185185185
Test accuracy: 0.33774834437086093

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 5.907209544938572
Cross validation mean accuracy: 0.4133333333333334
Test accuracy: 0.44370860927152317

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.6676350093404455
Cross validation mean accuracy: 0.07925925925925927
Test accuracy: 0.3973509933774834

--------------------------------------------------
**** k=500, down_sample=False, decriptor_limit=10000****
<class 'sklearn



Cross validation mean error: 7.275942237784493
Cross validation mean accuracy: 0.22814814814814816
Test accuracy: 0.33774834437086093

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Cross validation mean error: 6.890992735653595
Cross validation mean accuracy: 0.24666666666666667
Test accuracy: 0.1390728476821192

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Cross validation mean error: 6.691308831475015
Cross validation mean accuracy: 0.29111111111111104
Test accuracy: 0.271523178807947

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Cross validation mean error: 5.675300685449613
Cross validation mean accuracy: 0.4325925925925926
Test accuracy: 0.3443708609271523

<class 'sklearn.svm.classes.SVC'>
Cross validation mean error: 5.55529357157152
Cross validation mean accuracy: 0.09481481481481482
Test accuracy: 0.3576158940397351



### Logistic Regression
##### Performing  5-fold cross validation for deciding hyper parameters

In [290]:
model = LogisticRegression(multi_class='multinomial', solver= 'lbfgs', penalty='l2', max_iter=1000)

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.661046578704598
Accuracy: 0.3475



### K-nearest Neighbors
##### Predicting training set with 5-fold cross validation

In [292]:
model = KNeighborsClassifier(n_neighbors=10, weights='distance')

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.859133726178034
Accuracy: 0.2891666666666667



### Classification Tree

In [184]:
model = DecisionTreeClassifier()

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 2.774773958756314
Accuracy: 0.7183333333333334



### Random Forest
##### Predicting training set with 5-fold cross validation

In [None]:
error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")



Mean RMSE: 6.934267455506964
Accuracy: 0.07416666666666667


Most importance features:
Feature name: stride_feat_54, Importance=0.004889143313085988
Feature name: stride_feat_73, Importance=0.004629313685160276
Feature name: stride_feat_74, Importance=0.004492257667117635
Feature name: stride_feat_49, Importance=0.004385448282853304
Feature name: stride_feat_28, Importance=0.004384583985446212
Feature name: stride_feat_97, Importance=0.00436587231331786
Feature name: AspectRatio, Importance=0.0043648847306119185
Feature name: stride_feat_72, Importance=0.004358104816712034
Feature name: stride_feat_91, Importance=0.004302289851897213
Feature name: stride_feat_108, Importance=0.004215192398012875
Feature name: stride_feat_56, Importance=0.004200022683300562
Feature name: stride_feat_111, Importance=0.004192143923326466
Feature name: stride_feat_124, Importance=0.0041783881997675566
Feature name: stride_feat_140, Importance=0.0041588178861251416
Feature name: stride_feat_50, Importance=0

In [None]:
s = model.feature_importances_
index_importance_sorted = sorted(range(len(s)), key=lambda k: s[k], reverse=True)
top_index = index_importance_sorted[:50]

print("\nMost importance features:")
for index in top_index:
    print(f"Feature name: {train_x.columns[index]}, Importance={s[index]}")

#### Performing K-fold grid search to find optimal parameters

In [None]:
print(model.get_params())

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_x, train_y)

In [None]:
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    preds = model.predict(test_features)
    accuracy = accuracy_score(preds, test_labels)
    print('Accuracy = {:0.2f}%.'.format(100*accuracy))
    return accuracy

base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(train_x, train_y)
base_accuracy = evaluate(base_model, val_x, val_y)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, val_x, val_y)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))


In [303]:
model = SVC(kernel='rbf',C=10, gamma=0.01, decision_function_shape='ovo')

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.522211213377022
Accuracy: 0.08416666666666667



### Sources

* https://en.wikipedia.org/wiki/Logistic_regression
* https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest
* https://scikit-learn.org/stable/modules/tree.html#tree
* https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* https://en.wikipedia.org/wiki/Random_forest
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74