In [22]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [23]:
irish = load_iris()

In [24]:
dir(irish)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [25]:
df = pd.DataFrame(irish.data, columns = irish.feature_names)

In [26]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [27]:
df['flowers'] = irish.target

In [28]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flowers
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [29]:
df['flowers'] = df['flowers'].apply(lambda x : irish.target_names[x])

In [30]:
df[95:105]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flowers
95,5.7,3.0,4.2,1.2,versicolor
96,5.7,2.9,4.2,1.3,versicolor
97,6.2,2.9,4.3,1.3,versicolor
98,5.1,2.5,3.0,1.1,versicolor
99,5.7,2.8,4.1,1.3,versicolor
100,6.3,3.3,6.0,2.5,virginica
101,5.8,2.7,5.1,1.9,virginica
102,7.1,3.0,5.9,2.1,virginica
103,6.3,2.9,5.6,1.8,virginica
104,6.5,3.0,5.8,2.2,virginica


In [31]:
from sklearn.model_selection import train_test_split

In [34]:
x_train, x_test, y_train, y_test = train_test_split(irish.data, irish.target, test_size = 0.3)

In [35]:
len(x_train)

105

# let's try to test with Support Vector Classifier

In [36]:
svm = SVC(C = 1, gamma = 'auto', kernel='rbf')

In [37]:
svm.fit(x_train, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [38]:
svm.score(x_test, y_test)

1.0

## Now let's try to test with Logistic Regression

In [39]:
log_reg = LogisticRegression()

In [40]:
log_reg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
log_reg.score(x_test, y_test)

1.0

# Now let's try to test with Decision Tree

In [42]:
dec_tree = DecisionTreeClassifier()

In [43]:
dec_tree.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [44]:
dec_tree.score(x_test, y_test)

0.9777777777777777

# Let's try with Multinomial Bayes

In [45]:
mal_byes = MultinomialNB(alpha = 1)

In [46]:
mal_byes.fit(x_train, y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [47]:
mal_byes.score(x_test, y_test)

0.6222222222222222

From above results it clerifies that Every model gives different score results, and while changing the parameter it changes the score. so how we are going to choose the right model and right parameter for our data?


In order to find the right solution we have one model which will help us to choose the right model and we will be able to predict the right score! it is called
# 'GridSreachCV' .

Let's take an example of one Model and use that model to find the right parameter of that model. SO we can predict the accurate result and score.

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
clf = GridSearchCV(SVC(gamma = 'auto'), {
    'C' : [1, 10, 20],
    'kernel' : ['linear', 'rbf']
}, cv = 5, return_train_score = False)
clf.fit(irish.data, irish.target)
clf.cv_results_

{'mean_fit_time': array([0.0006578 , 0.00065141, 0.00059729, 0.00061612, 0.00052772,
        0.00067506]),
 'mean_score_time': array([0.00038166, 0.00035372, 0.00037947, 0.00034304, 0.00033674,
        0.00034742]),
 'mean_test_score': array([0.98      , 0.98      , 0.97333333, 0.98      , 0.96666667,
        0.96666667]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'}],
 'rank_test_score': array([1, 1, 4, 1, 6, 5], dtype=int32),
 'split0_test_score': array([0.96666667, 0.96

In [50]:
df_1 = pd.DataFrame(clf.cv_results_)

In [51]:
df_1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000658,0.000171,0.000382,0.000148,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000651,0.00011,0.000354,6e-05,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000597,0.0001,0.000379,7.3e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
3,0.000616,5.1e-05,0.000343,3.4e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.000528,4.6e-05,0.000337,2.9e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.000675,0.00012,0.000347,1.6e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [52]:
df_1[['param_C', 'params', 'mean_test_score']]

Unnamed: 0,param_C,params,mean_test_score
0,1,"{'C': 1, 'kernel': 'linear'}",0.98
1,1,"{'C': 1, 'kernel': 'rbf'}",0.98
2,10,"{'C': 10, 'kernel': 'linear'}",0.973333
3,10,"{'C': 10, 'kernel': 'rbf'}",0.98
4,20,"{'C': 20, 'kernel': 'linear'}",0.966667
5,20,"{'C': 20, 'kernel': 'rbf'}",0.966667


In [53]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [54]:
clf.best_score_

0.9800000000000001

This is how we can find the right parameter amd get the best score result But there is also another way to find the right parameters and score! and that's is called 
## RandomizedSearchCV

In [55]:
from sklearn.model_selection import RandomizedSearchCV

In [56]:
clf_1 = RandomizedSearchCV(SVC(gamma='auto'), {
    'C' : [1, 10, 20], 
    'kernel' : ['rbf', 'linear']
}, cv = 5, return_train_score = False, n_iter = 3)
clf_1.fit(irish.data, irish.target)
df_2 = pd.DataFrame(clf_1.cv_results_)

In [57]:
df_2[['param_C', 'params', 'mean_test_score']]

Unnamed: 0,param_C,params,mean_test_score
0,1,"{'kernel': 'rbf', 'C': 1}",0.98
1,10,"{'kernel': 'rbf', 'C': 10}",0.98
2,20,"{'kernel': 'linear', 'C': 20}",0.966667


There is another example of finding the right parameter & best test Score and it is also efficient and time saving. let me show you the example

In [59]:
model_param = {
    'svm' : {
        'model': SVC(gamma = 'auto'),
        'params' : {
            'C' : [1, 10, 20],
            'kernel' : ['rbf', 'linear']
        }
    },

    'ramdom_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [1, 5, 10]
        }
    },

    'log_reg' : {
        'model' : LogisticRegression(solver = 'liblinear', multi_class = 'auto'),
        'params' : {
            'C' : [1, 5, 20]
        }
    },
    'naive_bayes_multi' : {
        'model' : MultinomialNB(),
        'params': {}
    },

    'naive_bayes_gaussi' : {
        'model' : GaussianNB(),
        'params': {}
    },

    'decision_tree' : {
        'model' : DecisionTreeClassifier(),
        'params' : {
            'criterion' : ['gini', 'entropy']
        }
    }

}

In [60]:
score = []

for model_name, mp in model_param.items():
  clf = GridSearchCV(mp['model'], mp['params'], cv = 5, return_train_score=False)
  clf.fit(irish.data, irish.target)
  score.append({
      'model': model_name,
      'best_score': clf.best_score_,
      'best_params': clf.best_params_
  })

In [61]:
df = pd.DataFrame(score, columns = ['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,ramdom_forest,0.96,{'n_estimators': 10}
2,log_reg,0.966667,{'C': 5}
3,naive_bayes_multi,0.953333,{}
4,naive_bayes_gaussi,0.953333,{}
5,decision_tree,0.96,{'criterion': 'gini'}


### So here is the final result of getting right parameters and score, and this the perfect technique to find accurate and best result for our data prediction And clearly shows that which model should we use to predict our Data model for Business Decision or to find a certain solution!

In [62]:
df.to_csv("best_model_for_prediction.csv")