In [20]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Evaluate the models using cross-validation
results = {}
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    results[name] = cv_scores.mean()

# Print the cross-validation scores
print('Cross-Validation Scores:')
for name, score in results.items():
    print(f'{name}: {score:.3f}')

# Select the best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f'\nBest Model: {best_model_name}')

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
test_score = best_model.score(X_test, y_test)
print(f'Test Set Accuracy: {test_score:.3f}')

Cross-Validation Scores:
Decision Tree: 0.942
Random Forest: 0.942
Logistic Regression: 0.967
KNN: 0.942
SVM: 0.950
Naive Bayes: 0.942
Gradient Boosting: 0.942

Best Model: Logistic Regression
Test Set Accuracy: 1.000


In [21]:
# import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# define the models to be evaluated
models = [LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier(), SVC(), DecisionTreeClassifier(), GradientBoostingClassifier(), GaussianNB()]
names = ['Logistic Regression', 'Random Forest', 'KNN', 'SVM', 'Decision Tree', 'Gradient Boosting', 'Naive Bayes']

# perform k-fold cross-validation for each model
k =10
for name, model in zip(names, models):
    cv_scores = cross_val_score(model, X_train, y_train, cv=k, scoring='accuracy')
    print(f'{name} CV accuracy: {np.mean(cv_scores):.3f} +/- {np.std(cv_scores):.3f}')

Logistic Regression CV accuracy: 0.950 +/- 0.085
Random Forest CV accuracy: 0.933 +/- 0.104
KNN CV accuracy: 0.942 +/- 0.065
SVM CV accuracy: 0.950 +/- 0.067
Decision Tree CV accuracy: 0.925 +/- 0.102
Gradient Boosting CV accuracy: 0.925 +/- 0.102
Naive Bayes CV accuracy: 0.942 +/- 0.075


In [30]:
SVC = SVC()

# get the parameters of SVC
print(SVC.get_params())

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [22]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'C' : [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
Best cross-validation score: 0.958


In [31]:
Random_Forest = RandomForestClassifier()

# get the parameters of Random Forest
print(Random_Forest.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [23]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [5, 10, 15, 20],
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'max_depth': 5, 'n_estimators': 300}
Best cross-validation score: 0.958


In [32]:
Logistic_Reg = LogisticRegression()

# get the parameters of Logistic Regression
print(Logistic_Reg.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [24]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'C': 1, 'solver': 'sag'}
Best cross-validation score: 0.975


In [33]:
KNN = KNeighborsClassifier()

# get the parameters of KNN
print(KNN.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [25]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best cross-validation score: 0.958


In [34]:
Decision_Tree = DecisionTreeClassifier()

# get the parameters of Decision Tree
print(Decision_Tree.get_params())

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}


In [27]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 4, 6, 8],
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'max_depth': 7, 'min_samples_split': 2}
Best cross-validation score: 0.950


In [35]:
Gradient_Boost = GradientBoostingClassifier()

# get the parameters of Gradient Boosting
print(Gradient_Boost.get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [28]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 9],
}

grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'max_depth': 7, 'n_estimators': 100}
Best cross-validation score: 0.950


In [36]:
Gaussian_NB = GaussianNB()

# get the parameters of Gaussian Naive Bayes
print(Gaussian_NB.get_params())

{'priors': None, 'var_smoothing': 1e-09}


In [29]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters and their values
param_grid = {
    'var_smoothing': [1e-9, 1e-10, 1e-11, 1e-12],
}

grid_search = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.3f}')

Best parameters: {'var_smoothing': 1e-09}
Best cross-validation score: 0.942


In [37]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and their parameter grids
models = {
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [2, 4, 6, 8, 10]}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [10, 50, 100, 200]}),
    'Logistic Regression': (LogisticRegression(), {'C': [0.1, 1, 10, 100]}),
    'KNN': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9, 11]}),
    'SVM': (SVC(), {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}),
    'Naive Bayes': (GaussianNB(), {'var_smoothing': [1e-9, 1e-10, 1e-11, 1e-12]}),
    'GradientBoostingClassifier': (GradientBoostingClassifier(), {'n_estimators': [100, 200, 300, 400], 'max_depth': [3, 5, 7, 9]})
}

# Perform GridSearchCV for each model
best_models = {}
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_

# Evaluate the best models on the test set
results = {}
for name, model in best_models.items():
    test_score = model.score(X_test, y_test)
    results[name] = test_score

# Print the best models and their test set scores
print('Best Models and Test Set Scores:')
for name, score in results.items():
    print(f'{name}: {score:.3f}')

Best Models and Test Set Scores:
Decision Tree: 1.000
Random Forest: 1.000
Logistic Regression: 1.000
KNN: 1.000
SVM: 1.000
Naive Bayes: 1.000
GradientBoostingClassifier: 1.000


### Classification Models
#### 1. Logistic Regression
#### 2. K-Nearest Neighbors
#### 3. Support Vector Machines
#### 4. Decision Trees
#### 5. Random Forest
#### 6. Gradient Boosting
#### 7. XGBoost
#### 8. LightGBM
#### 9. CatBoost
#### 10. Neural Networks
#### 11. Naive Bayes
#### 12. Linear Discriminant Analysis
#### 13. Quadratic Discriminant Analysis
#### 14. AdaBoost
#### 15. Bagging
#### 16. Extra Trees
#### 17. Stochastic Gradient Descent
#### 18. Perceptron
#### 19. Passive Aggressive Classifier
#### 20. Ridge Classifier
#### 21. Nearest Centroid
#### 22. Gaussian Process
#### 23. MLP Classifier
#### 24. RBF SVM
#### 25. Linear SVM
#### 26. Nu-SVM
#### 27. One-Class SVM
#### 28. Isolation Forest
#### 29. Local Outlier Factor
#### 30. Elliptic Envelope
#### 31. One-Class SVM
#### 32. Local Outlier Factor
#### 33. Elliptic Envelope
#### 34. DBSCAN
#### 35. Birch
#### 36. Spectral Clustering
#### 37. Affinity Propagation
#### 38. Mean Shift
#### 39. Agglomerative Clustering
#### 40. K-Means
#### 41. Mini-Batch K-Means
#### 42. Gaussian Mixture Model
#### 43. Principal Component Analysis
#### 44. Kernel PCA
#### 45. Locally Linear Embedding
#### 46. MDS
#### 47. Isomap
#### 48. Spectral Embedding
#### 49. TSNE
#### 50. UMAP
#### 51. Autoencoder
#### 52. Variational Autoencoder
#### 53. Generative Adversarial Network
#### 54. Deep Belief Network
#### 55. Restricted Boltzmann Machine

### Regression Models
#### 1. Linear Regression
#### 2. Lasso
#### 3. Ridge
#### 4. Elastic Net
#### 5. Polynomial Regression
#### 6. Stochastic Gradient Descent
#### 7. K-Nearest Neighbors
#### 8. Decision Trees
#### 9. Random Forest
#### 10. Gradient Boosting
#### 11. XGBoost
#### 12. LightGBM
#### 13. CatBoost
#### 14. Neural Networks
#### 15. Support Vector Machines
#### 16. Gaussian Process
#### 17. AdaBoost
#### 18. Bagging
#### 19. Extra Trees
#### 20. Ridge Regression
#### 21. Bayesian Ridge
#### 22. ARD Regression
#### 23. Huber Regression
#### 24. RANSAC Regression
#### 25. TheilSen Regression
#### 26. Poisson Regression
#### 27. Tweedie Regression
#### 28. Gamma Regression
#### 29. Inverse Gaussian Regression
#### 30. Orthogonal Matching Pursuit
#### 31. Bayesian Ridge
#### 32. Automatic Relevance Determination
#### 33. Least Angle Regression
#### 34. LARS Lasso

In [47]:
# Linear Regression
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
df['age'] = df['age'].fillna(df['age'].mean())
X = df[['age']]
y = df['fare']

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# create the model
model = LinearRegression()

# fit the model
model.fit(X, y)

# define the hyperparameters and their values
param_grid = {'fit_intercept': [True, False]}

# object grid search cv (Creating the model)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')

# fit and train the model
grid_search.fit(X, y)

# print the best parameters and the best score
print(f'Best parameters: {grid_search.best_params_}')

Best parameters: {'fit_intercept': True}


In [61]:
# KNN
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare',]]
y = df['survived']
X = pd.get_dummies(X, columns=['sex'])
X.age.fillna(value=X['age'].mean(), inplace=True)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# create the model
model = KNeighborsClassifier()

# fit the model
model.fit(X, y)

# define the hyperparameters and their values
param_grid = {'n_neighbors': np.arange(1,20,2), 'weights': ['uniform', 'distance'], 'p': [1, 2, 3]}

# object grid search cv (Creating the model)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)

# fit and train the model
grid_search.fit(X, y)

# print the best parameters and the best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

Best parameters: {'n_neighbors': 13, 'p': 1, 'weights': 'distance'}
Best Score: 0.6783777334912072


In [55]:
# scoring of the KNN model
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# For classification
model = LogisticRegression()
scores = cross_val_score(model, X, y, scoring='accuracy')  # Uses accuracy score
scores = cross_val_score(model, X, y, scoring='f1')  # Uses F1-score
scores = cross_val_score(model, X, y, scoring='roc_auc')  # Uses AUC-ROC

from sklearn.linear_model import LinearRegression

# For regression
model = LinearRegression()
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error')  # Uses negative MSE
scores = cross_val_score(model, X, y, scoring='r2')  # Uses R-squared

In [63]:
# DTC
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare',]]
y = df['survived']
X = pd.get_dummies(X, columns=['sex'])
X.age.fillna(value=X['age'].mean(), inplace=True)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# create the model
model = DecisionTreeClassifier()

# fit the model
model.fit(X, y)

# define the hyperparameters and their values
param_grid = {'max_depth': np.arange(1, 10), 'min_samples_split': np.arange(2, 10)}

# object grid search cv (Creating the model)
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='f1')

# fit and train the model
grid_search.fit(X, y)

# print the best parameters and the best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

Best parameters: {'max_depth': 6, 'min_samples_split': 8}
Best Score: 0.7447162294133822


In [64]:
# RF
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare',]]
y = df['survived']
X = pd.get_dummies(X, columns=['sex'])
X.age.fillna(value=X['age'].mean(), inplace=True)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# create the model
model = RandomForestClassifier()

# fit the model
model.fit(X, y)

# define the hyperparameters and their values
param_grid = {'n_estimators': [100, 200, 300, 400], 'max_depth': [3, 5, 7, 9]}

# object grid search cv (Creating the model)
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='f1')

# fit and train the model
grid_search.fit(X, y)

# print the best parameters and the best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

Best parameters: {'max_depth': 9, 'n_estimators': 300}
Best Score: 0.7667802426824764


In [67]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

# get the parameters of Gradient Boosting
print(gbc.get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [70]:
# GBC
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare',]]
y = df['survived']
X = pd.get_dummies(X, columns=['sex'])
X.age.fillna(value=X['age'].mean(), inplace=True)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# create the model
model = GradientBoostingClassifier()

# fit the model
model.fit(X, y)

# define the hyperparameters and their values
param_grid = {
    'n_estimators': [200],
    'learning_rate': [0.1],
    'max_depth': [3],
    'min_samples_split': [5],
    'subsample': [0.8],
    'warm_start': [False]
}

# object grid search cv (Creating the model)
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='f1')

# fit and train the model
grid_search.fit(X, y)

# print the best parameters and the best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 0.8, 'warm_start': False}
Best Score: 0.7763292161970624
