# Selecting the best model with best hyperparameters.


# Regressors

In [2]:
# import labraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import train_test_split
from sklearn.model_selection import train_test_split
# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error , r2_score , mean_absolute_error
# import gridsearchcv
from sklearn.model_selection import GridSearchCV
# import preprocessors 
from sklearn.preprocessing import StandardScaler , MinMaxScaler , LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# load tips data
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


# Regression Tasks

In [5]:
# select features and target variable
X = df.drop('tip' , axis=1)
y = df['tip']

# label encode the categorical columns
le = LabelEncoder()
X['sex'] = le.fit_transform(X['sex'])
X['smoker'] = le.fit_transform(X['smoker'])
X['day'] = le.fit_transform(X['day'])
X['time'] = le.fit_transform(X['time'])


In [6]:
# split the data into train and test sets
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=102)

# Create a dictionary of models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'XGBoost': XGBRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# train and evaluate each model
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)

    # make predictions
    y_pred = model.predict(X_test)

    # calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, mse, r2, mae))

    # print evaluation metrics
    print(f"Model: {name}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print("--------------------")

# selecting the best model from all above models with evaluation metrics
sorted_model = sorted(model_scores, key=lambda x: x[1] , reverse=True)
for model in sorted_model:
    print('R2 score' , f'{model[0]} is {model[2]: .2f}') 

Model: Linear Regression
Mean Squared Error: 0.75
R-squared: 0.47
Mean Absolute Error: 0.66
--------------------
Model: Decision Tree
Mean Squared Error: 0.98
R-squared: 0.31
Mean Absolute Error: 0.76
--------------------
Model: Random Forest
Mean Squared Error: 0.74
R-squared: 0.48
Mean Absolute Error: 0.63
--------------------
Model: Support Vector Regression
Mean Squared Error: 0.68
R-squared: 0.52
Mean Absolute Error: 0.62
--------------------
Model: K-Nearest Neighbors
Mean Squared Error: 0.81
R-squared: 0.43
Mean Absolute Error: 0.68
--------------------
Model: XGBoost
Mean Squared Error: 0.71
R-squared: 0.50
Mean Absolute Error: 0.60
--------------------
Model: Gradient Boosting
Mean Squared Error: 0.61
R-squared: 0.57
Mean Absolute Error: 0.55
--------------------
R2 score Decision Tree is  0.31
R2 score K-Nearest Neighbors is  0.43
R2 score Linear Regression is  0.47
R2 score Random Forest is  0.48
R2 score XGBoost is  0.50
R2 score Support Vector Regression is  0.52
R2 score 

## Testing on Diamonds Dataset


In [7]:
# taking 1000 samples from diamonds dataset
df = sns.load_dataset('diamonds').sample(1000, random_state=42)
# select features and target variable
X = df.drop(['price'], axis=1)
y = df['price']

# label encode the categorical columns
le = LabelEncoder()
X['cut'] = le.fit_transform(X['cut'])
X['color'] = le.fit_transform(X['color'])
X['clarity'] = le.fit_transform(X['clarity'])

In [None]:
# split the data into train and test sets
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=102)

# Create a dictionary of models to evaluate
models = {
    'Linear Regression': (LinearRegression() , {}),
    'Decision Tree': (DecisionTreeRegressor() , {'max_depth': [None , 10,20]}),
    'Random Forest': (RandomForestRegressor() , {'n_estimators': [10,  100]}),
    'Support Vector Regression': (SVR() , {'kernel': ['linear', 'rbf'],}),
    'K-Nearest Neighbors': (KNeighborsRegressor() , {'n_neighbors': np.arange(3,100,2)}),
    'XGBoost': (XGBRegressor() , {'n_estimators': [10,  100]})
    ,'Gradient Boosting': (GradientBoostingRegressor() , {'n_estimators': [10,  100]})
}

best_model_name = None  
best_model_score = float('inf')
best_model_params = {}

# train and evaluate each model
for name,(model , params) in models.items():
    # create a pipeline
    Pipeline = GridSearchCV(model , params , cv=5)
    Pipeline.fit(X_train , y_train)
    y_pred = Pipeline.predict(X_test)

    # calculate evaluation metrics
    mse = mean_squared_error(y_test , y_pred)
    r2 = r2_score(y_test , y_pred)
    print(f"Model: {name}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")
    print("--------------------")
    
# Check for the best model
    if mse < best_model_score:
        best_model_score = mse
        best_model_name = name
        best_model_params = Pipeline.best_params_

# Print the best model and parameters
print(f"Best Model: {best_model_name}")
print(f"Best Mean Squared Error: {best_model_score:.2f}")
print(f"Best Parameters: {best_model_params}")

Model: Linear Regression
Mean Squared Error: 1740556.07
R-squared: 0.89
--------------------
Model: Decision Tree
Mean Squared Error: 414426.09
R-squared: 0.97
--------------------
Model: Random Forest
Mean Squared Error: 294318.89
R-squared: 0.98
--------------------


# Classifiers

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# dont show warnings
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Perform k-fold cross-validation and calculate the mean accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()

Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

Classifier: Decision Tree
Mean Accuracy: 0.9533333333333335

Classifier: Random Forest
Mean Accuracy: 0.9600000000000002

Classifier: SVM
Mean Accuracy: 0.9666666666666668

Classifier: KNN
Mean Accuracy: 0.9733333333333334

