# Selecting the Best Model with Best Hyperparameters

In [None]:
# Import all the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# import the preprocessing modules 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# import all the models we will use in this notebook
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# import the cross validation module 
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# import the metrics to evaluate the models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [None]:
# Load the "tips" example dataset through seaborn and keep it as `df`
df = sns.load_dataset("tips")
# Preview the first rows; as the cell's last expression it is shown as output
df.head()

In [None]:
# Show the column names so the features and the target can be picked from them
df.columns

# Regression Task

In [None]:
# Separate the value to predict from the predictors.
# Target: the tip column
y = df['tip']

# Features: every remaining column
X = df.drop(columns='tip')

# Turn each categorical / text feature into integer codes with a LabelEncoder;
# columns of any other dtype are left untouched.
for col in X.columns:
    le = LabelEncoder()
    col_dtype = X[col].dtype
    if not (col_dtype == 'category' or col_dtype == 'object'):
        continue
    X[col] = le.fit_transform(X[col])

### Mean Absolute Error

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, each constructed with its default settings
models = [
    ("SVM", SVR()),
    ("XGBoost", XGBRegressor()),
    ("Decision Tree", DecisionTreeRegressor()),
    ("Liner Regression", LinearRegression()),
    ("KNeighbors", KNeighborsRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor()),
]

# Train every candidate on the training split and record its
# mean absolute error on the test split as (name, error) pairs
model_score = []
for name, model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mar = mean_absolute_error(y_test, y_pred)
    model_score.append((name, mar))

# Report the candidates ordered from the smallest error to the largest
short_models = sorted(model_score, key=lambda pair: pair[1])
for model in short_models:
    print(f"Mean Absolute Error for {model[0]} is {model[1]:.2f}")

### Root Mean Squared Error

higher (high and low)
lower (high and low)

 


In [None]:
# Same 80/20 split as before (seed 42), recreated so this cell runs on its own
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Regressors to compare, keyed by display name, all with default settings
models = {
    "SVM": SVR(),
    "XGBoost": XGBRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "LinearRegression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
}

# Collect one (name, RMSE) pair per model
model_score = []
for name, model in models.items():
    # train on the training split, then predict the held-out rows
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # score the predictions with the Root Mean Squared Error (RMSE)
    rmse = root_mean_squared_error(y_test, y_pred)
    model_score.append((name, rmse))

# Print the models from the lowest RMSE to the highest
short_models = sorted(model_score, key=lambda entry: entry[1])
for model in short_models:
    print(f"Root Mean Squared Error for {model[0]} is {model[1]:.2f}")

In [None]:
# Candidate regressors, each paired with the hyperparameter grid to search.
# An empty grid means the model is evaluated with its default settings only.
# SVR's `verbose` and `cache_size` are deliberately NOT searched: they control
# logging and kernel-cache memory rather than the fitted model, so including
# them only multiplies the number of fits (324 instead of 54 combinations).
models = {
    "SVM": (
        SVR(),
        {
            'kernel': ['linear', 'poly', 'rbf'],
            'C': [1.0, 0.1, 0.01],
            'epsilon': [0.1, 0.01, 0.001],
            'gamma': ['auto'],
            'shrinking': [True, False],
        },
    ),
    "Random Forest": (
        RandomForestRegressor(),
        {
            'n_estimators': [10, 100, 1000],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10],
        },
    ),
    "KNN": (
        KNeighborsRegressor(),
        {
            'n_neighbors': [5, 10, 15],
            'algorithm': ['kd_tree', 'ball_tree'],
            'weights': ['uniform', 'distance'],
        },
    ),
    "Decision Tree": (
        DecisionTreeRegressor(),
        {
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10],
            'max_features': ['sqrt', 'log2', None],
        },
    ),
    "Linear Regression": (
        LinearRegression(), {}
    ),
    "XGBoost": (
        XGBRegressor(),
        {
            'n_estimators': [10, 100],
        },
    ),
}

# (name, test-set MSE) for every tuned model
model_score = []
# best hyperparameter combination found for every model, keyed by model name
best_params = {}

# loop through each model and tune it with a 5-fold cross-validated grid search
for name, (model, param) in models.items():
    # Select hyperparameters with the same metric that is reported below (MSE).
    # Without `scoring`, GridSearchCV would rank candidates by the estimator's
    # default score (R^2 for regressors).
    search = GridSearchCV(model, param, cv=5, scoring='neg_mean_squared_error')

    # run the search on the training data; the best candidate is refit on all of it
    search.fit(X_train, y_train)

    # make predictions on the test set with the refit best estimator
    y_pred = search.predict(X_test)

    # evaluate the tuned model
    mse = mean_squared_error(y_test, y_pred)
    model_score.append((name, mse))
    best_params[name] = search.best_params_

# report the tuned models from the lowest test MSE to the highest
short_models = sorted(model_score, key=lambda x: x[1])
for model in short_models:
    print(f"Mean Squared Error for {model[0]} is {model[1]:.2f}")
    print(f"    best hyperparameters: {best_params[model[0]]}")

In [None]:
def karperkar_routine(num):
    """Run Kaprekar's routine on a 4-digit number and print every step.

    At each step the number is written with four digits (zero-padded), its
    digits are arranged in descending and ascending order, and the smaller
    arrangement is subtracted from the larger one. This repeats until the
    value 6174 is reached.

    Args:
        num: Integer between 0 and 9999 whose zero-padded four digits are
            not all identical.

    Returns:
        The number of subtraction steps performed (0 if ``num`` is 6174).

    Raises:
        ValueError: If ``num`` is outside 0..9999 or all four digits are the
            same; such a number collapses to 0 and would loop forever.
    """
    if not 0 <= num <= 9999 or len(set(f"{num:04d}")) == 1:
        raise ValueError(
            "num must be a 4 digit number with at least 2 different digits"
        )

    step = 0
    print(f'This is your Number: {num}')

    while num != 6174:
        # keep leading zeros so e.g. 999 is treated as the digits 0, 9, 9, 9
        num_str = f"{num:04d}"
        smallest = "".join(sorted(num_str))
        largest = smallest[::-1]
        # the digit strings are kept for display; the arithmetic needs ints
        num = int(largest) - int(smallest)
        step += 1
        print(f'Step {step}: {largest} - {smallest} = {num}')
    print(f"Reached 6174 in {step} steps🎉")
    return step
# Get user input: it must be exactly four ASCII digits that are not all the same.
# Logical `and` is required here: bitwise `&` binds tighter than `==` and `>`,
# which silently turns the condition into a different comparison.
user_input = input("Please Enter a 4 digit number:").strip()
if (
    user_input.isascii()
    and user_input.isdigit()
    and len(user_input) == 4
    and len(set(user_input)) > 1  # distinct characters of the string itself
):
    karperkar_routine(int(user_input))
else:
    print("Invalid user input. Enter a 4 digit number with at least 2 different digits.")