In [1]:
import seaborn as sns

# Load the "tips" example dataset through seaborn
tips_df = sns.load_dataset(name='tips')

# Preview the first five rows as plain text
print(tips_df.head(n=5))


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [2]:


# Feature columns grouped by type.
# NOTE: 'time' is deliberately NOT listed as a feature: it is the target that the
# classifier cell predicts (y = tips_df['time']). One-hot encoding it alongside
# the other categoricals would leak the label into the feature matrix.
categorical_cols = ['sex', 'smoker', 'day']
numerical_cols = ['size', 'total_bill', 'tip']

# Show the first five rows of the dataset again (plain-text rendering)
print(tips_df.head(n=5))


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Relies on `tips_df` and `numerical_cols` defined in the earlier cells.

# Pipeline for the numerical columns: mean-impute missing values, then standardize
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Replace missing values with mean
    ('scaler', StandardScaler())  # Standard scaling
])

# Write the scaled values into a copy so the raw `tips_df` stays untouched.
# Overwriting `tips_df` in place would discard the original values and make the
# later ColumnTransformer cell impute/scale data that has already been scaled.
tips_df_scaled = tips_df.copy()
tips_df_scaled[numerical_cols] = numerical_pipeline.fit_transform(tips_df[numerical_cols])

# Display the first few rows of the scaled copy
print(tips_df_scaled.head())


   total_bill       tip     sex smoker  day    time      size
0   -0.314711 -1.439947  Female     No  Sun  Dinner -0.600193
1   -1.063235 -0.969205    Male     No  Sun  Dinner  0.453383
2    0.137780  0.363356    Male     No  Sun  Dinner  0.453383
3    0.438315  0.225754    Male     No  Sun  Dinner -0.600193
4    0.540745  0.443020  Female     No  Sun  Dinner  1.506958


In [7]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Relies on `tips_df`, `categorical_cols`, and `numerical_cols` from the earlier cells.

# Numerical pipeline: mean-impute missing values, then standardize
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Replace missing values with the mean
    ('scaler', StandardScaler())  # Standard scaling
])

# Categorical pipeline: fill gaps with the mode, then one-hot encode
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace missing values with the most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

# Route each column group through its own pipeline.
# sparse_threshold=0 makes the transformer always return a dense ndarray; with the
# default threshold the result can become a scipy sparse matrix when the one-hot
# block dominates, and the DataFrame construction below would then fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ],
    sparse_threshold=0,
)

# Fit on, and transform, the whole DataFrame
transformed_values = preprocessor.fit_transform(tips_df)

# Rebuild column labels in transformer order: numerical columns first, then the
# one-hot names generated by the fitted encoder
fitted_encoder = preprocessor.named_transformers_['cat']['encoder']
columns = numerical_cols + fitted_encoder.get_feature_names_out(categorical_cols).tolist()

# Keep the source index so rows stay aligned with `tips_df`
tips_df_transformed = pd.DataFrame(transformed_values, columns=columns, index=tips_df.index)

# Display the first few rows of the preprocessed DataFrame
print(tips_df_transformed.head())


       size  total_bill       tip  sex_Female  sex_Male  smoker_No  \
0 -0.600193   -0.314711 -1.439947         1.0       0.0        1.0   
1  0.453383   -1.063235 -0.969205         0.0       1.0        1.0   
2  0.453383    0.137780  0.363356         0.0       1.0        1.0   
3 -0.600193    0.438315  0.225754         0.0       1.0        1.0   
4  1.506958    0.540745  0.443020         1.0       0.0        1.0   

   smoker_Yes  day_Fri  day_Sat  day_Sun  day_Thur  
0         0.0      0.0      0.0      1.0       0.0  
1         0.0      0.0      0.0      1.0       0.0  
2         0.0      0.0      0.0      1.0       0.0  
3         0.0      0.0      0.0      1.0       0.0  
4         0.0      0.0      0.0      1.0       0.0  


In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Target to predict. Any one-hot columns derived from it (named '<target>_<value>')
# must not be part of the features, otherwise the label leaks into X.
target_col = 'time'
target_dummies = [col for col in tips_df_transformed.columns if str(col).startswith(f'{target_col}_')]

X = tips_df_transformed.drop(columns=target_dummies)  # no-op when no such columns exist
y = tips_df[target_col]

# Split the dataset into a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder



# Relies on `tips_df_transformed` (preprocessed features) and `tips_df` from the earlier cells.
# NOTE(review): `tips_df_transformed` was produced by fitting the imputers/scaler on
# the full dataset before the split below, so test rows influence the preprocessing
# statistics. Fit the preprocessor inside the pipeline if strict separation is needed.

# Target to predict. Any one-hot columns derived from it (named '<target>_<value>')
# must not be part of the features, otherwise the label leaks into X.
target_col = 'time'
target_dummies = [col for col in tips_df_transformed.columns if str(col).startswith(f'{target_col}_')]

X = tips_df_transformed.drop(columns=target_dummies)  # no-op when no such columns exist
y = tips_df[target_col]  # This is the target variable you want to predict

# Split the dataset into a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a new pipeline for the classifier.
# A fixed random_state makes the forest (and therefore the selected parameters and
# the reported metrics) reproducible across runs.
classifier_pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define the parameter grid for GridSearchCV (keys use the '<step>__<param>' form)
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [5, 10, 15, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Create GridSearchCV object: 5-fold cross-validation, ranked by accuracy
grid_search = GridSearchCV(estimator=classifier_pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the training data only
grid_search.fit(X_train, y_train)

# Print the best parameters found by the grid search
print("Best Parameters:", grid_search.best_params_)

# Get the best model from the grid search
best_rf_model = grid_search.best_estimator_

# Evaluate the performance of the best model on the held-out test set
best_predictions = best_rf_model.predict(X_test)

# Compute evaluation metrics; 'weighted' averages per-class scores by class support
accuracy = accuracy_score(y_test, best_predictions)
precision = precision_score(y_test, best_predictions, average='weighted')
recall = recall_score(y_test, best_predictions, average='weighted')
f1 = f1_score(y_test, best_predictions, average='weighted')

print("\nPerformance of the Best Model:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 150}

Performance of the Best Model:
Accuracy: 0.9730
Precision: 0.9730
Recall: 0.9730
F1 Score: 0.9730
