# Recipe Site Traffic

https://s3.amazonaws.com/talent-assets.datacamp.com/Practical+-+DSP+-+Recipe+Site+Traffic+-+2212.pdf

| Column Name   | Details                                                                                                        |
|---------------|----------------------------------------------------------------------------------------------------------------|
| recipe        | Numeric, unique identifier of recipe                                                                           |
| calories      | Numeric, number of calories                                                                                    |
| carbohydrate  | Numeric, amount of carbohydrates in grams                                                                      |
| sugar         | Numeric, amount of sugar in grams                                                                              |
| protein       | Numeric, amount of protein in grams                                                                            |
| category      | Character, type of recipe. Recipes are listed in one of ten possible groupings ('Lunch/Snacks', 'Beverages', 'Potato', 'Vegetable', 'Meat', 'Chicken', 'Pork', 'Dessert', 'Breakfast', 'One Dish Meal') |
| servings      | Numeric, number of servings for the recipe                                                                     |
| high_traffic  | Character, if the traffic to the site was high when this recipe was shown, this is marked with “High”          |

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

# Load the recipe traffic dataset from the working directory and preview
# the first rows to check that the columns match the data dictionary.
df = pd.read_csv('recipe_site_traffic_2212.csv')
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

There are null values in four independent variables and in the target. For the target, the nulls are easy to handle thanks to the explanation in the data dictionary: high-traffic recipes are marked with 'High'. Let's convert the column to a boolean that is True when the value is 'High'.

In [None]:
# Binary target: True where the raw label is exactly 'High', False otherwise
# (missing labels compare unequal and therefore become False).
df['high_traffic'] = df['high_traffic'].eq('High')

Let's get a better understanding of what's going on with the other missing values.

In [None]:
# Look at the first 20 rows where calories is missing.
df.loc[df['calories'].isna()].head(20)

In [None]:
# Print each column's dtype and non-null count to spot columns that need
# type conversion.
df.info()

In [None]:
# Store the recipe category as a pandas categorical (unordered, categories
# inferred from the data).
df['category'] = df['category'].astype(pd.CategoricalDtype())

In [None]:
# Show the servings entries that cannot be parsed as a number.
servings_as_number = pd.to_numeric(df['servings'], errors='coerce')
df.loc[servings_as_number.isna(), 'servings']

In [None]:
# Some servings values carry an ' as a snack' suffix. Keep that information
# as a boolean flag, then strip the suffix and cast the remainder to int.
raw_servings = df['servings']
df['snack'] = raw_servings.str.contains('as a snack')
df['servings'] = raw_servings.str.replace(' as a snack', '').astype(int)

In [None]:
# Distribution of calories, split by recipe category through `hue`.
# NOTE(review): no `x` variable is given, so the categories are told apart
# only by colour/legend; confirm the installed seaborn version supports
# `hue` without `x`, or consider x='category'.
sns.boxplot(data=df, hue='category', y='calories')


In [None]:
# Distribution of carbohydrate, split by recipe category through `hue`.
# NOTE(review): no `x` variable is given, so the categories are told apart
# only by colour/legend; confirm the installed seaborn version supports
# `hue` without `x`, or consider x='category'.
sns.boxplot(data=df, hue='category', y='carbohydrate')

In [None]:
# Distribution of sugar, split by recipe category through `hue`.
# NOTE(review): no `x` variable is given, so the categories are told apart
# only by colour/legend; confirm the installed seaborn version supports
# `hue` without `x`, or consider x='category'.
sns.boxplot(data=df, hue='category', y='sugar')

In [None]:
# Distribution of protein, split by recipe category through `hue`.
# NOTE(review): no `x` variable is given, so the categories are told apart
# only by colour/legend; confirm the installed seaborn version supports
# `hue` without `x`, or consider x='category'.
sns.boxplot(data=df, hue='category', y='protein')

In [None]:
# Compare the share of high-traffic recipes between rows with and without
# a sugar value, to see whether missingness relates to the target.
sugar_missing = df['sugar'].isna()
df.groupby(sugar_missing)['high_traffic'].value_counts(normalize=True)

In [None]:
# Impute missing nutrient values with the mean of comparable recipes.
# Preference order: mean of the same (category, servings) group, then mean of
# the same category, then the overall column mean. The fallbacks only apply to
# rows whose narrower group has no observed value at all; without them those
# rows would stay NaN and break model fitting later on.
for missing_col in ['calories', 'sugar', 'protein', 'carbohydrate']:
    # All candidates are computed from the observed values before any filling,
    # so imputed numbers never feed into another mean.
    # observed=True: only use category levels that actually occur.
    fill_candidates = (
        df.groupby(['category', 'servings'], observed=True)[missing_col].transform('mean'),
        df.groupby('category', observed=True)[missing_col].transform('mean'),
        df[missing_col].mean(),
    )
    for candidate in fill_candidates:
        df[missing_col] = df[missing_col].fillna(candidate)

In [None]:
# Preview the first 10 rows to check the frame after the missing-value fill.
df.head(10)

In [None]:
# Apply log(1 + x) to the nutrient columns in one vectorised step.
nutrient_cols = ['calories', 'carbohydrate', 'sugar', 'protein']
df[nutrient_cols] = np.log1p(df[nutrient_cols])



In [None]:
from itertools import combinations_with_replacement

from sklearn.preprocessing import StandardScaler

# Feature engineering: add pairwise interaction terms between the numeric
# columns, standardise all numeric features, and one-hot encode the category.
scaler = StandardScaler()

numeric_cols = ['calories', 'sugar', 'protein', 'carbohydrate', 'servings']
numeric_cols_extra = []

# Multiplication is commutative, so 'a_b' and 'b_a' would be identical columns.
# Iterating over unordered pairs (squares included) keeps every distinct
# product exactly once and avoids feeding duplicated features to the models.
for numeric_col, numeric_col_2 in combinations_with_replacement(numeric_cols, 2):
    column_extra_name = numeric_col + '_' + numeric_col_2
    df[column_extra_name] = df[numeric_col] * df[numeric_col_2]
    numeric_cols_extra.append(column_extra_name)

# The products above are built from the unscaled values. StandardScaler
# standardises each column independently, so one fit over all columns yields
# the same values as separate fits while leaving `scaler` fitted on every
# column it transformed.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later; for a strict evaluation, fit it on the
# training rows only.
scaled_cols = numeric_cols + numeric_cols_extra
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

# One-hot encode the recipe category, dropping the first level as reference.
df = pd.get_dummies(df, columns=['category'], drop_first=True)


In [None]:
# Display the frame after scaling, interaction terms and one-hot encoding.
df

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Baseline comparison of a few hand-picked classifiers on a hold-out split.

# 'recipe' is a unique row identifier (see the data dictionary), not a
# property of the recipe, so it is excluded from the features together with
# the target.
X = df.drop(columns=['high_traffic', 'recipe'])
y = df['high_traffic'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = [LogisticRegression(penalty='l2', C=0.6, solver='liblinear'),
          LogisticRegression(penalty='l1', C=0.6, solver='saga'),
          LogisticRegression(),
          RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=5),
          RandomForestClassifier(n_estimators=500, max_depth=15, min_samples_split=3),
          ]

# For each model print, in order: the estimator, its training accuracy, its
# test accuracy and its test confusion matrix.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model)
    print(model.score(X_train, y_train))
    # scikit-learn metrics take (y_true, y_pred); with the arguments swapped
    # the confusion matrix comes out transposed (rows would be predictions).
    print(accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter tuning: exhaustive 5-fold grid search per model family,
# selected on cross-validated accuracy and then scored on the hold-out set.

# Base estimators, keyed by the name used in `param_grids` and `results`.
models = {
    'logistic_regression': LogisticRegression(solver='liblinear', max_iter=200),
    'random_forest': RandomForestClassifier(),
}

# Search space for each estimator.
param_grids = {
    'logistic_regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],   # regularization strength
        'penalty': ['l1', 'l2'],               # regularization type
        'class_weight': [None, 'balanced'],    # class-imbalance handling
    },
    'random_forest': {
        'n_estimators': [50, 100, 200, 300],     # trees in the forest
        'max_depth': [None, 10, 20, 30],         # depth limit per tree
        'min_samples_split': [2, 5, 10],         # samples needed to split a node
        'min_samples_leaf': [1, 2, 4],           # samples needed at a leaf
        'max_features': ['sqrt', 'log2', None],  # features tried per split
        'bootstrap': [True, False],              # bootstrap sampling on/off
        'class_weight': [None, 'balanced'],      # class-imbalance handling
    },
}

# Best configuration and scores per model name.
results = {}

for name, estimator in models.items():
    search = GridSearchCV(
        estimator,
        param_grid=param_grids[name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # GridSearchCV.score evaluates the refitted best estimator, so the test
    # accuracy belongs to the same configuration as the other entries.
    results[name] = {
        'best_params': search.best_params_,
        'best_score': search.best_score_,
        'best_estimator': search.best_estimator_,
        'test_accuracy': search.score(X_test, y_test),
    }

# Report one summary per model.
for name, summary in results.items():
    print(f"{name}:")
    print(f" Best Parameters: {summary['best_params']}")
    print(f" Best Cross-Validation Score: {summary['best_score']}")
    print(f" Test Accuracy: {summary['test_accuracy']}\n")