## Data Analysis Mathematics, Algorithms and Modeling

# AI Powered Recipe Recommendation System 

### Team : Group 3
| Student No  | First Name                  | Last Name     |
|-------------|-----------------------------|---------------|
| 9041129     | Nidhi                       | Ahir          |
| 9016986     | Keerthi                     | Gonuguntla    |
| 9027375     | Khushbu                     | Lad           |

#### Introduction

In the next phase of the recipe recommendation system, data on user feedback and ratings is combined with the existing recipe data in order to gain insights into users' preferences and their engagement with recipes. This will help identify correlations between recipe characteristics and user preferences.

### Rectangular dataset : RAW_interactions.csv

Import Libraries

In [28]:
import numpy as np
import pandas as pd 
from scipy import stats
import matplotlib.pyplot as plt
import scipy.stats as zscore

#### Representing the new data set in classes and methods

In [None]:
class RawRecipe:
    """Loader for the raw recipe dataset stored as a CSV file.

    ``file_path`` is the CSV location and ``data`` holds the loaded
    DataFrame (``None`` until ``load_data`` has been called).
    """

    def __init__(self, file_path='./Dataset/RAW_recipes.csv'):
        """Remember where the CSV lives; nothing is read yet.

        Args:
            file_path: Path to the recipes CSV. Defaults to the location
                used by this notebook, so ``RawRecipe()`` keeps working.
        """
        self.file_path = file_path
        self.data = None

    # Loads the data from a CSV file.
    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.data``.

        Prints a short progress report with the number of rows read.

        Returns:
            The loaded DataFrame (the same object stored in ``self.data``).
        """
        import os  # local import: only needed to report the file name

        self.data = pd.read_csv(self.file_path)
        print("---> STEP 1 : Loads the data from a CSV file. \r\n")
        # Report the file that was actually read rather than a fixed name.
        print(f"{os.path.basename(self.file_path)} : Data loaded successfully.")
        print(f"Total Records : {self.data.shape[0]} \r\n")
        return self.data
    
class RecepeInteraction:
    """Loader and basic data-quality checks for the user interaction CSV.

    ``file_path`` is the CSV location and ``data`` holds the loaded
    DataFrame (``None`` until ``load_data`` has been called).
    """

    def __init__(self, file_path='./Dataset/RAW_interactions.csv'):
        """Remember where the CSV lives; nothing is read yet.

        Args:
            file_path: Path to the interactions CSV. Defaults to the
                location used by this notebook.
        """
        self.file_path = file_path
        self.data = None

    # Loads the data from a CSV file.
    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.data``.

        Prints a short progress report with the number of rows read.

        Returns:
            The loaded DataFrame (the same object stored in ``self.data``).
        """
        import os  # local import: only needed to report the file name

        self.data = pd.read_csv(self.file_path)
        print("---> STEP 1 : Loads the data from a CSV file. \r\n")
        # Report the file that was actually read rather than a fixed name.
        print(f"{os.path.basename(self.file_path)} : Data loaded successfully.")
        print(f"Total Records : {self.data.shape[0]} \r\n")
        return self.data

    def view_sample_data(self):
        """Return the first five rows of the loaded data.

        The original computed ``head(5)`` and discarded it, so callers
        always received ``None``; the sample is now returned.

        Returns:
            A DataFrame with up to five rows, or ``None`` (after printing
            a message) when no data has been loaded.
        """
        if self.data is None:
            # Same "not loaded" handling as the quality checks below.
            print("Data not loaded.")
            return None
        return self.data.head(5)

    # Data quality : Null Check
    def check_null_values(self):
        """Print and return the number of null values in each column.

        Returns:
            A Series of null counts indexed by column name, or ``None``
            when no data has been loaded.
        """
        print("---> STEP 2 : Null Check for data \r\n")
        if self.data is None:
            print("Data not loaded.")
            return None
        nulls = self.data.isnull().sum()
        print(nulls)
        return nulls

    # Data quality : Duplicate Check
    def check_duplicate_values(self):
        """Print and return the ``recipe_id`` values that occur more than once.

        Returns:
            A DataFrame with columns ``recipe_id`` and ``Count``, one row
            per id appearing in more than one row, or ``None`` when no
            data has been loaded.
        """
        print("\r\n---> STEP 3 : Duplicate data Check for recepe \r\n")
        if self.data is None:
            print("Data not loaded.")
            return None
        counts = self.data["recipe_id"].value_counts()
        dupl = counts[counts > 1].reset_index()
        # Name the columns explicitly; reset_index's default names vary
        # between pandas versions.
        dupl.columns = ["recipe_id", "Count"]
        print(dupl)
        return dupl

if __name__ == "__main__":

    # Create an instance of the RecepeInteraction class and load the
    # user interaction data (ratings and reviews) from its CSV file.
    interactionData = RecepeInteraction()
    interactionData.load_data()

    # Create an instance of the RawRecipe class and load the recipe data
    # from its CSV file.
    recepeData = RawRecipe()
    recepeData.load_data()


In [None]:
# Preview the first five rows of the loaded interaction data.
interactionData.data.head(5)

This dataset appears to contain reviews and ratings for various recipes. Here's a breakdown of each column:

**user_id:** Unique identifier for the user who provided the rating/review.

**recipe_id:** Unique identifier for the recipe being rated/reviewed.

**date:** Date when the rating and review were provided.

**rating:** Numerical rating (on a scale of 0 to 5) given to the recipe.

**review:** User's textual review providing additional feedback or modifications to the recipe.

In [None]:

# Check for missing values: prints and returns the null count per column.
interactionData.check_null_values()

# Check duplicate values: prints and returns every recipe_id that occurs in
# more than one interaction row, together with its occurrence count.
interactionData.check_duplicate_values()

In [None]:
# Join every interaction onto its recipe: the recipe table's 'id' is matched
# against the interaction table's 'recipe_id'. The join is an inner join
# (pandas' default), so only ids present in both tables are kept.
merged_data = pd.merge(
    left=recepeData.data,
    right=interactionData.data,
    how='inner',
    left_on='id',
    right_on='recipe_id',
)
print("Data Merged Successfully")
merged_data.head(2)

### Hypothesis : Recipe rating is proportional to preparation time, i.e. the longer the preparation time, the higher the rating a recipe gets

### QQ Normal Plot

In [None]:


# Series used for the QQ plots below: preparation time comes from the recipe
# table and the rating from the interaction table (the un-merged data).
minutes = recepeData.data["minutes"] 
rating= interactionData.data["rating"]

def create_qq_plot(ax, data, title, line_color, line_width, point_color, point_size):
    """Draw a normal Q-Q plot of ``data`` on the axes ``ax``.

    The sample is compared against a normal distribution with
    ``probplot`` (``zscore`` is this notebook's alias for ``scipy.stats``).
    The ordered sample values are scattered against the theoretical
    quantiles and the least-squares fit line is drawn on top.

    Args:
        ax: Axes-like object to draw on.
        data: Sample values to compare against the normal distribution.
        title: Name of the variable, used in the title and axis labels.
        line_color: Colour of the fit line.
        line_width: Width of the fit line.
        point_color: Colour of the scattered data points.
        point_size: Marker size of the scattered data points.
    """
    # probplot yields (theoretical quantiles, ordered values) and the
    # (slope, intercept, r) of the straight line fitted through them.
    quantile_pairs, fit_params = zscore.probplot(data, dist="norm")
    theoretical, observed = quantile_pairs
    slope, intercept = fit_params[0], fit_params[1]
    fitted = slope * theoretical + intercept

    # Points first, then the fit line, so the legend lists them in that order.
    ax.scatter(theoretical, observed, color=point_color, s=point_size, label='Data Points')
    ax.plot(theoretical, fitted, color=line_color, lw=line_width, label='Fit Line')

    # Title and axis labels carry the variable name.
    ax.set_title(f'QQ Plot of {title}')
    ax.set_xlabel('Theoretical Quantiles ' + title)
    ax.set_ylabel('Sample Quantiles ' + title)

    ax.legend()
    ax.grid(True)

# One figure with two side-by-side subplots, one per variable.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns

# QQ Plot for minutes (left subplot)
create_qq_plot(axes[0],minutes,'minutes',"blue",1,"skyblue",3)

# QQ Plot for Rating (right subplot)
create_qq_plot(axes[1],rating, 'Rating',"red",1,"yellow",5)

## Correlation 


In [None]:
# Correlation between preparation time and rating on the merged data
# (Series.corr uses the Pearson coefficient by default).
correlation = merged_data['minutes'].corr(merged_data['rating'])

# Name the variables actually compared instead of generic placeholders.
print(f'Correlation coefficient between minutes and rating: {correlation}')

# Optionally, visualize the relationship using a scatter plot
# sns.scatterplot(x='id', y='rating', data=merged_data)
# plt.title('Scatter Plot of id vs rating')
# plt.xlabel('id')
# plt.ylabel('ratings')
# plt.show()

# Full correlation matrix: every column is first coerced to numeric, and
# values that cannot be parsed become NaN.
df = merged_data.apply(pd.to_numeric, errors='coerce')
corr_mat = df.corr()
print(corr_mat)

In [None]:
# Z-score of every rating in the merged data. `zscore` is this notebook's
# alias for the scipy.stats module, so this calls scipy.stats.zscore.
# The variable keeps its original name for any later cell that uses it,
# although the values are computed on 'rating', not on 'steps'.
Zscore_steps = zscore.zscore(merged_data['rating'])
# The label now names the column actually standardised.
print("Z-Score for the rating\n ", Zscore_steps)

In [None]:
# One-sample t statistic of the mean rating against a reference mean.
ratings = merged_data['rating']
n = len(ratings)
sample_mean = ratings.mean()
sample_std = ratings.std()

# Specify the population mean
population_mean = 0

# t = (sample mean - population mean) / (sample std / sqrt(n))
standard_error = sample_std / np.sqrt(n)
t_score = (sample_mean - population_mean) / standard_error

print(f"T-score for {'rating'}: {t_score}")

### Shapiro-Wilk Normality Test

In [None]:
# Shapiro-Wilk normality test for the two variables of the hypothesis.
#
# The original looped over every numeric column but re-tested the same
# hard-coded column on each iteration, repeating identical work once per
# column. Each variable is now tested exactly once.
#
# NOTE: SciPy documents that for samples larger than 5000 the reported
# p-value may not be accurate; interpret it with care on a large dataset.

# Select only numeric columns for the Shapiro-Wilk test
numeric_columns = merged_data.select_dtypes(include='number').columns

# Map column name -> p-value of the normality test
shapiro_results = {}

for column in ('rating', 'minutes'):
    if column not in numeric_columns:
        continue  # the test only applies to numeric data
    # Use dropna() to remove NaN values before testing
    statistic, p_value = stats.shapiro(merged_data[column].dropna())
    shapiro_results[column] = p_value

# Convert results to DataFrame for better visualization; the first column
# holds the name of the tested variable.
shapiro_results_df = pd.DataFrame(list(shapiro_results.items()), columns=['column', 'p_value'])

# Display the Shapiro-Wilk test results
print(shapiro_results_df)

#### Here the p-values are less than 0.05 for both ratings and minutes, so we reject the null hypothesis of normality: our data is not normally distributed.


### F-TEST :

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Compare the variances of preparation time and rating on the merged data.
# NOTE: although this section is titled "F-TEST", the statistic is computed
# with Levene's test for equality of variances, so it is labelled as such.

# Extract the data to be tested
group1 = merged_data['minutes']
group2 = merged_data['rating']

# Perform Levene's test for equality of variances
f_statistic, p_value = stats.levene(group1, group2)

# Display the results
print(f"Levene statistic: {f_statistic}")
print(f"P-value: {p_value}")

# Interpret the results
alpha = 0.05  # Significance level
if p_value <= alpha:
    print("Reject the null hypothesis: Variances are significantly different.")
else:
    # A non-significant result never proves the null hypothesis; it only
    # fails to reject it (same wording as the Wilcoxon test cell).
    print("Fail to reject the null hypothesis: Variances are not significantly different.")


### Conclusion : 

### Wilcoxon Test

In [None]:
# Paired samples taken row by row from the merged data.
data1, data2 = merged_data['minutes'], merged_data['rating']

# Perform Wilcoxon Signed-Rank Test on the paired values
stat, p_value = stats.wilcoxon(data1, data2)

print('Wilcoxon Signed-Rank Test Statistic:', stat)
print('P-value:', p_value)

# Interpretation at the 5% significance level
alpha = 0.05
verdict = (
    "Reject the null hypothesis: There is a significant difference between the two related samples."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There is no significant difference between the two related samples."
)
print(verdict)

## CONCLUSION

Ratings are NOT proportional to preparation time.