# Report: correlation between underweight/overweight percentages and cooking loss (before vs. after cooking)

In [5]:
import numpy as np
import pandas as pd

## Loading the Underweight/Overweight data

In [7]:
# Load the weight-check CSV, parse the packing date into datetimes, and
# discard the expiry-date and average-weight columns.
under_over_data = (
    pd.read_csv('faith_under_over_weight.csv')
    .assign(**{'DATE OF PACKING': lambda frame: pd.to_datetime(frame['DATE OF PACKING'])})
    .drop(columns=['EXPIRY DATE', 'AVERAGE WEIGHT (g)'])
)
print(under_over_data)

    DATE OF PACKING  Positive Percentage  Negative Percentage
0        2023-10-02               81.250                0.000
1        2023-10-03               96.875                3.125
2        2023-10-03              100.000                0.000
3        2023-10-04               87.500                0.000
4        2023-10-04               84.375               15.625
..              ...                  ...                  ...
408      2024-10-18               59.375               21.875
409      2024-10-18               56.250               40.625
410      2024-10-22                3.125               96.875
411      2024-10-22               81.250               15.625
412      2024-10-24               84.375               15.625

[413 rows x 3 columns]


In [13]:
# Average the positive/negative percentages over all rows that share a packing date.
average_percentages = (
    under_over_data
    .groupby('DATE OF PACKING', as_index=False)[['Positive Percentage', 'Negative Percentage']]
    .mean()
)
# Re-label the packing date as 'BATCH OUT OF STORAGE' (the key used by the later
# merge); popping and re-assigning moves the column to the end of the frame.
average_percentages['BATCH OUT OF STORAGE'] = pd.to_datetime(
    average_percentages.pop('DATE OF PACKING')
)

## Loading the loss data

In [10]:
# Load the per-batch cooking-loss CSV and parse the production date into datetimes.
loss_percentage = pd.read_csv('faith_cooking_batch_loss.csv').assign(
    **{'PRODUCTION DATE': lambda frame: pd.to_datetime(frame['PRODUCTION DATE'])}
)

In [11]:
# Load the storage CSV; both date columns are parsed with the same
# month/day/year format.
stored_data = pd.read_csv('faith_store.csv')
for date_column in ('PRODUCTION DATE', 'BATCH OUT OF STORAGE'):
    stored_data[date_column] = pd.to_datetime(stored_data[date_column], format='%m/%d/%Y')

In [12]:
# Attach the storage record to each cooking-loss row; only rows whose
# (batch number, production date) pair exists in both tables are kept.
# NOTE(review): the printed result has ~15k rows while the batch numbers shown
# span roughly 1637-4183, so one input probably holds several rows per key --
# confirm that this row multiplication is intended.
merged_data = loss_percentage.merge(
    stored_data,
    how='inner',
    on=['BATCH no.', 'PRODUCTION DATE'],
)
print(merged_data)

       BATCH no. PRODUCTION DATE  BATCH WEIGHT (kg) BEFORE COOKING  \
0           1637      2023-10-09                             293.3   
1           1638      2023-10-09                             283.5   
2           1639      2023-10-09                             279.0   
3           1640      2023-10-09                             287.0   
4           1641      2023-10-09                             286.8   
...          ...             ...                               ...   
15037       4178      2024-10-04                             271.5   
15038       4180      2024-10-04                             270.8   
15039       4181      2024-10-04                             260.1   
15040       4182      2024-10-04                             257.2   
15041       4183      2024-10-04                             172.7   

       BATCH WEIGHT (kg) AFTER COOKING  LOSS PERCENTAGE BATCH OUT OF STORAGE  \
0                                286.0         2.488919           2023-10-09   

In [14]:
# Attach the per-date average weight-check percentages to every batch row,
# matching on the date the batch left storage (inner join: batches without a
# matching packing date are dropped).
#
# This cell overwrites `merged_data`, so it must be safe to run more than once.
# Without the `drop`, a second run would merge the averages again and pandas
# would rename the overlapping columns with `_x`/`_y` suffixes, making the later
# `merged_data["Negative Percentage"]` lookups fail. Dropping any previously
# merged average columns first keeps the cell idempotent; on the first run the
# drop is a no-op.
join_key = 'BATCH OUT OF STORAGE'
stale_columns = [column for column in average_percentages.columns if column != join_key]
merged_data = pd.merge(
    merged_data.drop(columns=stale_columns, errors='ignore'),
    average_percentages,
    on=join_key,
    how='inner',
)

## Figure out the correlation of the data

In [15]:
from scipy.stats import pearsonr, spearmanr

# Correlate the cooking loss with the 'Negative Percentage' column.
loss_pct = merged_data["LOSS PERCENTAGE"]
negative_pct = merged_data["Negative Percentage"]

# Both functions return (coefficient, p-value); only the coefficient is kept.
pearson_corr, _ = pearsonr(loss_pct, negative_pct)
spearman_corr, _ = spearmanr(loss_pct, negative_pct)

print(f"Pearson correlation: {pearson_corr:.2f}")
print(f"Spearman correlation: {spearman_corr:.2f}")

Pearson correlation: -0.01
Spearman correlation: 0.02


In [16]:
# Correlate the cooking loss with the 'Positive Percentage' column.
loss_pct = merged_data["LOSS PERCENTAGE"]
positive_pct = merged_data["Positive Percentage"]

# Both functions return (coefficient, p-value); only the coefficient is kept.
pearson_corr, _ = pearsonr(loss_pct, positive_pct)
spearman_corr, _ = spearmanr(loss_pct, positive_pct)

print(f"Pearson correlation: {pearson_corr:.2f}")
print(f"Spearman correlation: {spearman_corr:.2f}")

Pearson correlation: 0.02
Spearman correlation: -0.02


## Conclusion

There is no meaningful correlation between the loss during the cooking process and the underweight/overweight percentages at the last stage: every Pearson and Spearman coefficient computed above is within ±0.02 of zero.

The factory needs to provide more information about the process, as well as more statistical data.