In [1]:
# Both values are passed to read_config() further down to select the
# configuration this analysis runs against.
config_type = 'website_content'
config_name = 'mac_and_cheese_long'


In [2]:
import json

import pandas as pd
import statsmodels.api as sm

from product_meta_analysis.analyze.regression import one_hot_encode_recipes
from product_meta_analysis.database.database import Database
from product_meta_analysis.utils import read_config


In [3]:
# Load the configuration selected by config_type / config_name and pull out the
# two entries of its 'match' section that the rest of the notebook uses.
config = read_config(config_type, config_name)
# Passed to one_hot_encode_recipes() when the feature matrix is built.
ingredients = config.get('match').get('ingredients')
# Iterated by combine_categories() as {target column: [columns merged into it]}.
categories_to_combine = config.get('match').get('combine')

In [4]:
# Fetch every non-null recipe-card ingredient payload from the content table.
db = Database()
query = """         
    select
        url_id,
        content
    from website_content
    where content_type = 'schema_recipe_card_ingredients'
        and content is not null
    """

# NOTE(review): the first label is spelled 'urd_id' although the query selects
# url_id. It is left as-is because other cells may already refer to this name.
data = pd.DataFrame(db.read(query), columns=['urd_id', 'content'])

# Each payload is stored as a JSON string; decode it into Python objects.
data['content'] = data['content'].map(json.loads)
data.head()

Unnamed: 0,urd_id,content
0,f4c88b0bf29044fc38ba48198cd7f1efde4e4ac768c001...,"{'ingredients': [{'name': None, 'amount': None..."
1,427c694669ec9066babdf14186f394d0cbbe892f94f007...,"{'ingredients': [{'name': None, 'amount': None..."
2,0f8746bfbca22f1fc21b8a34a9675ced476d77124ff1ed...,"{'ingredients': [{'name': None, 'amount': None..."
3,70a19651b5c117775ae4fa066a1ff85d20112b8eb5b46d...,"{'ingredients': [{'name': None, 'amount': None..."
4,f4d01f1b9c6ec3b3c7df86af0f4051d7c8164f0f631152...,"{'ingredients': [{'name': None, 'amount': None..."


### Encode data

In [5]:
def format_outcome(data):
    """Attach a numeric ``outcome`` column holding each recipe's rating.

    The rating is read from ``content['rating']['rating']`` for every row.
    Rows whose rating is missing (no rating entry, ``None``) or equal to the
    literal string ``'None'`` are dropped, and the remaining ratings are cast
    to float.

    Args:
        data: DataFrame with a ``content`` column of dict-like recipe payloads.

    Returns:
        A new DataFrame containing only the rows that have a usable rating,
        with an added float ``outcome`` column. The original row index labels
        are kept. The input frame is not modified.
    """
    # A recipe without a 'rating' entry yields None here instead of raising.
    ratings = [(x.get('rating') or {}).get('rating') for x in data['content']]

    # Work on a copy so the caller's frame does not silently gain a column.
    rated = data.copy()
    rated['outcome'] = ratings

    has_rating = (rated['outcome'] != 'None') & rated['outcome'].notnull()
    # Explicit copy: the cast below then writes to an independent frame rather
    # than a slice, which is what triggered pandas' SettingWithCopyWarning.
    rated = rated.loc[has_rating].copy()
    rated['outcome'] = rated['outcome'].astype(float)
    return rated

# TODO: A similar calculation is performed in the general aggregate counts notebook;
# maybe move this earlier in the process, or don't do it at all.
def combine_categories(data, categories_to_combine):
    """Merge groups of indicator columns into a single column per group.

    For each ``target -> members`` entry, the ``members`` columns are summed
    row-wise, the sum is capped at 1, and the result is stored in ``target``.
    Every member column other than ``target`` itself is then dropped.

    Args:
        data: DataFrame of per-recipe indicator columns (presumably 0/1
            one-hot values -- the cap at 1 only makes sense under that
            assumption).
        categories_to_combine: Mapping of target column name to the list of
            column names to merge into it. ``None`` or an empty mapping means
            nothing is merged.

    Returns:
        A new DataFrame with the merged columns. The input frame is not
        modified.

    Raises:
        KeyError: If a listed member column is not present in ``data``.
    """
    # Copy first so neither the merge nor any bookkeeping leaks into the
    # caller's frame.
    combined = data.copy()
    for target, members in (categories_to_combine or {}).items():
        # clip(upper=1) caps the row-wise sum directly, so no scratch column
        # is needed and a genuine column named 'max' is left untouched.
        combined[target] = combined[members].sum(axis=1).clip(upper=1)
        redundant = [col for col in members if col != target]
        combined = combined.drop(redundant, axis=1)
    return combined
   
def remove_empty_columns(data):
    """Keep only the columns whose values actually vary.

    A column is dropped when its standard deviation is exactly zero, i.e. it
    holds the same value in every row. Columns whose standard deviation is
    NaN are kept, because NaN compares as not equal to zero.

    Args:
        data: DataFrame of feature columns.

    Returns:
        ``data`` restricted to the columns with a non-zero standard deviation,
        in their original order.
    """
    varies = data.std() != 0
    keep = [column for column, flag in varies.items() if flag]
    return data[keep]


In [6]:
# Keep only the recipes that have a usable rating; that rating is the outcome.
data_ = format_outcome(data)
outcome = data_['outcome']

# Encode the ingredients, merge the configured categories, then drop the
# columns that carry no variation.
features = (
    one_hot_encode_recipes(data_['content'].tolist(), ingredients)
    .pipe(combine_categories, categories_to_combine)
    .pipe(remove_empty_columns)
)

# Column totals of the final feature matrix.
features.sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome'] = data['outcome'].astype(float)


cheddar       73
swiss          1
gruyere        8
fontina        4
parmesan      14
jack          17
american       5
mozzarella    13
dtype: int64

### Build regression model

In [7]:
# Design matrix: the ingredient indicators as floats, with an intercept column
# added by add_constant. It stays a plain NumPy array, as before.
f = sm.add_constant(features.astype(float).to_numpy())
reg = sm.OLS(outcome, f)
results = reg.fit()

# A bare array carries no column labels, so the summary would list the
# coefficients as x1..xN. Pass the ingredient names explicitly so each
# coefficient can be read against its ingredient. The names are only used when
# they line up with the design matrix (presumably add_constant leaves the array
# unchanged if it already holds a constant column -- in that case fall back to
# the default labels).
coef_names = ['const'] + [str(name) for name in features.columns]
print(results.summary(xname=coef_names if len(coef_names) == f.shape[1] else None))

                            OLS Regression Results                            
Dep. Variable:                outcome   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                 -0.099
Method:                 Least Squares   F-statistic:                   0.09909
Date:                Mon, 04 Oct 2021   Prob (F-statistic):              0.999
Time:                        21:55:04   Log-Likelihood:                -289.42
No. Observations:                  81   AIC:                             596.8
Df Residuals:                      72   BIC:                             618.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.9903      3.501      1.425      0.1

### Examine diagnostic plots