In [8]:
import pandas as pd
import numpy as np
import uuid
import joblib
from datetime import date

from lib.experience_data import pull_explanatory_variables, create_model, run_predictions
from lib.experience_data import process_data_for_analysis, get_latest_joblib
from lib.config import app, db
from lib.models import Hive, ModelHistory

Functions

Clean Hive Data for Study

In [2]:
with app.app_context():
    hives = [hive.to_dict() for hive in Hive.query.all()]

In [3]:
df_normalized, df_aggregated = process_data_for_analysis(hives)

df_aggregated.head()

Unnamed: 0,hive_id,date_added,city,state,honey_pull_id,date_reset,weight,date_pulled,count,temp,...,wasps_hornets_present,mice_present,robber_bees_present,has_chalkbrood,has_twisted_larvae,material_Polystyrene,material_Wood,days,avg_daily_weight,avg_30_day_weight
0,528,2024-09-02,West Stacey,Michigan,1625.0,2024-09-02,11.961772,2024-12-31,17,13.876471,...,0,4,3,2,3,17,0,120,0.099681,2.990443
1,530,2024-06-17,North Christina,Idaho,1628.0,2024-06-17,23.082096,2024-11-12,21,21.947619,...,1,7,2,3,3,21,0,148,0.15596,4.678803
2,532,2023-09-19,Leachfort,Ohio,1631.0,2023-09-19,4.137755,2023-12-18,12,10.841667,...,0,6,5,0,2,0,0,90,0.045975,1.379252
3,532,2023-09-19,Leachfort,Ohio,1632.0,2024-01-01,8.544841,2024-04-26,16,9.34375,...,1,7,3,1,2,0,0,116,0.073662,2.209873
4,532,2023-09-19,Leachfort,Ohio,1633.0,2024-05-05,12.918446,2024-08-19,15,26.306667,...,0,4,2,3,2,0,0,106,0.121872,3.656164


Run Study and Save to Joblib

In [4]:
explanatory_variables=pull_explanatory_variables(df_aggregated)
joblib_loc = f'exp_study{uuid.uuid4().hex}.joblib'

model, test_results, importance_df = create_model(df_aggregated, explanatory_variables, joblib_loc)

results = test_results[['weight', 'avg_daily_weight', 'days', 'avg_predicted', 'predicted']].copy()

results

Unnamed: 0,weight,avg_daily_weight,days,avg_predicted,predicted
8,3.744086,0.035658,105,0.057929,6.082525
16,21.518998,0.128856,167,0.093025,15.535109
0,11.961772,0.099681,120,0.155183,18.621947
23,20.84129,0.16673,125,0.115106,14.388205
11,20.664264,0.138686,149,0.06776,10.096269
9,20.006495,0.148196,135,0.165206,22.302802
13,15.32567,0.107927,142,0.17147,24.348678
1,23.082096,0.15596,148,0.063347,9.375374


Save Joblib location to Database

In [5]:
with app.app_context():
    # Save metadata for new model to database
    new_study = ModelHistory(
        joblib_loc = joblib_loc
    )

    # Add the new study to the database and commit
    db.session.add(new_study)
    db.session.commit()

Update active model in Database

In [6]:
with app.app_context():
    prior_model = ModelHistory.query.filter(
        ModelHistory.end_date == None, 
        ModelHistory.start_date != None
    ).first()

    # Set the end date
    prior_model.end_date = date.today()

    current_model = ModelHistory.query.filter_by(
        joblib_loc=joblib_loc
    ).first()

    # Set the start and end dates
    current_model.start_date = date.today()
    current_model.end_date = None

    # Commit changes to the database
    db.session.commit()

In [9]:
joblib_loc = get_latest_joblib()
df_normalized, df_prediction_input = process_data_for_analysis(hives, actuals=False)
predicted_values = run_predictions(df_prediction_input, joblib_loc)
predictions_only = predicted_values[['hive_id', 'predicted']].set_index('hive_id')

prediction_dict = predictions_only.to_dict()
prediction_dict

TypeError: tuple indices must be integers or slices, not list

Testing

In [None]:
hives_mod = dclean.rename_ids(hives)
print('renamed ids...')

df_normalized = dclean.normalize_data(hives_mod)
print('df normalized...')

df_aggregated = dclean.aggregate_data(df_normalized)
print('df aggregated...')

json_normalized = df_normalized.to_dict(orient='list')
print('json normalized...')

json_aggregated = df_aggregated.to_dict(orient='list')
print('json aggregated...')

renamed ids...
df normalized...
df aggregated...
json normalized...
json aggregated...
