In [1]:
import pandas as pd
import numpy as np


import matplotlib.patches as mpatches
import session_config
from session_config import  collect_survey_data, feature_variables, agg_groups
from reports import make_report_objects
from reports import histograms_standard
from reports import ecdf_plots_standard, scatter_plot_standard
# from reports import labels_for_display, 

from reports import report_meta_data
# import userdisplay
# import geospatial
import gridforecast as gfcast
# import datetime as dt
from IPython.display import Markdown
from scipy.stats import multinomial

import openai
from dotenv import load_dotenv
import os
from myst_nb import glue

from linearmethods import LinearMethods
from roughdraft import ReportTexts, messages_for_chat_completion, use_chat_completion

# load_dotenv()
# api_key = os.getenv('OPENAI_API_KEY')
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# load_dotenv() runs before the key lookup so the lookup sees whatever it adds
# to the process environment.
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')

# Survey records come from the project helper; the code reference table is
# read from disk and indexed by its 'code' column.
datax = collect_survey_data()
refcodes = (
    pd.read_csv('data/end_process/codes.csv')
    .set_index('code')
)

# Groups of object codes used to select survey records, plus the land-cover
# feature names. Each list is a plain list of code strings.

# from use_cases example
ooi = ['G10',  'G30', 'G31', 'G33', 'G34', 'G35', 'G8', 'G7', 'G6', 'G5', 'G4', 'G37', 'G2', 'G27', 'G25', 'G26', 'G11']
# more refined search
tobo_snacks = ['G27', 'G30', 'G35']

fragmented = ['Gfrags', 'Gfoams']

# 'G91' was listed twice; the duplicate is dropped so each code appears once.
# NOTE(review): if the second 'G91' was a typo for another code, add that code here.
steps = ['G144', 'G96', 'G95', 'G91', 'G97', 'G98', 'G100', 'G99']

indus = ['G89', 'G67', 'G112', 'G93' , 'G66','G74', 'G72', 'G87', 'G65', 'G69', 'G68', 'G43', 'G41', 'G38', 'G36', 'G19', 'G17', 'Gfrags']

# features
land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'recreation', 'streets']

In [2]:
# --- Report selection -------------------------------------------------------
# Date window, candidate boundaries and the index of the canton to report on.
start = '2020-04-01'
end = '2021-05-01'
cantons = ['Bern', 'Vaud', 'Genève', 'Valais', 'Zürich', 'Neuchâtel', 'Solothurn', 'Glarus', 'Schwyz']
lakes = ['lac-leman', 'neuenburgersee', 'zurichsee', 'bielersee', 'walensee', 'thunersee', 'brienzersee']
item = 1
name = cantons[item]

# No named feature is selected; the report is bounded by canton.
feature_name = None
feature_type = 'l'
boundary = 'canton'
boundary_name = cantons[item]
codes = steps
columns_of_interest = ['public-services', 'buildings', 'forest', 'undefined', 'vineyards', 'orchards', 'streets', 'recreation']

# Keyword arguments forwarded unchanged to report_meta_data.
args = {
    'data': datax,
    'start': start,
    'end': end,
    'name': name,
    'feature_name': feature_name,
    'feature_type': feature_type,
    'boundary': boundary,
    'boundary_name': boundary_name,
    'codes': codes,
}
data_of_interest = report_meta_data(**args)

# The 'dataframe' entry feeds the survey and land-use report objects.
all_report, all_land_use = make_report_objects(
    data_of_interest['dataframe'],
    info_columns=['canton', 'city', 'feature_name'],
)

# Constructor arguments for LinearMethods: the report metadata (name, start,
# end), both report objects and the land-use columns to consider.
linear_method_args = dict(
    name=data_of_interest['meta']['name'],
    start=data_of_interest['meta']['start'],
    end=data_of_interest['meta']['end'],
    survey_report=all_report,
    landuse_report=all_land_use,
    columns_of_interest=columns_of_interest,
    report_meta=data_of_interest['meta'],
)

regression_ml = LinearMethods(**linear_method_args)

# `model` is assigned here but not referenced in the visible cells.
model = 'gpt-4o-mini'
# OpenAI SDK client handed to ReportTexts; a later cell rebinds `client`
# to a ChatOpenAI instance.
client = openai.OpenAI()

# Constructor arguments for ReportTexts: same report inputs as the linear
# methods, plus the client.
report_args = dict(
    name=data_of_interest['meta']['name'],
    start=data_of_interest['meta']['start'],
    end=data_of_interest['meta']['end'],
    survey_report=all_report,
    landuse_report=all_land_use,
    client=client,
    report_meta=data_of_interest['meta'],
    columns_of_interest=columns_of_interest,
)

firstdraft = ReportTexts(**report_args)
firstdraft.chat = False

# Persist the rough draft; the next cell reads this file back in.
# NOTE(review): no explicit encoding is given here or on the read side, so both
# use the platform default — confirm that is acceptable for names such as 'Genève'.
with open('rough_draft.md', 'w') as file:
    file.write(firstdraft.string_rep(datax))

['public-services', 'buildings', 'forest', 'undefined', 'vineyards', 'orchards', 'streets', 'recreation']
['public-services', 'buildings', 'forest', 'undefined', 'vineyards', 'orchards', 'streets', 'recreation']


In [3]:
# From here on `client` is a LangChain ChatOpenAI model (it was an
# openai.OpenAI client in the previous cell).
client = ChatOpenAI(model='gpt-4')

# Load the rough draft written by the previous cell; read mode is the default.
with open('rough_draft.md') as file:
    mdf = file.read()

def make_system_prompt(roughdraft, language='English'):
    """Build the system prompt that embeds the rough draft.

    Parameters
    ----------
    roughdraft : str
        Text of the rough draft; appended verbatim at the end of the prompt.
    language : str, default 'English'
        Language the model is told to answer in.

    Returns
    -------
    str
        The instruction sentences followed by two newlines and the draft.
    """
    # Adjacent string literals are joined with no separator, so every sentence
    # carries its own terminating period and trailing space.
    system_prompt = (
        "You are a researcher assigned the task of preparing a manuscript from a rough draft. "
        "You are tasked to produce an executive summary, and answer specific questions. "
        "Do not give your opinion. Do not use phrases like 'this report provides' or 'this study correlates' or 'according to the text' or 'this document' or anything like that. "
        "State what the subject is, Answer the question completely and check your answers, use an authoritative voice. "
        "You must give numerical examples from the report when you answer a question about the survey results. "
        "You must reference the section where the response comes from. "
        "There are instructions in the document for analysis the instructions are labeled <!--- INSTRUCTION_START "
        f"All answers should be in {language} and in paragraph form. Here is the document:\n\n"
        f"{roughdraft}"
    )
    return system_prompt
    

def make_summary_prompt(language):
    """Build the user prompt requesting the executive summary and two FAQ answers.

    Parameters
    ----------
    language : str
        Language named in the 'Reply in ...' line of the prompt.

    Returns
    -------
    str
        The prompt text: required summary contents, the two questions and the
        formatting instructions.
    """
    # Spelling in the instructions is corrected ("number", "results", "Label");
    # "Label" now matches the wording used by the other section prompts.
    user_prompt = f"""\n
        
    Provide a summary of the following sections: Administrative boundaries, Named features, Summary statistics, Municipal results and material composition.
    
    The following must be included in the executive summary:

    * The Name and number of the cities, lakes, rivers or parks in the report : Instructions: If the canton is provided start with 'The survey report considers the litter density in <name the canton>', then name the cities, then the lakes, rivers and parks.
    * The start and end date of the sampling
    * The number of samples, the average pcs/m, the maximum pcs/m and the total number of objects identified
    * The material composition
    * The five most common items, consider the fail-rate in the inventory table, cite the fail rate and the percent of total for these
    * The table of values of municipal results
   
    
    Answer the following questions:
    
        * Given the report results, what are the three cities with the highest average pcs/m ? 
        * Given the report results, what city had the lowest average pcs/m ?
      
    Reply in {language}

    formatting instructions:

    1. Label the summary 'Sample results' (##)
    2. The label for the questions section is 'Frequently asked questions' use markdown formatting for the label (###)
    3. Repeat the question (in bold) DO NOT REPEAT the instructions. and then answer
    """
    return user_prompt

# Pair the system prompt (rough draft embedded) with the summary request,
# send both through the chat client and keep the text of the reply.
messages = messages_for_chat_completion(
    system_prompt=make_system_prompt(mdf),
    user_prompt=make_summary_prompt('English'),
)
completed_chat = client.invoke(messages)
summary = completed_chat.content

In [4]:
# grid forecast analysis
# Prompt fragments for the grid-forecast section. Each `ins*` string is an
# instruction appended to the question with the matching number; every
# instruction ends with a newline, which also terminates its question.
request = (
    "Summarize the grid forecast section. Include an explanation of the method and different assumptions of the priors"
)

insone = (
    "<!-- INSTRUCTION_START Consider whether the observed data is likely to be normally distributed (use the difference between the median and mean in summary statistics section). "
    "Do you have examples of other domains where grid approximations or Bayesian techniques are used ? specifically in the domains of ecology, environmental sciences and biodiversity."
    " INSTRUCTION_END -->\n"
)

insthree = (
    "<!-- INSTRUCTION_START Cite the name of the prior and the expected average and median pcs/m. INSTRUCTION_END -->\n"
)

# In-boundary prior: goes with question 4.
insfour = (
    "<!-- INSTRUCTION_START Consider that the prior is comprised only of locations that are in the same geographic boundary. "
    "Recall that the posterior is a weighted average of the prior and likelihood, so if the in boundary prior predicts an increase "
    "it is likely that elevated values were observed in other locations of the region "
    " INSTRUCTION_END -->\n"
)

# Out-boundary prior: goes with question 5. This was previously also named
# `insfour`, which overwrote the in-boundary instruction above and left
# question 5 with no instruction.
insfive = (
    "<!-- INSTRUCTION_START Consider that the prior is comprised only of locations that are outside the geographic boundary. "
    "Recall that the posterior is a weighted average of the prior and likelihood, so if the out boundary prior predicts an increase "
    "it is likely that locations outside of the region had elevated values "
    " INSTRUCTION_END -->\n"
)

inssix = (
    "<!-- INSTRUCTION_START Consider the average pcs/m result of each prior in relation to the observed average in pcs/m. "
    "Cite the numerical differences, given the standard deviation how likely is a person to notice the increase or decrease? "
    " INSTRUCTION_END -->\n"
)

# NOTE(review): question 5 still asks about samples "from within the boundary",
# as in the original text — confirm whether "outside the boundary" was intended.
questions = (
    f"1. Why is grid approximation a reasonable modeling technique given the data ?{insone}"
    # Question 2 has no instruction, so it needs its own newline.
    "2. What is the difference between grid approximation and linear or ensemble regression ?\n"
    f"3. Under what prior do we expect to find the most ? The least ?{insthree}"
    f"4. If the in-boundary grid approximation predicts an increase or decrease, what does that say about the other samples from within the boundary ?{insfour}"
    f"5. If the out-boundary grid approximation predicts an increase or decrease, what does that say about the other samples from within the boundary ?{insfive}"
    f"6. How different are the expected results from the observed results ?{inssix}"
)

formatting_instructions = (
    "1. Label the summary 'Forecasts and methods' (##)\n"
    "2. The label for the questions section is 'Frequently asked questions' (###) \n"
    "3. Repeat the question (in bold) DO NOT REPEAT the instructions. and then answer\n"
)

def make_grid_approximation_prompt(language):
    """Assemble the user prompt for the grid-forecast section.

    Reads the module-level `request`, `questions` and `formatting_instructions`
    at call time and separates the parts with blank lines.

    Parameters
    ----------
    language : value formatted into the 'Reply in ...' line.

    Returns
    -------
    str
        request, questions, reply-language line and formatting instructions,
        joined by two newlines.
    """
    sections = [
        f'{request}',
        f'{questions}',
        f'Reply in {language}',
        f'{formatting_instructions}',
    ]
    return '\n\n'.join(sections)

In [5]:
# Same system prompt as before, paired with the grid-forecast request;
# the text of the reply is kept in `grid_forecast`.
messagesx = messages_for_chat_completion(
    system_prompt=make_system_prompt(mdf),
    user_prompt=make_grid_approximation_prompt('English'),
)
completed_chatx = client.invoke(messagesx)
grid_forecast = completed_chatx.content

In [6]:

# Prompt fragments for the linear / ensemble methods section.
# This cell rebinds `request`, `insone`, `insfour`, `questions` and
# `formatting_instructions`, which the grid-forecast cell also assigns.
# (Commented-out copies of the grid-forecast instructions were dropped.)
request = "Define cluster analysis (kmeans), linear regression and ensemble methods, explain the basic assumptions of each method"

# Instruction attached directly to the request by make_regression_prompt.
req_inst = "<!-- INSTRUCTION_START do not quote the regression results here. Expalin the methods compeletely INSTRUCTION_END -->\n"

# Instruction for question 1.
insone = "".join([
    "<!-- INSTRUCTION_START The document provides a table of results. Provide that table to user and write a narrative paragraph of all the results",
    " INSTRUCTION_END -->\n",
])

# Instruction for question 4.
insfour = "".join([
    "<!-- INSTRUCTION_START The average pcs/m is given as a table in the cluster analysis subsection and given as objects per meter. ",
    "The distribution of land use values is given in the cluster analysis subsection and given as a float value that represents ",
    "the average proportion of the buffer zone occupied by the land use category. The paragraph above the table explains how to interpret the table",
    " when you provide the results for the cluster lable the results as % of buffer occupied by land use feature INSTRUCTION_END -->\n",
])

# Questions 1 and 4 are terminated by their instruction's newline;
# questions 2 and 3 carry their own.
questions = "".join([
    "1. What were the r² and MSE of each test ? ",
    insone,
    "2. Given the r² and MSE of the different methods employed, how reliable do you think predictions would be based on these models ?\n",
    "3. Can any conlusions be drawn from these results ?\n",
    "4. Accroding to the cluster analysis what is the cluster that has the greatest average pcs/m ? What is the distribution of land use values within the cluster ? ",
    insfour,
])

formatting_instructions = "".join([
    "1. Label the summary 'Linear and ensemble methods' (##)\n",
    "2. The label for the questions section is 'Frequently asked questions' (###) \n",
    "3. Repeat the question (in bold) DO NOT REPEAT the instructions. and then answer\n",
])

def make_regression_prompt(language):
    """Assemble the user prompt for the linear / ensemble methods section.

    Reads the module-level `request`, `req_inst`, `questions` and
    `formatting_instructions` at call time. The request and its instruction
    are concatenated directly; the remaining parts are separated by blank lines.

    Parameters
    ----------
    language : value formatted into the 'Reply in ...' line.

    Returns
    -------
    str
        The assembled prompt text.
    """
    sections = [
        f'{request}{req_inst}',
        f'{questions}',
        f'Reply in {language}',
        f'{formatting_instructions}',
    ]
    return '\n\n'.join(sections)

In [7]:
# Same system prompt, paired with the regression / cluster request;
# the text of the reply is kept in `linear_methods`.
messagesxx = messages_for_chat_completion(
    system_prompt=make_system_prompt(mdf),
    user_prompt=make_regression_prompt('English'),
)
completed_chatxx = client.invoke(messagesxx)
linear_methods = completed_chatxx.content

In [14]:
# Prompt fragments for the sampling-stratification section.
# This cell rebinds `request`, `questions` and `formatting_instructions`,
# which earlier cells also assign. Commented-out leftovers copied from the
# earlier prompt cells were removed.
request = (
    "Define sampling stratification and land-use"
)

# Each question ends with its own newline; the first one previously had none
# and ran straight into "2. ...".
questions = (
    "1. How do I interpret the results in the sampling stratification table ?\n"
    "2. How do I interpret the results in the sampling stratification and trash density table ?\n"
)

formatting_instructions = (
    "1. Label the summary 'Sampling stratification' (##)\n"
    "2. The label for the questions section is 'Frequently asked questions' (###) \n"
    "3. Repeat the question (in bold) DO NOT REPEAT the instructions. and then answer\n"
)

def make_stratification_prompt(language):
    """Assemble the user prompt for the sampling-stratification section.

    Reads the module-level `request`, `questions` and `formatting_instructions`
    at call time and separates the parts with blank lines.

    Parameters
    ----------
    language : value formatted into the 'Reply in ...' line.

    Returns
    -------
    str
        request, questions, reply-language line and formatting instructions,
        joined by two newlines.
    """
    user_prompt = (
        f'{request}\n\n'
        f'{questions}\n\n'
        # The reply-language line was commented out, so `language` was accepted
        # but ignored; it is restored to match the other section prompt builders.
        f'Reply in {language}\n\n'
        f'{formatting_instructions}'
    )

    return user_prompt

In [15]:
# Same system prompt, paired with the sampling-stratification request;
# the text of the reply is kept in `sampling_stratification`.
messagesxxx = messages_for_chat_completion(
    system_prompt=make_system_prompt(mdf),
    user_prompt=make_stratification_prompt('English'),
)
completed_chatxxx = client.invoke(messagesxxx)
sampling_stratification = completed_chatxxx.content

In [16]:
# Wrap the generated sampling-stratification text in a Markdown display object;
# as the cell's last expression it becomes the cell output.
Markdown(sampling_stratification)

## Sampling Stratification

Sampling stratification is a method of dividing the total sample space into non-overlapping subgroups, or strata. This is done to ensure that the sample drawn is representative of the entire population. In the context of this report, sampling stratification refers to the distribution of various land-use features in a defined buffer zone around a survey location. Land-use features might include categories such as buildings, forests, wetlands, streets, public services, recreation, vineyards, orchards, and undefined areas.

### Frequently Asked Questions

**How do I interpret the results in the sampling stratification table?**

The sampling stratification table provides information on the proportion of samples collected from different land-use features within the buffer zone around a survey location. Each row in the table represents a range of proportions (0-20%, 20-40%, etc.) of the buffer zone dedicated to a specific land-use feature. The values in each row indicate the percentage of samples collected from areas with that particular proportion of the feature. For example, a value of 6.10% in the 0-20% row under 'buildings' means that 6.10% of the samples were collected from areas where buildings occupied 0-20% of the buffer zone.

**How do I interpret the results in the sampling stratification and trash density table?**

The sampling stratification and trash density table shows how the observed litter density changes based on the land-use feature and the proportion of the buffer zone that the feature occupies. Each row represents a range of proportions of the buffer zone dedicated to a specific land-use feature. The values in each row indicate the average litter density observed in areas with that particular proportion of the feature. For example, a value of 0.678 in the 0-20% row under 'buildings' means that the average litter density was 0.678 in areas where buildings occupied 0-20% of the buffer zone.

In [None]:
# Stitch the generated sections together, one per line, and display the result.
# NOTE(review): `sampling_stratification` is generated above but is not part of
# this combined summary — confirm whether it should be appended.
a_summary = '{}\n{}\n{}\n'.format(summary, grid_forecast, linear_methods)

Markdown(a_summary)