In [33]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pandas as pd
import datetime

# Firebase project's credentials file
cred = credentials.Certificate("service.json")
if not firebase_admin._apps:
    firebase_admin.initialize_app(cred)
else:
    firebase_admin.get_app()

db = firestore.client()


*Iceberg query*

In [34]:
'''the results to only include documents from the past 5 years, 
    orders the results by MaxTemp in descending order, 
    and returns only the top 10 results.'''

# query = db.collection('weather_data') \
#            .where('Date', '>=', '2016-04-02') \
#            .order_by('MaxTemp', direction=firestore.Query.DESCENDING) \
#            .order_by('Date') \
#            .limit(10)


# results = query.stream()

# for doc in results:
#     print(f'{doc.id} => {doc.to_dict()}')

start_date = datetime.datetime(2012, 1, 1)
end_date = datetime.datetime(2023, 4, 2)

query = db.collection('weather_data') \
           .order_by('MaxTemp', direction=firestore.Query.DESCENDING) \
           .limit(5)

results = query.stream()

for doc in results:
    doc_date = doc.to_dict()['Date']
    date_obj = datetime.datetime.strptime(doc_date, '%Y-%m-%d')
    if start_date <= date_obj <= end_date:
        print(f'{doc.id} : {doc.to_dict()}')


12611 : {'AvgPressure': 1001.0, 'WindSpeed9am': 13.0, 'RainToday': 'No', 'Temp3pm': 46.7, 'Pressure3pm': 998.1, 'Humidity9am': 30.0, 'WindGustDir': 'SSW', 'WindDir9am': 'NNE', 'Temp9am': 32.9, 'MinTemp': 27.8, 'Rainfall': 0.0, 'AvgTemp': 37.55, 'MaxTemp': 47.3, 'Location': 'Moree', 'AvgRainfall': 0.0, 'RainTomorrow': 'No', 'WindSpeed3pm': 28.0, 'Date': '2017-02-12', 'WindDir3pm': 'NW', 'WindGustSpeed': 67.0, 'Pressure9am': 1003.9, 'AvgHumidity': 20.0, 'Humidity3pm': 10.0, 'AvgWind': 36.0}
11732 : {'AvgPressure': 998.9, 'WindSpeed9am': 33.0, 'RainToday': 'No', 'Temp3pm': 45.8, 'Pressure3pm': 995.9, 'Humidity9am': 36.0, 'WindGustDir': 'WSW', 'WindDir9am': 'N', 'Temp9am': 33.4, 'MinTemp': 28.3, 'Rainfall': 0.0, 'AvgTemp': 37.8, 'MaxTemp': 47.3, 'Location': 'Moree', 'AvgRainfall': 0.0, 'RainTomorrow': 'No', 'WindSpeed3pm': 33.0, 'Date': '2014-01-03', 'WindDir3pm': 'WNW', 'WindGustSpeed': 61.0, 'Pressure9am': 1001.9, 'AvgHumidity': 20.5, 'Humidity3pm': 5.0, 'AvgWind': 42.333333333333336}
74

windowing query

In [38]:
'''calculates the rolling average maximum temperature 
    over a 7-day window for each location'''

# Query all documents in the "weather_data" collection
docs = db.collection_group("weather_data").get()

# Convert the documents to a pandas DataFrame
data = []
for doc in docs:
    doc_data = doc.to_dict()
    doc_data["id"] = doc.id
    data.append(doc_data)
df = pd.DataFrame(data)

# Convert the "Date" column to datetime format
df["Date"] = pd.to_datetime(df["Date"])

# Calculate the rolling average maximum temperature over a 7-day window for each location
rolling_avg = df.groupby("Location")["MaxTemp"].rolling(window=pd.Timedelta("7D").days, min_periods=1).mean()
df["RollingAvgMaxTemp"] = rolling_avg.reset_index(0, drop=True)


# Filter for the most recent 5 years of data since the dataset goes up to 2017, we will start in 2012
recent_data = df[df["Date"] > pd.Timestamp(year=2012, month=1, day=1)]

# Get the days with the highest rolling average maximum temperature for each location
max_temp_days = recent_data.groupby("Location")["RollingAvgMaxTemp"].idxmax()
result = recent_data.loc[max_temp_days, ["Location", "Date", "RollingAvgMaxTemp"]]

print(result)

RetryError: Deadline of 300.0s exceeded while calling target function, last exception: 429 Quota exceeded.

using window close

In [None]:
'''retrieve the previous and next year's maximum temperature for each location'''

# Define the collection name and field names
collection_name = "weather_data"
date_field = "Date"
location_field = "Location"
max_temp_field = "MaxTemp"

# Define the date range for the query (most recent 5 years)
start_date = "2012-01-01"
end_date = "2021-12-31"

# Query the Firestore collection for the relevant data
query = db.collection(collection_name) \
           .where(date_field, ">=", start_date) \
           .where(date_field, "<=", end_date) \
           .order_by(date_field, location_field)

# Initialize variables to store the previous and next year's maximum temperatures
prev_max_temps = {}
next_max_temps = {}

# Iterate over the query results and populate the prev_max_temps and next_max_temps dictionaries
for doc in query.stream():
    # Extract the document data
    doc_data = doc.to_dict()
    date = doc_data[date_field]
    location = doc_data[location_field]
    max_temp = doc_data[max_temp_field]

    # Check if there is a previous year's maximum temperature for this location
    prev_date = f"{int(date[:4])-1}{date[4:]}"
    prev_query = db.collection(collection_name) \
                   .where(date_field, "==", prev_date) \
                   .where(location_field, "==", location) \
                   .limit(1)
    prev_doc = next(iter(prev_query.stream()), None)
    if prev_doc:
        prev_max_temp = prev_doc.to_dict()[max_temp_field]
        prev_max_temps[f"{location}_{date}"] = prev_max_temp

    # Check if there is a next year's maximum temperature for this location
    next_date = f"{int(date[:4])+1}{date[4:]}"
    next_query = db.collection(collection_name) \
                   .where(date_field, "==", next_date) \
                   .where(location_field, "==", location) \
                   .limit(1)
    next_doc = next(iter(next_query.stream()), None)
    if next_doc:
        next_max_temp = next_doc.to_dict()[max_temp_field]
        next_max_temps[f"{location}_{date}"] = next_max_temp

# Filter the data to include only the most recent year for each location
most_recent_year = end_date[:4]
filtered_data = [doc.to_dict() for doc in query.stream()
                 if doc.to_dict()[date_field].startswith(most_recent_year)]

# Add the previous and next year's maximum temperatures to the filtered data
for doc in filtered_data:
    key = f"{doc[location_field]}_{doc[date_field]}"
    doc["PrevMaxTemp"] = prev_max_temps.get(key, None)
    doc["NextMaxTemp"] = next_max_temps.get(key, None)

# Print the filtered data with the previous and next year's maximum temperatures
for doc in filtered_data:
    print(doc)


drill down

In [None]:
'''retrieves all weather data documents from the "weather_data" collection 
    where the location is "Sydney" and the maximum temperature is greater than 
    or equal to 30'''

# Define the collection and query
collection_name = "weather_data"
query = db.collection(collection_name).where("Location", "==", "Sydney").where("MaxTemp", ">=", 30)

# Retrieve the documents that match the query
results = query.get()

# Print the document IDs and data
for doc in results:
    print(f"Document ID: {doc.id}")
    print(f"Data: {doc.to_dict()}")

roll up

In [None]:
'''retrieves the average maximum temperature 
    for each location in the "weather_data" collection'''
# Define the collection and query
collection_name = "weather_data"
query = db.collection(collection_name).select(["Location", "MaxTemp"]).order_by("Location")

# Retrieve the documents and compute the summary
results = query.stream()
summaries = {}
counts = {}
for doc in results:
    location = doc.get("Location")
    max_temp = doc.get("MaxTemp")
    if location not in summaries:
        summaries[location] = max_temp
        counts[location] = 1
    else:
        summaries[location] += max_temp
        counts[location] += 1

# Print the summary
print("Location\tAvg Max Temp")
for location, summary in summaries.items():
    avg_max_temp = summary / counts[location]
    print(f"{location}\t{avg_max_temp:.2f}")

slice

In [None]:
'''selects documents in the "weather_data" collection 
    where the temperature (which is the combination of min, max temp at 3am and temps at 9am) 
    is greater than 30 degrees'''
# Create a Firestore client
db = firestore.client()

# Define the collection and query
collection_name = "weather_data"
query = db.collection(collection_name).where("MinTemp", ">=", 30).where("MaxTemp", ">=", 30).where("Temp3pm", ">=", 30).where("Temp9am", ">=", 30)

# Retrieve the documents
results = query.stream()

# Print the documents
for doc in results:
    print(doc.to_dict())

dice

In [None]:
'''selects documents in the "weather_data" collection where the location is "Albury" 
    and the minimum temperature is less than or equal to 10 degrees'''
# Define the collection and query
collection_name = "weather_data"
query = db.collection(collection_name).where("Location", "==", "Albury").where("MinTemp", "<=", 10)

# Retrieve the documents
results = query.stream()

# Print the documents
for doc in results:
    print(doc.to_dict())

combination of olap

In [None]:
'''selects documents in the "weather_data" collection 
    where the location is either "Albury" or "Melbourne", 
    and the minimum temperature is less than or equal to 10 degrees'''
# Define the collection and query
collection_name = "weather_data"
query = db.collection(collection_name).where("Location", "in", ["Albury", "Melbourne"]).where("MinTemp", "<=", 10)

# Retrieve the documents
results = query.stream()

# Print the documents
for doc in results:
    print(doc.to_dict())