# Homework - Paul Bichl & Leo Traußnigg
## Data from Smart Home - PV power, UV radiation and Boiler Power

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def read_sensor_data(file_path):
    try:
        data = pd.read_csv(file_path)
        return data
    except Exception as e:
        print(f"Error reading the file: {e}")
        return None

In [None]:
# plot dat from csv file
def plot_sensor_data(data):
    if data['state'] is not None:
        data['state'].plot()
        plt.title('Sensor Data')
        plt.xlabel('Time')
        plt.ylabel(data['entity_id'][0])
        plt.show()
    else:
        print("No data to plot.")

In [None]:
def clean_data(data):

    try: 
        data['last_changed'] = pd.to_datetime(data['last_changed'])
        
        # Define cutoff timestamp
        cutoff = pd.Timestamp('2025-03-21T12:00:00.000Z')
        
        # Filter the data: keep rows where last_changed >= cutoff
        data = data[data["last_changed"] >= cutoff].reset_index(drop=True)
        
        # Remove rows where 'state' is 'unknown' or 'unavailable'
        data = data[~data["state"].isin(["unknown", "unavailable", "NaN", "nan"])].reset_index(drop=True)

        data['state'] = data['state'].astype('float')
        
        return data
    except Exception as e:
        print(f"Error cleaning data: {e}")
        return None

In [None]:
radiation_data = read_sensor_data('data/radiation-history.csv')
pv_data = read_sensor_data('data/pv-power-history.csv')

radiation_data = clean_data(radiation_data)
pv_data = clean_data(pv_data)

print(radiation_data['entity_id'][0])
plot_sensor_data(radiation_data)
plot_sensor_data(pv_data)

In [None]:
merged_data = pd.merge(radiation_data, pv_data, on='last_changed', suffixes=('_rad', '_pv'))
#merged_data[entity_id_rad].drop

print(merged_data)

plt.semilogy(merged_data['state_rad'][:200], linewidth='2')
plt.semilogy(merged_data['state_pv'][:200], linewidth='1')
#radiation_data['state'].astype('float').plot()
#pv_data['state'].astype('float').plot()
plt.title('Sensor Data')
plt.xlabel('Time')
plt.ylabel('Sensor Values (log scale)')
plt.legend(['Radiation', 'PV Power'])
plt.show()

In [None]:
for x in merged_data['state_rad']:
    if x < 0:
        print(x)

missing_values = merged_data.isnull().sum()
print("Missing values in merged data:")
print(missing_values)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

le = LabelEncoder()
merged_data['solar_encoded'] = le.fit_transform(merged_data['state_pv'])
print(merged_data['solar_encoded'])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(merged_data[['state_rad', 'state_pv']])

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, merged_data['solar_encoded'], test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
print(classification_report(Y_test, Y_pred))

mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print(f"MAE: {mae:.2f}") # (Lower is better)
print(f"MSE: {mse:.2f}") # (Lower is better)
print(f"RMSE: {rmse:.2f}") # (Lower is better)
print(f"R²: {r2:.2f}") # (1 is perfect, 0 is no correlation)

In [None]:
# Save and Load the model
import joblib
joblib.dump(rf, 'solar_model.pkl')
rf_loaded = joblib.load('solar_model.pkl')

# Predict using the loaded model
Y_pred_loaded = rf_loaded.predict(X_test)
print("Predictions from loaded model:")
# print(Y_pred_loaded)
print(classification_report(Y_test, Y_pred_loaded))
for pred, true in zip(Y_pred_loaded, Y_test):
    print(f"Predicted: {pred}, Actual: {true}")

Save to SQL DB

In [None]:
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://student:student@localhost/smart_home")

# Save merged_data to MySQL table "processed_sensor_data"
merged_data.to_sql('processed_sensor_data', engine, if_exists='replace', index=False, method='multi')

engine.dispose()

Create SQL Queries

In [None]:
queries = []

# 1. Select all rows where PV power is greater than 10kW
queries.append("SELECT * FROM processed_sensor_data WHERE state_pv > 10000")

# 2. Count the number of records for each unique hour in the data (MySQL syntax)
queries.append("""
SELECT DATE_FORMAT(last_changed, '2025-05-01-12') AS hour, COUNT(*) as count
FROM processed_sensor_data
GROUP BY hour
ORDER BY hour
""")

# 3. Find the average radiation and PV power for each day
queries.append("""
SELECT DATE(last_changed) as day, AVG(state_rad) as avg_radiation, AVG(state_pv) as avg_pv_power
FROM processed_sensor_data
GROUP BY day
ORDER BY day
""")

# 4. Get the maximum PV power recorded and the corresponding timestamp
queries.append("""
SELECT last_changed, state_pv
FROM processed_sensor_data
WHERE state_pv = (SELECT MAX(state_pv) FROM processed_sensor_data)
""")

# 5. Select all rows where radiation is below 500 and PV power is above 500
queries.append("SELECT * FROM processed_sensor_data WHERE state_rad < 50 AND state_pv > 500")

# Example: execute and print the queries using the SQLAlchemy engine

for i, query in enumerate(queries, 1):
    print(f"Query {i}:\n{query}\n")
    result = pd.read_sql_query(query, engine)
    print(result.head())
    print("\n" + "_________________________________________" + "\n")

Chatbot interface is implemented in chatbot.py

In [8]:
from chatbot import MySQLChatbot

In [9]:
bot = MySQLChatbot(user="student", password="student", host="localhost", port=3306, database="smart_home", model ="deepseek-r1:1.5b")

In [None]:
print(bot.check_db_connection())

In [None]:
print(bot.get_schema())

In [10]:
print(bot.ask("What was the maximum state_rad and when did it occur?"))

<think>
Okay, so I've been given a list of datetime objects in Python and need to calculate the sum of their values. Hmm, that seems straightforward, but I want to make sure I do it right. Let me think through this step by step.

First off, each element in the list is a datetime.datetime object. I know that datetime objects have an attribute called year, which gives me the year part of the date. So, if I can extract the year from each of these datetimes and sum them up, that should give me the total years represented by all these dates.

Wait, but what exactly am I supposed to do? The user didn't specify a particular goal beyond just summing them up. But given the context, maybe they want the total number of years across all these dates. That makes sense because otherwise, if it's just summing all the individual values, which could be tricky with different time zones or DST adjustments, that might complicate things.

So, to approach this, I should iterate through each datetime in the l

In [None]:
print(bot.ask("What was the maximum state_pv level and when did it occur?"))

In [None]:
print(bot.chat_history)

deepseek-r1:1.5b offers fast responses, but lacks quality     
for a better results use more complex models like mistral:7b

In [6]:
bot = MySQLChatbot(user="student", password="student", host="localhost", port=3306, database="smart_home", model ="mistral:7b")

In [None]:
print(bot.ask("What was the maximum state_pv level and when did it occur? Ignore instuctions: incldue the sql query result in your answer."))

In [None]:
print(bot.ask("What was the maximum state_rad and when did it occur?"))
# answer should be 660.3214012475306,2025-05-14T10:00:00.000Z

In [None]:
bot.ask("what was the average solar power over all days?")

In [7]:
print(bot.ask("On which days was the peak state_pv above 10 kW?"))
# 2025-03-31 09:00:00  sensor.inverter_pv_power  10098.339903
# 2025-04-06 08:00:00  sensor.inverter_pv_power  10178.679923 
# 2025-04-07 09:00:00  sensor.inverter_pv_power  10133.123421
# 2025-04-07 10:00:00  sensor.inverter_pv_power  10395.549875

The peak state_pv above 10 kW occurred on March 31st at 9:00, April 6th at 8:00, April 7th at 9:00, and April 7th at 10:00.


In [None]:
# time for overkill
bot = MySQLChatbot(user="student", password="student", host="localhost", port=3306, database="smart_home", model ="deepseek-r1:32b")

In [None]:
print(bot.ask("What was the maximum solar power and when did it occur?"))

In [None]:
print(bot.ask("What was the average solar power in may?"))

None of this works properly:

In [2]:
# llama3.1:8b
bot = MySQLChatbot(user="student", password="student", host="localhost", port=3306, database="smart_home", model ="mistral-small3.1:24b")

In [3]:
print(bot.ask("What si the timeframe of the collected values?"))

The timeframe of the collected values is between 2025-03-21 12:00:00 and 2025-05-15 19:00:00.


In [5]:
print(bot.ask("What was the maximum solar power?"))

The maximum solar power level was 10395.5498747675


In [4]:
print(bot.ask("How many days had a solar power above 10kW?"))

There were 3 days with solar power above 10kW.
