In [2]:
import pandas as pd
import tensorflow as tf                            # LSTM
import numpy as np                                 # Vectorial computation
import itertools                                   # Combinatory
import datetime
import pgeocode
import seaborn as sns

## Data reading

In [1]:
df = pd.read_csv('data/radon-data.csv')   # CSV reading
df.head()

NameError: name 'pd' is not defined

### Data cleaning

In [4]:
# Now we will parse the data, so it is usable
df.time = pd.to_datetime(df['time'], unit='s', origin='unix')      # date parse
df = df.drop(columns = ["time", "id", "sensor_id", "state_time"])  # drop useless columns
df.state = (df.state == "On").astype(int)                          # binarize state
df.head()

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
0,202,25,50,1015,0,0
1,258,25,51,1015,0,1
2,202,24,51,1015,0,0
3,182,24,51,1015,0,0
4,189,24,51,1015,0,0


In [34]:
# we now select the data from the summer (longest clean data)
df_summer = df.iloc[36000:48000]
df_summer.shape

(12000, 6)

### Data normalization

In [43]:
summer_min = df_summer.min() # needed to de-normalize data
summer_max = df_summer.max()

df_summer_normalized = (df_summer - summer_min) / (summer_max - summer_min)
df_summer_normalized.shape

(12000, 6)

## Linear Regression Model

In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from itertools import chain, combinations
reg = LinearRegression()

In [42]:
independent_vars = ["temperature", "humidity", "pressure", "tvoc", "state"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store R-squared scores
r2_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train a linear regression model
    model = LinearRegression()
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)

    # Store the R-squared score in the dictionary
    r2_scores[combo] = r2

# Find the combination with the highest R-squared score
best_combo = max(r2_scores, key=r2_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest R-squared score:", r2_scores[best_combo])

Best combination of independent variables: ('temperature', 'humidity', 'pressure', 'state')
Highest R-squared score: 0.021471490002529126


In [23]:
covariate_list = ["state", "humidity", "pressure", "tvoc", "temperature"]

results_df = pd.DataFrame(columns=["Covariates", "R-squared", "Predictions", "Accuracy"])

# loop over different number of covariates
for number_of_covariates in range(1, 6):
    # loop over different combinations of covariates
    for covariate_combination in itertools.combinations(covariate_list, number_of_covariates):
        # select the covariate combination and the target variable
        labels = df_summer_normalized['radon']
        train1 = df_summer_normalized[list(covariate_combination)]

        # split the data into training and testing sets
        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(train1, labels, test_size = 0.20, random_state = 2)

        # fit the linear regression model to the training data
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
        reg.fit(x_train, y_train)

        # evaluate the performance of the model on the testing data
        
        predictions = reg.predict(x_test)
        accuracy = reg.score(x_test, y_test)

        # store the covariate combination, the R-squared score, and the predicted values in the dataframe
        results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy, 
                                        "Predictions": predictions, "Accuracy": accuracy}, ignore_index=True)

# sort the results dataframe by R-squared score in descending order
results_df.sort_values("R-squared", ascending=False, inplace=True)

# select the row with the highest R-squared score
best_model = results_df.iloc[0]

# print the best model information
print("Best model covariates:", best_model["Covariates"])
print("Best model R-squared score:", best_model["R-squared"])
print("Best model predictions:", best_model["Predictions"])
print("Best model accuracy:", best_model["Accuracy"])



  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_co

Best model covariates: ('state', 'pressure', 'tvoc')
Best model R-squared score: 0.11719194665200394
Best model predictions: [0.06671249 0.06389529 0.06722831 ... 0.05444427 0.09683787 0.06137196]
Best model accuracy: 0.11719194665200394


  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,
  results_df = results_df.append({"Covariates": covariate_combination, "R-squared": accuracy,


In [19]:
results_df

Unnamed: 0,Covariates,R-squared,Predictions,Accuracy
0,"(state,)",0.08974,"[0.06444449250417335, 0.06444449250417335, 0.0...",0.08974
1,"(humidity,)",0.002068,"[0.08394026178824344, 0.0835251293702265, 0.08...",0.002068
2,"(pressure,)",0.019363,"[0.0839194038465462, 0.0839194038465462, 0.083...",0.019363
3,"(tvoc,)",0.007322,"[0.08489820186990166, 0.07967698163434679, 0.0...",0.007322
4,"(temperature,)",3.2e-05,"[0.08330054144114876, 0.08274105687784222, 0.0...",3.2e-05
5,"(state, humidity)",0.090604,"[0.06474635717971501, 0.0644308847802571, 0.06...",0.090604
6,"(state, pressure)",0.108232,"[0.06532544871844312, 0.06532544871844312, 0.0...",0.108232
7,"(state, tvoc)",0.089999,"[0.06488961050407038, 0.06387642608691062, 0.0...",0.089999
8,"(state, temperature)",0.089755,"[0.06430498805449492, 0.06401761198710848, 0.0...",0.089755
9,"(humidity, pressure)",0.019438,"[0.08393697521951732, 0.08391545590716801, 0.0...",0.019438


In [25]:
results_sorted = results.sort_values(by='RMSE')
print(results_sorted)

                                        Covariates  R-squared      RMSE  \
28            (state, pressure, tvoc, temperature)   0.110100  0.058131   
31  (state, humidity, pressure, tvoc, temperature)   0.110094  0.058131   
30  (state, humidity, pressure, tvoc, temperature)   0.110094  0.058131   
18                         (state, pressure, tvoc)   0.110080  0.058131   
25               (state, humidity, pressure, tvoc)   0.110071  0.058132   
19                  (state, pressure, temperature)   0.108300  0.058189   
26        (state, humidity, pressure, temperature)   0.108292  0.058190   
6                                (state, pressure)   0.108232  0.058192   
15                     (state, humidity, pressure)   0.108217  0.058192   
16                         (state, humidity, tvoc)   0.090922  0.058754   
27            (state, humidity, tvoc, temperature)   0.090918  0.058754   
5                                (state, humidity)   0.090604  0.058764   
17                  (stat

In [7]:
# We will use early stopping to select the best model
callback = tf.keras.callbacks.EarlyStopping(
    monitor              = 'val_loss',
    patience             = 30,
    restore_best_weights = True
)

# percentage of train samples
train_pct = 0.9

# We will store all the results in a data frame
results = pd.DataFrame(
    {i: [None]*15 for i in [1, 5, 10, 15, 25]}
)

cnt = 0 # helps counting

for number_of_covariates in range(4):
    for covariate_combination in itertools.combinations(("state", "humidity", "pressure", "tvoc"), 
                                                        number_of_covariates):
        print(f"EXECUTING OVER: {covariate_combination}".center(100, "*"))
        # We compute for each window size
        for window_size in [1, 5, 10, 15, 25]:
            print("\n")
            print(f"EXECUTING {window_size} WINDOW SIZE".center(100), "-")
            print("\n")
            windows = {i: generate_windows(df_summer_normalized[i], window_size, 1)
                        for i in ["radon"] + list(covariate_combination) }
            
            print("Windows computed, now building the model")
            
            data = mix_data(windows)
            N_total          = data.shape[0]
            test             = data[int(train_pct * N_total):, :, :]
            data             = data[:int(train_pct * N_total), :, :]
            
            model = tf.keras.models.Sequential(
                [
                    tf.keras.layers.InputLayer((window_size, len(windows.keys()))),
                    tf.keras.layers.LSTM(window_size * 2),
                    tf.keras.layers.Dense(16, activation = "relu"),
                    tf.keras.layers.Dense(16, activation = "relu"),
                    tf.keras.layers.Dense(1)
                ],
            )
            
            model.compile(
                loss = tf.keras.losses.MeanSquaredError(),
                metrics = tf.keras.metrics.RootMeanSquaredError(),
                optimizer = tf.keras.optimizers.RMSprop()
            )
            
            model.summary()
            
            model.fit(
                x                = data[:, :window_size, :],
                y                = data[:, window_size:, 0],
                epochs           = 1_000,
                shuffle          = False,
                validation_split = 0.2, 
                callbacks        = [callback,],
                verbose          = 0,
            )
            
            # Model trained
            
            # Now compute the test predictions and add them to the data frame
            rescale    = lambda x: x * (summer_max.radon - summer_min.radon) + summer_min.radon
            prediction = rescale( model.predict(test[:, :window_size, :]) )
            real       = rescale( test[:, window_size:, 0] )
            
            rmse_test = ( sum((prediction - real) ** 2) / len(prediction) ) ** (1/2)
            results[window_size][cnt] = rmse_test 
            print(f"Window {window_size}, covariates {covariate_combination}: {results[window_size][cnt]}")
        cnt += 1    
        print("*" * 100, "\n"*2)

*****************************************EXECUTING OVER: ()*****************************************


                                      EXECUTING 1 WINDOW SIZE                                        -


Windows computed, now building the model
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 2)                 32        
                                                                 
 dense (Dense)               (None, 16)                48        
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 369
Trainable params: 369
Non-trainable params: 0
_______

In [None]:
model.save("../output/radon.h5", save_format = "h5")

In [None]:
model.save("../output/radon.pb", save_format = "tf")

In [None]:
summer_min, summer_max

In [None]:
pd.DataFrame(
    {
        "predictions": prediction.flatten(),
        "real": real.flatten(),
    }
).to_csv("../data/prediction1s.csv", index = False)