In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/radon-data.csv')

In [3]:
df

Unnamed: 0,id,time,radon,temperature,humidity,pressure,tvoc,sensor_id,state,state_time
0,21906,1569405062,202,25,50,1015,0,2,Off,1569404979
1,21907,1569405663,258,25,51,1015,0,2,On,1569405215
2,21908,1569406264,202,24,51,1015,0,2,Off,1569405671
3,21909,1569406865,182,24,51,1015,0,2,Off,1569406848
4,21910,1569407466,189,24,51,1015,0,2,Off,1569406866
...,...,...,...,...,...,...,...,...,...,...
87965,138640,1622736862,1344,22,43,1018,23,2,Off,1622736447
87966,138642,1622737462,1293,22,43,1018,34,2,Off,1622736963
87967,138644,1622738063,1223,22,43,1018,34,2,Off,1622736963
87968,138646,1622738663,1171,22,43,1018,34,2,Off,1622738463


# Linear Regression: Data Cleaning 

In [4]:
df = df.drop(columns=["time", "state_time", "id", "sensor_id"])
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
0,202,25,50,1015,0,Off
1,258,25,51,1015,0,On
2,202,24,51,1015,0,Off
3,182,24,51,1015,0,Off
4,189,24,51,1015,0,Off
...,...,...,...,...,...,...
87965,1344,22,43,1018,23,Off
87966,1293,22,43,1018,34,Off
87967,1223,22,43,1018,34,Off
87968,1171,22,43,1018,34,Off


In [5]:
df['state'] = df['state'].replace({'Off': 0, 'On': 1})

In [6]:
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
0,202,25,50,1015,0,0
1,258,25,51,1015,0,1
2,202,24,51,1015,0,0
3,182,24,51,1015,0,0
4,189,24,51,1015,0,0
...,...,...,...,...,...,...
87965,1344,22,43,1018,23,0
87966,1293,22,43,1018,34,0
87967,1223,22,43,1018,34,0
87968,1171,22,43,1018,34,0


In [7]:
df_summer = df.iloc[36000:48000]
df_summer.head()

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
36000,388,24,57,1010,4,1
36001,383,24,57,1010,2,1
36002,398,24,57,1010,6,1
36003,388,24,57,1010,8,1
36004,388,24,57,1010,10,1


In [8]:
summer_min = df_summer.min() # needed to de-normalize data
summer_max = df_summer.max()

df_summer_normalized = (df_summer - summer_min) / (summer_max - summer_min)
df_summer_normalized.head()

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
36000,0.11964,0.714286,0.666667,0.5,0.003463,1.0
36001,0.117911,0.714286,0.666667,0.5,0.001732,1.0
36002,0.123098,0.714286,0.666667,0.5,0.005195,1.0
36003,0.11964,0.714286,0.666667,0.5,0.006926,1.0
36004,0.11964,0.714286,0.666667,0.5,0.008658,1.0


In [9]:
df = df_summer_normalized

In [10]:
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
36000,0.119640,0.714286,0.666667,0.500000,0.003463,1.0
36001,0.117911,0.714286,0.666667,0.500000,0.001732,1.0
36002,0.123098,0.714286,0.666667,0.500000,0.005195,1.0
36003,0.119640,0.714286,0.666667,0.500000,0.006926,1.0
36004,0.119640,0.714286,0.666667,0.500000,0.008658,1.0
...,...,...,...,...,...,...
47995,0.044952,1.000000,0.481481,0.541667,0.054545,0.0
47996,0.043568,1.000000,0.481481,0.541667,0.056277,0.0
47997,0.055325,1.000000,0.481481,0.541667,0.053680,0.0
47998,0.055325,1.000000,0.481481,0.541667,0.046753,1.0


# Linear Regression: Model Train/Test

In [16]:
from itertools import chain, combinations
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [17]:
independent_vars = ["temperature", "humidity", "pressure", "tvoc", "state"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store R-squared scores
r2_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train a linear regression model
    model = LinearRegression()
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)

    # Store the R-squared score in the dictionary
    r2_scores[combo] = r2

# Find the combination with the highest R-squared score
best_combo = max(r2_scores, key=r2_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest R-squared score:", r2_scores[best_combo])

Best combination of independent variables: ('pressure', 'tvoc', 'state')
Highest R-squared score: 0.17172401496142042


In [13]:
r2_scores

{('temperature',): -0.002269432245948666,
 ('humidity',): -0.0029154305652669965,
 ('pressure',): 0.007308262011364808,
 ('tvoc',): 0.011762459451419183,
 ('state',): 0.1613618682246124,
 ('temperature', 'humidity'): -0.0026452620689232376,
 ('temperature', 'pressure'): 0.007173090687701222,
 ('temperature', 'tvoc'): 0.012580998385266606,
 ('temperature', 'state'): 0.16151315701031,
 ('humidity', 'pressure'): 0.007295791294366749,
 ('humidity', 'tvoc'): 0.011826980645785024,
 ('humidity', 'state'): 0.16089070662159166,
 ('pressure', 'tvoc'): 0.02968620046207482,
 ('pressure', 'state'): 0.16775418842762935,
 ('tvoc', 'state'): 0.16229821355941,
 ('temperature', 'humidity', 'pressure'): 0.007142904515985649,
 ('temperature', 'humidity', 'tvoc'): 0.012485597642468593,
 ('temperature', 'humidity', 'state'): 0.1609516549825094,
 ('temperature', 'pressure', 'tvoc'): 0.02981035115877162,
 ('temperature', 'pressure', 'state'): 0.16751704428511127,
 ('temperature', 'tvoc', 'state'): 0.162527010

# Logistic Regression: Data Cleaning

In [14]:
df_logistic_reg = pd.read_csv('../data/radon-data.csv')
df_logistic_reg = df_logistic_reg.drop(columns=["time", "state_time", "id", "sensor_id"])
df_logistic_reg['state'] = df_logistic_reg['state'].replace({'Off': 0, 'On': 1})
df_logistic_reg.head()


FileNotFoundError: [Errno 2] No such file or directory: '../data/radon-data.csv'

In [None]:
df_logistic_reg['radon_binary'] = df_logistic_reg['radon'].apply(lambda x: 1 if x > 300 else 0)
df_logistic_reg = df_logistic_reg.drop(columns=["radon"])
df_logistic_reg

Unnamed: 0,temperature,humidity,pressure,tvoc,state,radon_binary
0,25,50,1015,0,0,0
1,25,51,1015,0,1,0
2,24,51,1015,0,0,0
3,24,51,1015,0,0,0
4,24,51,1015,0,0,0
...,...,...,...,...,...,...
87965,22,43,1018,23,0,1
87966,22,43,1018,34,0,1
87967,22,43,1018,34,0,1
87968,22,43,1018,34,0,1


# Logistic Regression: Model Train/Test

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Independent variables
independent_vars = ["temperature", "humidity", "pressure", "tvoc", "state"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon_binary"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df_logistic_reg[independent_vars], df_logistic_reg[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store accuracy scores
accuracy_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train a logistic regression model
    model = LogisticRegression(max_iter=1000) # Increase max_iter if the algorithm does not converge
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # Store the accuracy score in the dictionary
    accuracy_scores[combo] = accuracy

# Find the combination with the highest accuracy score
best_combo = max(accuracy_scores, key=accuracy_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest accuracy score:", accuracy_scores[best_combo])

Best combination of independent variables: ('temperature', 'state')
Highest accuracy score: 0.7175741730135273
