# Multiple Machine Learning Techniques for GitHub Dataset

In [47]:
import numpy as np
import pandas as pd

In [48]:
df = pd.read_csv('../data/radon-data.csv')

In [49]:
df

Unnamed: 0,id,time,radon,temperature,humidity,pressure,tvoc,sensor_id,state,state_time
0,21906,1569405062,202,25,50,1015,0,2,Off,1569404979
1,21907,1569405663,258,25,51,1015,0,2,On,1569405215
2,21908,1569406264,202,24,51,1015,0,2,Off,1569405671
3,21909,1569406865,182,24,51,1015,0,2,Off,1569406848
4,21910,1569407466,189,24,51,1015,0,2,Off,1569406866
...,...,...,...,...,...,...,...,...,...,...
87965,138640,1622736862,1344,22,43,1018,23,2,Off,1622736447
87966,138642,1622737462,1293,22,43,1018,34,2,Off,1622736963
87967,138644,1622738063,1223,22,43,1018,34,2,Off,1622736963
87968,138646,1622738663,1171,22,43,1018,34,2,Off,1622738463


# Linear Regression: Data Cleaning 

In [50]:
df = df.drop(columns=["time", "state_time", "id", "sensor_id", "state"])
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc
0,202,25,50,1015,0
1,258,25,51,1015,0
2,202,24,51,1015,0
3,182,24,51,1015,0
4,189,24,51,1015,0
...,...,...,...,...,...
87965,1344,22,43,1018,23
87966,1293,22,43,1018,34
87967,1223,22,43,1018,34
87968,1171,22,43,1018,34


In [31]:
##df['state'] = df['state'].replace({'Off': 0, 'On': 1})

In [8]:
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc,state
0,202,25,50,1015,0,0
1,258,25,51,1015,0,1
2,202,24,51,1015,0,0
3,182,24,51,1015,0,0
4,189,24,51,1015,0,0
...,...,...,...,...,...,...
87965,1344,22,43,1018,23,0
87966,1293,22,43,1018,34,0
87967,1223,22,43,1018,34,0
87968,1171,22,43,1018,34,0


In [52]:
df_summer = df.iloc[36000:48000]
df_summer.head()


Unnamed: 0,radon,temperature,humidity,pressure,tvoc
36000,388,24,57,1010,4
36001,383,24,57,1010,2
36002,398,24,57,1010,6
36003,388,24,57,1010,8
36004,388,24,57,1010,10


In [33]:
##df = df.drop(columns=["state"])

In [53]:
df_summer

Unnamed: 0,radon,temperature,humidity,pressure,tvoc
36000,388,24,57,1010,4
36001,383,24,57,1010,2
36002,398,24,57,1010,6
36003,388,24,57,1010,8
36004,388,24,57,1010,10
...,...,...,...,...,...
47995,172,26,52,1011,63
47996,168,26,52,1011,65
47997,202,26,52,1011,62
47998,202,26,52,1011,54


In [54]:
summer_min = df_summer.min() # needed to de-normalize data
summer_max = df_summer.max()

df_summer_normalized = (df_summer - summer_min) / (summer_max - summer_min)
df_summer_normalized.head()

Unnamed: 0,radon,temperature,humidity,pressure,tvoc
36000,0.11964,0.714286,0.666667,0.5,0.003463
36001,0.117911,0.714286,0.666667,0.5,0.001732
36002,0.123098,0.714286,0.666667,0.5,0.005195
36003,0.11964,0.714286,0.666667,0.5,0.006926
36004,0.11964,0.714286,0.666667,0.5,0.008658


In [56]:
df = df_summer_normalized

In [57]:
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc
36000,0.119640,0.714286,0.666667,0.500000,0.003463
36001,0.117911,0.714286,0.666667,0.500000,0.001732
36002,0.123098,0.714286,0.666667,0.500000,0.005195
36003,0.119640,0.714286,0.666667,0.500000,0.006926
36004,0.119640,0.714286,0.666667,0.500000,0.008658
...,...,...,...,...,...
47995,0.044952,1.000000,0.481481,0.541667,0.054545
47996,0.043568,1.000000,0.481481,0.541667,0.056277
47997,0.055325,1.000000,0.481481,0.541667,0.053680
47998,0.055325,1.000000,0.481481,0.541667,0.046753


# Linear Regression: Model Train/Test

In [9]:
from itertools import chain, combinations
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [14]:
independent_vars = ["temperature", "humidity", "pressure", "tvoc", "state"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store R-squared scores
r2_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train a linear regression model
    model = LinearRegression()
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)

    # Store the R-squared score in the dictionary
    r2_scores[combo] = r2

# Find the combination with the highest R-squared score
best_combo = max(r2_scores, key=r2_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest R-squared score:", r2_scores[best_combo])

Best combination of independent variables: ('pressure', 'tvoc', 'state')
Highest R-squared score: 0.17172401496142042


# Logistic Regression: Data Cleaning

In [60]:
df_logistic_reg = pd.read_csv('../data/radon-data.csv')
df_logistic_reg = df_logistic_reg.drop(columns=["time", "state_time", "id", "sensor_id"])
df_logistic_reg['state'] = df_logistic_reg['state'].replace({'Off': 0, 'On': 1})
df_logistic_reg['radon_binary'] = df_logistic_reg['radon'].apply(lambda x: 1 if x > 300 else 0)
correlation = df_logistic_reg['radon_binary'].corr(df_logistic_reg['state'])
print(correlation)


0.3717580959030975


In [54]:
df_logistic_reg['radon_binary'] = df_logistic_reg['radon'].apply(lambda x: 1 if x > 300 else 0)
df_logistic_reg = df_logistic_reg.drop(columns=["radon"])
df_logistic_reg
##correlation = df_logistic_reg['radon_binary'].corr(df_logistic_reg['state'])
##print(correlation)

KeyError: 'radon'

# Logistic Regression: Model Train/Test

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from itertools import chain, combinations

# Independent variables
independent_vars = ["temperature", "humidity", "pressure", "tvoc", "state"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon_binary"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df_logistic_reg[independent_vars], df_logistic_reg[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store accuracy scores
accuracy_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train a logistic regression model
    model = LogisticRegression(max_iter=1000) # Increase max_iter if the algorithm does not converge
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # Store the accuracy score in the dictionary
    accuracy_scores[combo] = accuracy

# Find the combination with the highest accuracy score
best_combo = max(accuracy_scores, key=accuracy_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest accuracy score:", accuracy_scores[best_combo])

Best combination of independent variables: ('temperature', 'state')
Highest accuracy score: 0.7175741730135273


# Neural Network: Model Train/Test

In [21]:
from sklearn.neural_network import MLPRegressor
from itertools import chain, combinations


# Independent variables
independent_vars = ["temperature", "humidity", "pressure", "tvoc", "state"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store R-squared scores
r2_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train a neural network regression model
    model = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42) # Adjust hyperparameters as needed
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)

    # Store the R-squared score in a dictionary
    r2_scores[combo] = r2
    
    #Find the combination with the highest R-squared score
    best_combo = max(r2_scores, key=r2_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest R-squared score:", r2_scores[best_combo])

Best combination of independent variables: ('temperature', 'humidity', 'pressure', 'state')
Highest R-squared score: 0.2045970236774428


# Decision Trees

In [58]:
df

Unnamed: 0,radon,temperature,humidity,pressure,tvoc
36000,0.119640,0.714286,0.666667,0.500000,0.003463
36001,0.117911,0.714286,0.666667,0.500000,0.001732
36002,0.123098,0.714286,0.666667,0.500000,0.005195
36003,0.119640,0.714286,0.666667,0.500000,0.006926
36004,0.119640,0.714286,0.666667,0.500000,0.008658
...,...,...,...,...,...
47995,0.044952,1.000000,0.481481,0.541667,0.054545
47996,0.043568,1.000000,0.481481,0.541667,0.056277
47997,0.055325,1.000000,0.481481,0.541667,0.053680
47998,0.055325,1.000000,0.481481,0.541667,0.046753


### Decision Trees: Model Train/Test w/Cross-Validation

In [59]:
import xgboost as xgb
from itertools import chain, combinations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Independent variables
independent_vars = ["temperature", "humidity", "pressure", "tvoc"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store mean cross-validation scores
mean_cv_scores = {}

# Number of folds for cross-validation
cv_folds = 5

# Iterate over each combination of independent variables
for combo in combinations:
    # Train an XGBoost regression model
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42) # Adjust hyperparameters as needed
    
    # Perform cross-validation and calculate mean score
    cv_scores = cross_val_score(model, X_train[list(combo)], y_train, cv=cv_folds, scoring='r2')
    mean_cv_score = np.mean(cv_scores)
    
    # Store the mean cross-validation score in a dictionary
    mean_cv_scores[combo] = mean_cv_score

# Find the combination with the highest mean cross-validation score
best_combo = max(mean_cv_scores, key=mean_cv_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest mean cross-validation score:", mean_cv_scores[best_combo])

# Train the final model using the best combination of independent variables and the entire training set
best_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42) # Adjust hyperparameters as needed
best_model.fit(X_train[list(best_combo)], y_train)

# Make predictions using the test set
y_pred = best_model.predict(X_test[list(best_combo)])

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)

print("R-squared score on the test set:", r2)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error on the test set:", mse)

Best combination of independent variables: ('temperature', 'humidity', 'pressure', 'tvoc')
Highest mean cross-validation score: 0.6095759302301081
R-squared score on the test set: 0.6134782471496891
Mean Squared Error on the test set: 0.0009671201704714494


# Decision Trees: Model Train/Test

In [60]:
import xgboost as xgb
from itertools import chain, combinations
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Independent variables
independent_vars = ["temperature", "humidity", "pressure", "tvoc"]

# Function to get all combinations of the independent variables
def all_combinations(variables):
    return list(chain(*map(lambda x: combinations(variables, x), range(1, len(variables) + 1))))

# Get all combinations of independent variables
combinations = all_combinations(independent_vars)

# Dependent variable
dependent_var = "radon"

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

# Initialize a dictionary to store R-squared scores
r2_scores = {}
mae_scores = {}
mse_scores = {}

# Iterate over each combination of independent variables
for combo in combinations:
    # Train an XGBoost regression model
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42) # Adjust hyperparameters as needed
    model.fit(X_train[list(combo)], y_train)

    # Make predictions using the test set
    y_pred = model.predict(X_test[list(combo)])

    # Calculate R-squared score
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Store the R-squared score in a dictionary
    r2_scores[combo] = r2
    mae_scores[combo] = mae
    mse_scores[combo] = mse

# Find the combination with the highest R-squared score
best_combo = max(r2_scores, key=r2_scores.get)

print("Best combination of independent variables:", best_combo)
print("Highest R-squared score:", r2_scores[best_combo])
print("Mean absolute error:", mae_scores[best_combo])
print("Mean squared error:", mse_scores[best_combo])

Best combination of independent variables: ('temperature', 'humidity', 'pressure', 'tvoc')
Highest R-squared score: 0.6134782471496891
Mean absolute error: 0.02014222705939061
Mean squared error: 0.0009671201704714494


## Best Model

In [61]:
X_train, X_test, y_train, y_test = train_test_split(df[independent_vars], df[dependent_var], test_size=0.2, random_state=42)

summer_min = df_summer.min()
summer_max = df_summer.max()

r2_best_scores = {}
mae_best_scores = {}
mse_best_scores = {}

best_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
best_model.fit(X_train[list(best_combo)], y_train)

y_pred = best_model.predict(X_test[list(best_combo)])

r2_best = r2_score(y_test, y_pred)
mae_best = mean_absolute_error(y_test, y_pred)
mse_best = mean_squared_error(y_test, y_pred)

mse_unnormalized = mse_best * (summer_max["radon"] - summer_min["radon"])**2
mae_unnormalized = mae_best * (summer_max["radon"] - summer_min["radon"])

print("Best combination of independent variables:", best_combo)
print("Highest R-squared score:", r2_best)
print("MSE:", mse_unnormalized)
print("MAE:", mae_unnormalized)
print("MSE:", mse_best)
print("MAE:", mae_best)

Best combination of independent variables: ('temperature', 'humidity', 'pressure', 'tvoc')
Highest R-squared score: 0.6134782471496891
MSE: 8088.668153445924
MAE: 58.25132065575764
MSE: 0.0009671201704714494
MAE: 0.02014222705939061


### Predicting new Data

In [62]:
summer_min = df_summer.min()
summer_max = df_summer.max()

independent_min = df_summer.drop(columns=["radon"]).min()
independent_max = df_summer.drop(columns=["radon"]).max()

new_data = {"temperature": 25, "humidity": 50, "pressure": 1017, "tvoc": 2}

new_data_df = pd.DataFrame(new_data, index=[0])

new_data_normalized = (new_data_df - independent_min) / (independent_max - independent_min)

prediction = best_model.predict(new_data_normalized.values.reshape(1, -1))[0]
##prediction = best_model.predict(pd.DataFrame([new_data_normalized])[list(best_combo)])

unnormalized_prediction = prediction * (summer_max["radon"] - summer_min["radon"]) + summer_min["radon"]

print("Predicted radon concentration", unnormalized_prediction)

Predicted radon concentration 322.53340280056
