In [1]:
# http://db.csail.mit.edu/labdata/labdata.html

In [10]:
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression # importa o modelo
from sklearn.metrics import mean_squared_error
# import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import sklearn.manifold as skm
from sklearn import metrics
import scipy.stats as stats
import seaborn as sns
import random as rdn
import pandas as pd
import numpy as np
import scipy as sp
import datetime
import time

## Reading Dataset

In [3]:
# setting names to columns
data_column_names = ["Date", "Time", "Epoch", "Moteid", "Temperature", "Humidity", "Light", "Voltage"]
data = pd.read_csv("data.txt", delimiter = " ", names = data_column_names)

In [3]:
data.head(3)

Unnamed: 0,Date,Time,Epoch,Moteid,Temperature,Humidity,Light,Voltage
0,2004-03-31,03:38:15.757551,2,1.0,122.153,-3.91901,11.04,2.03397
1,2004-02-28,00:59:16.02785,3,1.0,19.9884,37.0933,45.08,2.69964
2,2004-02-28,01:03:16.33393,11,1.0,19.3024,38.4629,45.08,2.68742


### Null values

In [4]:
# removing null and inf id's
data.dropna(subset = ['Moteid'], inplace = True)
data['Moteid'] = data['Moteid'].astype(np.int) # moteid id the id sensor in the lab

# change features order
data = data[["Moteid", "Epoch", "Humidity", "Light", "Voltage", "Date", "Time", "Temperature"]]

In [5]:
data.head(3)

Unnamed: 0,Moteid,Epoch,Humidity,Light,Voltage,Date,Time,Temperature
0,1,2,-3.91901,11.04,2.03397,2004-03-31,03:38:15.757551,122.153
1,1,3,37.0933,45.08,2.69964,2004-02-28,00:59:16.02785,19.9884
2,1,11,38.4629,45.08,2.68742,2004-02-28,01:03:16.33393,19.3024


In [17]:
data.isnull().sum()/data.shape[0] * 100

Moteid         0.000000
Epoch          0.000000
Humidity       0.016255
Light          4.035698
Voltage        0.000000
Date           0.000000
Time           0.000000
Temperature    0.016212
dtype: float64

## Change dataset

In [5]:
# dataset to model
new_data = data[data['Epoch'] > 1].copy()
new_data = new_data.dropna() # removing missing data
new_data.isnull().sum() # check

# excluindo épocas nulas
new_data = new_data[(new_data["Moteid"] < 54) & (new_data["Moteid"] > 1)]  

In [5]:
new_data['Epoch'].unique().shape

(65534,)

## Creating a model

## 100 firts Epochs

In [6]:
# array of epochs
epochs = new_data['Epoch'].unique()
epochs = epochs[:100]

In [10]:
epochs

array([  2,  21,  25,  58,  61,  72,  74,  78,  79,  81,  83,  84,  86,
        88,  89,  94,  95,  96,  97,  98, 104, 105, 108, 109, 112, 113,
       117, 129, 130, 131, 133, 137, 138, 139, 140, 141, 142, 143, 155,
       161, 164, 165, 171, 174, 180, 182, 183, 184, 185, 189, 190, 191,
       193, 195, 197, 206, 207, 209, 218, 219, 220, 221, 222, 223, 227,
       233, 234, 236, 240, 241, 242, 244, 245, 248, 250, 251, 260, 262,
       263, 264, 267, 268, 272, 277, 279, 281, 282, 284, 291, 293, 294,
       296, 299, 300, 302, 304, 306, 312, 314, 315])

In [11]:
new_data.shape  # Número de linhas e colunas

(2138311, 8)

In [7]:
new_data = new_data[new_data['Epoch'].isin(epochs)]

In [8]:
regrGeral = LinearRegression()
fig = plt.figure()
ax = fig.gca(projection='3d')

for feature in ['Light', 'Voltage', 'Humidity']:
    for ep in new_data:        
        X_train, y_train = new_data[[feature]], new_data['Temperature']
        regrGeral.fit(X_train, y_train) # fitting model

        ax.plot_surface(X_train, y_train, Z, rstride=1, cstride=1, alpha=0.5)

        ax.scatter(x, y, z, c='r', s=10,  depthshade=True)
        plt.xlabel('X')
        plt.ylabel('Y')
        ax.set_zlabel('Z')
        ax.axis('equal')
        ax.axis('tight')
        plt.show()

        
#epochs_error_mean
# pd.DataFrame(np.array(features_G).T, columns = ['Temp~Light', 'Temp~Voltage', 'Temp~Humidity']).set_index(epochs).reset_index().rename(columns = {'index': 'Epoch'})
#np.array(features_).shape

In [17]:
# !pip install statsmodels

In [18]:
import statsmodels.formula.api as smf
model = smf.ols(formula='Temperature ~ Light + Voltage + Humidity', data=new_data)
regrGeralF = model.fit()
regrGeralF.params

ModuleNotFoundError: No module named 'statsmodels'

In [14]:
x_surf, y_surf = np.meshgrid(np.linspace(new_data.Light.min(), new_data.Light.max(), 100),np.linspace(new_data.Voltage.min(), new_data.Voltage.max(), 100))
onlyX = pd.DataFrame({'Light': x_surf.ravel(), 'Voltage': y_surf.ravel()})
fittedY=regrGeralF.predict(exog=onlyX)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(new_data['Light'],df2['Voltage'],df2['Humidity'],c='blue', marker='o', alpha=0.5)
ax.plot_surface(x_surf,y_surf,fittedY.reshape(x_surf.shape), color='None', alpha=0.01)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.show()

TypeError: predict() got an unexpected keyword argument 'exog'

In [11]:
new_data.columns

Index(['Moteid', 'Epoch', 'Humidity', 'Light', 'Voltage', 'Date', 'Time',
       'Temperature'],
      dtype='object')

Inicialmente faremos a regressão para a variavel algo sendo a Temperatura e a dependente sendo a Luz, Voltaem, Umidade respectivamente.

## Validação por Epoch. Exemplo: Predizendo uma época com base nas outras

In [14]:
regr = LinearRegression()

features_ = []
for feature in ['Light', 'Voltage', 'Humidity']:
    epochs_error_mean = []
    epochs_error_sqrt = []
    for ep in epochs:
        
        X_train, y_train = new_data[new_data['Epoch'] != ep][[feature]], new_data[new_data['Epoch'] != ep]['Temperature']
        X_test, y_test = new_data[new_data['Epoch'] == ep][[feature]], new_data[new_data['Epoch'] == ep]['Temperature']

        regr.fit(X_train, y_train) # fitting model

        y_pred = regr.predict(X_test)
        error = mean_squared_error(y_test, y_pred)

        epochs_error_mean.append(error) # array of mean squared errors
        epochs_error_sqrt.append(np.sqrt(error)) # array of sqrt mean squared error
    
    features_.append(epochs_error_mean)
#epochs_error_mean
pd.DataFrame(np.array(features_).T, columns = ['Temp~Light', 'Temp~Voltage', 'Temp~Humidity']).set_index(epochs).reset_index().rename(columns = {'index': 'Epoch'})
#np.array(features_).shape

Unnamed: 0,Epoch,Temp~Light,Temp~Voltage,Temp~Humidity
0,2,3961.620902,2503.952731,810.362194
1,21,577.129061,404.406692,18.952505
2,25,563.288685,384.665321,17.007517
3,58,585.127220,294.619555,16.060900
4,61,2.964432,26.905376,8.729131
5,72,2.059346,35.836802,12.249856
6,74,3.292753,9.723325,8.498385
7,78,3.307362,10.978189,11.907452
8,79,2.948696,34.542386,8.574409
9,81,2.898952,9.810723,9.343079


## Testando o modelo para cada medição da época

In [15]:
 #Exemplo de um modelo para esimar a umidade com base na temperatura
X2 = new_data[['Humidity']]
y2 = new_data['Temperature']
regr2 = LinearRegression()
regr2.fit(X2, y2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Iterando em cada época e pegando o erro quadrático da regressão.
## Em seguida pegamos a médis dos erros em cada época.

In [1]:

mean_epoch=[]
mean_epoch_mean=[]

for i in new_data.Epoch.unique():
    aux = new_data[new_data['Epoch'] == i]
#     print(aux.shape)
    scores = cross_val_score(regr2, X2, y2, cv=aux.shape[0], scoring='mean_squared_error')
    mean_epoch_mean.append(abs(scores).mean())  
#     mean_epoch.append(abs(scores)) 
#     if count==38:
#         break

# Erro quadrático médio para cada as 100 primeiras épocas 

In [22]:
print(mean_epoch_mean) ## Erro quadrático médio da temperatura em relação a umidade

[21.365476419244047, 21.345127791318813, 21.41650913057723, 21.22361025860069, 21.22361025860069, 21.315466508895778, 21.41650913057723, 21.40101601614022, 21.345127791318813, 21.19542209264754, 21.40101601614022, 21.122002400196397, 21.41127763320491, 21.110040527485125, 21.41650913057723, 21.122002400196397, 21.504703651760057, 21.18825647176594, 21.315466508895778, 21.122002400196397, 21.345127791318813, 21.315466508895778, 21.315466508895778, 21.40101601614022, 21.19542209264754, 21.19542209264754, 21.315466508895778, 21.504703651760057, 21.40101601614022, 21.345127791318813, 21.504703651760057, 21.41650913057723, 21.110040527485125, 21.504703651760057, 21.110040527485125, 21.40101601614022, 21.110040527485125, 21.315466508895778, 21.110040527485125, 21.40101601614022, 21.19542209264754, 21.504703651760057, 21.315466508895778, 21.38466315358345, 21.41650913057723, 21.40101601614022, 21.40101601614022, 21.315466508895778, 21.122002400196397, 21.42764960649494, 21.110040527485125, 21

In [23]:
 #Exemplo de um modelo para esimar a Luz com base na temperatura
X3 = new_data[['Light']]
y3 = new_data['Temperature']
regr3 = LinearRegression()
regr3.fit(X3, y3)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [2]:

mean_epoch2=[]
mean_epoch_mean2=[]

for i in new_data.Epoch.unique():
    aux = new_data[new_data['Epoch'] == i]
#     print(aux.shape)
    scores = cross_val_score(regr3, X3, y3, cv=aux.shape[0], scoring='mean_squared_error')
    mean_epoch_mean2.append(abs(scores).mean())  
#     mean_epoch2.append(abs(scores)) 
#     if count==38:
#         break

In [36]:
print(mean_epoch_mean2)  # Erro quadrático médio da temperatura em relação a luz

[111.06417843452626, 112.21860772454622, 111.65595808343882, 112.68097615039133, 112.68097615039133, 112.20795083244057, 111.65595808343882, 110.84074151532364, 112.21860772454622, 110.38413465564919, 110.84074151532364, 110.58397788073498, 110.70501819776668, 112.78615138164102, 111.65595808343882, 110.58397788073498, 111.93317807096979, 111.93834730744598, 112.20795083244057, 110.58397788073498, 112.21860772454622, 112.20795083244057, 112.20795083244057, 110.84074151532364, 110.38413465564919, 110.38413465564919, 112.20795083244057, 111.93317807096979, 110.84074151532364, 112.21860772454622, 111.93317807096979, 111.65595808343882, 112.78615138164102, 111.93317807096979, 112.78615138164102, 110.84074151532364, 112.78615138164102, 112.20795083244057, 112.78615138164102, 110.84074151532364, 110.38413465564919, 111.93317807096979, 112.20795083244057, 110.4495243589996, 111.65595808343882, 110.84074151532364, 110.84074151532364, 112.20795083244057, 110.58397788073498, 112.236171625495, 11

In [26]:
 #Exemplo de um modelo para esimar a Luz com base na voltagem
X4 = new_data[['Voltage']]
y4 = new_data['Temperature']
regr4 = LinearRegression()
regr4.fit(X4, y4)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [3]:

mean_epoch3=[]
mean_epoch_mean3=[]

for i in new_data.Epoch.unique():
    aux = new_data[new_data['Epoch'] == i]
#     print(aux.shape)
    scores = cross_val_score(regr4, X4, y4, cv=aux.shape[0], scoring='mean_squared_error')
    mean_epoch_mean3.append(abs(scores).mean())  
#     mean_epoch3.append(abs(scores)) 
#     if count==38:
#         break

In [38]:
print(mean_epoch_mean3)  # Erro quadrático médio da temperatura em relação a Voltagem

[88.06393007061043, 84.54332945390902, 101.60108809751385, 94.34960608228698, 94.34960608228698, 101.69731184454491, 101.60108809751385, 84.25748650468441, 84.54332945390902, 93.31899896535019, 84.25748650468441, 95.84355193885841, 81.77428412648607, 91.80241372228788, 101.60108809751385, 95.84355193885841, 101.8567893631672, 87.35379940473494, 101.69731184454491, 95.84355193885841, 84.54332945390902, 101.69731184454491, 101.69731184454491, 84.25748650468441, 93.31899896535019, 93.31899896535019, 101.69731184454491, 101.8567893631672, 84.25748650468441, 84.54332945390902, 101.8567893631672, 101.60108809751385, 91.80241372228788, 101.8567893631672, 91.80241372228788, 84.25748650468441, 91.80241372228788, 101.69731184454491, 91.80241372228788, 84.25748650468441, 93.31899896535019, 101.8567893631672, 101.69731184454491, 101.12752739263861, 101.60108809751385, 84.25748650468441, 84.25748650468441, 101.69731184454491, 95.84355193885841, 82.27417720766199, 91.80241372228788, 82.2741772076619