In [1]:
import pandas as pd

# S3 bucket holding the sensor data, and the object key of the first file.
bucket = 'kizsensordata'
data_key = 'datatraining.txt'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Column 1 of the file becomes the index and is parsed as dates.
# `squeeze=True` was dropped: it only affects single-column results (this
# frame is indexed by column further down), is deprecated since pandas 1.4
# and no longer accepted by pandas 2.0.
data1 = pd.read_csv(data_location, header=0, index_col=1, parse_dates=True)

In [2]:
# Second file of the dataset, read from the same bucket.
data_key2 = 'datatest.txt'
data_location2 = 's3://{}/{}'.format(bucket, data_key2)

# Column 1 becomes the date-parsed index. `squeeze=True` was dropped: it only
# affects single-column results, is deprecated since pandas 1.4 and no longer
# accepted by pandas 2.0.
data2 = pd.read_csv(data_location2, header=0, index_col=1, parse_dates=True)

In [3]:
# Third file of the dataset, read from the same bucket.
data_key3 = 'datatest2.txt'
data_location3 = 's3://{}/{}'.format(bucket, data_key3)

# Column 1 becomes the date-parsed index. `squeeze=True` was dropped: it only
# affects single-column results, is deprecated since pandas 1.4 and no longer
# accepted by pandas 2.0.
data3 = pd.read_csv(data_location3, header=0, index_col=1, parse_dates=True)

In [4]:
import matplotlib.pyplot as pyplot
from pandas.plotting import register_matplotlib_converters

# Register pandas' datetime converters explicitly, as the warning printed
# under this cell asks, so plotting against the date index does not rely on
# implicit registration.
register_matplotlib_converters()

n_features = data1.values.shape[1]
# Column 0 is skipped by the loop, so only n_features - 1 panels are drawn;
# sizing the grid to that count removes the empty trailing row.
n_panels = n_features - 1
pyplot.figure()
for i in range(1, n_features):
	# select the subplot for this column
	pyplot.subplot(n_panels, 1, i)
	# overlay the same column from each of the three files
	pyplot.plot(data1.index, data1.values[:, i])
	pyplot.plot(data2.index, data2.values[:, i])
	pyplot.plot(data3.index, data3.values[:, i])
	# label the panel with the column name, placed mid-height on the right
	pyplot.title(data1.columns[i], y=0.5, loc='right')
pyplot.show()


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


<Figure size 640x480 with 6 Axes>

In [5]:
# Stack the three frames row-wise, in file order, into one dataset and write
# it to disk; later cells reload it from 'combined.csv'.
frames = [data1, data2, data3]
data = pd.concat(frames)
data.to_csv('combined.csv')

In [6]:
# Reload the aggregated dataset; column 0 of the CSV (the index written by
# to_csv) becomes the date-parsed index again.
# `squeeze=True` was dropped: it only affects single-column results, is
# deprecated since pandas 1.4 and no longer accepted by pandas 2.0.
data = pd.read_csv('combined.csv', header=0, index_col=0, parse_dates=True)
values = data.values
# split data into inputs (every column but the last) and output (last column)
X, y = values[:, :-1], values[:, -1]
# split the dataset
from sklearn.model_selection import train_test_split
# Order-preserving 70/30 split: the last 30% of rows form the test set.
# `random_state` is omitted because it only seeds shuffling, which is off.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.3, shuffle=False)

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# make a naive prediction
def naive_prediction(testX, value):
	"""Return a constant forecast: a list holding `value` once per entry of `testX`."""
	return [value] * len(testX)
 
# Baseline: score a constant forecast of each class label (0, then 1)
# against the held-out targets and report the accuracy.
for value in (0, 1):
	yhat = naive_prediction(testX, value)
	score = accuracy_score(testy, yhat)
	print('Naive=%d score=%.3f' % (value, score))

Naive=0 score=0.822
Naive=1 score=0.178


In [8]:
# logistic regression
from pandas import read_csv
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# load the dataset
# `squeeze=True` was dropped: it only affects single-column results, is
# deprecated since pandas 1.4 and no longer accepted by pandas 2.0.
data = read_csv('combined.csv', header=0, index_col=0, parse_dates=True)
values = data.values
# split data into inputs (every column but the last) and output (last column)
X, y = values[:, :-1], values[:, -1]
# split the dataset, keeping row order; `random_state` is omitted because it
# only seeds shuffling, which is off.
# (train_test_split and accuracy_score come from the imports in earlier cells.)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.3, shuffle=False)
# define the model
model = LogisticRegression()
# fit the model on the training set
model.fit(trainX, trainy)
# predict the test set
yhat = model.predict(testX)
# evaluate model skill
score = accuracy_score(testy, yhat)
print(score)

0.9941634241245136




In [9]:
# More EDA - per-column summary statistics (count, mean, std, min, quartiles, max)
summary_stats = data.describe()
display(summary_stats)

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,20560.0,20560.0,20560.0,20560.0,20560.0,20560.0,20560.0
mean,4116.57607,20.906212,27.655925,130.756622,690.553276,0.004228,0.231031
std,2684.372188,1.055315,4.982154,210.430875,311.201281,0.000768,0.421503
min,1.0,19.0,16.745,0.0,412.75,0.002674,0.0
25%,1760.0,20.2,24.5,0.0,460.0,0.003719,0.0
50%,3808.0,20.7,27.29,0.0,565.416667,0.004292,0.0
75%,6378.0,21.525,31.29,301.0,804.666667,0.004832,0.0
max,9752.0,24.408333,39.5,1697.25,2076.5,0.006476,1.0


In [10]:
import numpy as np
# Whole frame as a float32 matrix; column 6 is pulled out as the label vector.
# NOTE(review): `labels` is sliced from `a` and `a` itself is left unchanged,
# so the label column is still part of `a` wherever `a` is later used as the
# feature matrix - confirm this is intended.
a = data.values.astype(np.float32)
labels = a[:, 6]

In [11]:
import sagemaker
import boto3

# Key prefix used for this notebook's objects in the bucket, and the
# SageMaker session handed to the estimator further down.
prefix = 'sagemaker/grades'
sess = sagemaker.Session()

In [12]:
import io
import sagemaker.amazon.common as smac
import os

# Serialise the matrix and its labels into an in-memory buffer using
# write_numpy_to_dense_tensor.
# NOTE(review): `a` still contains the column `labels` was sliced from, so the
# target is uploaded as one of the features - confirm this is intended.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, a, labels)
buf.seek(0)  # rewind so the upload starts from the first byte

key = 'linearlearner'
# S3 object keys always use '/', so the key is built by string formatting
# instead of os.path.join (whose separator depends on the host OS). The same
# key is reused for the URI so the upload target and the URI cannot diverge.
train_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_key)
print('uploaded training data location: {}'.format(s3_train_data))

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

uploaded training data location: s3://kizsensordata/sagemaker/grades/train/linearlearner
training artifacts will be uploaded to: s3://kizsensordata/sagemaker/grades/output


In [13]:
 # Container image for the linear-learner algorithm, keyed by AWS region name.
 containers = {}
 containers['us-east-2'] = '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest'
    
#containers[boto3.Session().region_name]

In [14]:
# Pick the container image for the current boto3 session's region, then build
# the estimator: one ml.m4.xlarge training instance, output under
# `output_location`, using the notebook's execution role and session.
region_image = containers[boto3.Session().region_name]
execution_role = sagemaker.get_execution_role()
linear = sagemaker.estimator.Estimator(
    region_image,
    role=execution_role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

In [15]:
# Shape check: the column count printed here is what `feature_dim` has to
# match in the training cell below.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(20560, 7)


In [16]:
%%time
# feature_dim is read from `a`, the matrix that was serialised and uploaded as
# the training data, instead of a hard-coded 7, so it cannot drift from the
# data if the columns change.
linear.set_hyperparameters(feature_dim=a.shape[1],
                           mini_batch_size=200,
                           predictor_type='binary_classifier')

# Start training, feeding the uploaded data through the 'train' channel.
linear.fit({'train': s3_train_data})

2020-07-02 14:46:49 Starting - Starting the training job...
2020-07-02 14:46:51 Starting - Launching requested ML instances......
2020-07-02 14:47:54 Starting - Preparing the instances for training......
2020-07-02 14:48:55 Downloading - Downloading input data...
2020-07-02 14:49:45 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/02/2020 14:49:47 INFO 140019182729024] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_

In [17]:
# Deploy the fitted estimator on one ml.m4.xlarge instance and keep the
# predictor object that deploy() returns.
linear_predictor = linear.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

-------------!

In [18]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent as CSV text; responses are decoded with the JSON deserializer.
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer
linear_predictor.content_type = 'text/csv'

In [19]:
# Smoke-test the endpoint with the first row of `a`.
# NOTE(review): a[0] is a complete row of `a`, including column 6, which is
# the column `labels` was sliced from - so the label is part of the request.
# Confirm this is intended.
first_row = a[0]
result = linear_predictor.predict(first_row)
print(result)

{'predictions': [{'score': 0.9984032511711121, 'predicted_label': 1}]}


In [20]:
# Show the row that was sent to the endpoint in the previous cell.
a[0, :]

array([1.0000000e+00, 2.3180000e+01, 2.7271999e+01, 4.2600000e+02,
       7.2125000e+02, 4.7929883e-03, 1.0000000e+00], dtype=float32)

# Train data, Test data

In [21]:
#train_data, test_data = np.split(data.sample(frac=1, random_state=1729), [int(0.7 * len(data))])
#print(train_data.shape, test_data.shape)

In [22]:
#result2 = linear_predictor.predict(test_data[0])
#print(result2)
#test_data[0]