In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import pandas as pd
import numpy as np

from itertools import cycle, islice
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
%matplotlib inline


# minute weather Data description



The "minute weather dataset** comes from the same source as the daily weather dataset that we used in the decision tree based classifier notebook. The main difference between these two datasets is that the minute weather dataset contains raw sensor measurements captured at one-minute intervals. Daily weather dataset instead contained processed and well curated data. The data is in the file **minute_weather.csv", which is a comma-separated file.

As with the daily weather data, this data comes from a weather station located in San Diego, California. The weather station is equipped with sensors that capture weather-related measurements such as air temperature, air pressure, and relative humidity. Data was collected for a period of three years, from September 2011 to September 2014, to ensure that sufficient data for different seasons and weather conditions is captured.

Each row in minute_weather.csv contains weather data captured for a one-minute interval. Each row, or sample, consists of the following variables:

• rowID: unique number for each row (Unit: NA)

• hpwren_timestamp: timestamp of measure (Unit: year-month-day hour minute second)

• air pressure: air pressure measured at the timestamp (Unit: hectopascals)

• air_temp: air temperature measure at the timestamp (Unit. degrees Fahrenheit)

• avg_wind_direction: wind direction averaged over the minute before the timestamp (Unit: degrees, with 0 means coming from the North, and increasing clockwise)

• avg_wind_speed: wind speed averaged over the minute before the timestamp (Unit: meters per second) 

• max_wind_direction: highest wind direction in the minute before the timestamp (Unit: degrees, with 0 being North and increasing clockwise)

• max_wind_speed: highest wind speed in the minute before the timestamp (Unit: meters per second)

• min_wind_direction: smallest wind direction in the minute before the timestamp (Unit degrees, with 0 being North and inceasing clockwise)

• min_wind_speed: smallest wind speed in the minute before the timestamp (Unit meters per second) 

• rain_accumulation: amount of accumulated rain measured at the timestamp (Unit. millimeters)

• rain_duration: length of time rain has fallen as measured at the timestamp (Unit: seconds)

• relative humidity: relative humidity measured at the timestamp (Unit percent)

In [2]:
data=pd.read_csv('D:/py/minute_weather.csv')

In [3]:
data.shape

(1587257, 13)

In [4]:
data.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [16]:
# lots of row here. lets take sample by taking every 10 row

sampled_df = data[(data['rowID'] % 10) == 0]
sampled_df.shape

(158726, 13)

In [17]:
sampled_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,158726.0,793625.0,458203.937509,0.0,396812.5,793625.0,1190437.5,1587250.0
air_pressure,158726.0,916.830161,3.051717,905.0,914.8,916.7,918.7,929.5
air_temp,158726.0,61.851589,11.833569,31.64,52.7,62.24,70.88,99.5
avg_wind_direction,158680.0,162.1561,95.278201,0.0,62.0,182.0,217.0,359.0
avg_wind_speed,158680.0,2.775215,2.057624,0.0,1.3,2.2,3.8,31.9
max_wind_direction,158680.0,163.462144,92.452139,0.0,68.0,187.0,223.0,359.0
max_wind_speed,158680.0,3.400558,2.418802,0.1,1.6,2.7,4.6,36.0
min_wind_direction,158680.0,166.774017,97.441109,0.0,76.0,180.0,212.0,359.0
min_wind_speed,158680.0,2.134664,1.742113,0.0,0.8,1.6,3.0,31.6
rain_accumulation,158725.0,0.000318,0.011236,0.0,0.0,0.0,0.0,3.12


In [18]:
sampled_df[sampled_df['rain_accumulation'] == 0].shape

(157812, 13)

In [19]:
sampled_df[sampled_df['rain_duration'] == 0].shape

(157237, 13)

In [20]:
# drop all rows of empty rain_duration and rain_accumalation

del sampled_df['rain_accumulation']
del sampled_df['rain_duration'] 

In [21]:
rows_before = sampled_df.shape[0]
sampled_df = sampled_df.dropna()
rows_after = sampled_df.shape[0]

In [22]:
# how many rows did we drope
rows_before - rows_after

46

In [23]:
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [24]:
features = ['air_pressure','air_temp','min_wind_direction','min_wind_speed','min_wind_direction','max_wind_speed','relative_humidity']

In [25]:
select_df = sampled_df[features]

In [26]:
select_df.columns

Index(['air_pressure', 'air_temp', 'min_wind_direction', 'min_wind_speed',
       'min_wind_direction', 'max_wind_speed', 'relative_humidity'],
      dtype='object')

In [27]:
select_df

Unnamed: 0,air_pressure,air_temp,min_wind_direction,min_wind_speed,min_wind_direction.1,max_wind_speed,relative_humidity
0,912.3,64.76,85.0,1.0,85.0,1.6,60.5
10,912.3,62.24,115.0,0.6,115.0,1.8,38.5
20,912.2,63.32,91.0,1.5,91.0,2.5,58.3
30,912.2,62.60,71.0,1.4,71.0,2.4,57.9
40,912.2,64.04,68.0,1.4,68.0,2.9,57.4
...,...,...,...,...,...,...,...
1587210,915.9,75.56,310.0,0.8,310.0,1.3,47.8
1587220,915.9,75.56,316.0,0.9,316.0,1.4,48.0
1587230,915.9,75.56,338.0,1.2,338.0,1.7,48.0
1587240,915.9,75.20,347.0,1.0,347.0,1.6,46.3


In [28]:
# scale the features using standard scaler
# Standarsd scaler: is use to convert column data into a range
# Countvector= use to convert row data into a range
# Fit transform: use with above both method. This simply implement the conversation task.

X = StandardScaler().fit_transform(select_df)
X

array([[-1.48456281,  0.24544455, -0.8392174 , ..., -0.8392174 ,
        -0.74440309,  0.49233835],
       [-1.48456281,  0.03247142, -0.53133816, ..., -0.53133816,
        -0.66171726, -0.34710804],
       [-1.51733167,  0.12374562, -0.77764156, ..., -0.77764156,
        -0.37231683,  0.40839371],
       ...,
       [-0.30488381,  1.15818654,  1.75723085, ...,  1.75723085,
        -0.70306017,  0.01538018],
       [-0.30488381,  1.12776181,  1.84959462, ...,  1.84959462,
        -0.74440309, -0.04948614],
       [-0.30488381,  1.09733708,  1.8701199 , ...,  1.8701199 ,
        -0.62037434, -0.05711747]])

In [29]:
# use KMeans clustering 
# (n_clusters = 12) its create 12 cluster. we have to deside how many cluster we want to create.

kmeans = KMeans(n_clusters = 12)
model = kmeans.fit(X)
print('model:\n', model)

# we have formed 12 clusters

model:
 KMeans(n_clusters=12)


In [30]:
centers = model.cluster_centers_
centers
# we can see here total 12 cluster are created 

array([[ 0.69812977,  0.57049415,  0.21166664, -0.52457704,  0.21166664,
        -0.50619722, -0.75560225],
       [-0.19005487,  0.81507113, -1.30365382, -0.54881037, -1.30365382,
        -0.6446248 , -0.59672298],
       [ 0.06005938, -0.971925  ,  0.2131141 , -0.43631308,  0.2131141 ,
        -0.42135192,  1.20121337],
       [-1.13771802, -0.79692146,  0.28510162,  2.09568135,  0.28510162,
         1.97845144,  0.84281407],
       [ 1.35945621, -0.07120849, -1.30460039, -0.04138299, -1.30460039,
        -0.0056431 , -0.97639383],
       [ 1.18160254, -0.27958799, -1.31947656,  1.94393916, -1.31947656,
         2.24484717, -1.12694476],
       [-0.96138705, -1.24668187,  0.20878598,  0.51373548,  0.20878598,
         0.53722093,  1.38298398],
       [-0.20350944,  0.60161285,  0.24994962,  0.83785004,  0.24994962,
         0.68598097, -0.15159562],
       [ 0.09235571,  0.87601891,  1.55485441, -0.6430895 ,  1.55485441,
        -0.55033496, -0.79159617],
       [-0.0554101 , -0.7655