This Jupyter Notebook is part of the course [Python for Industry 4.0](https://www.udemy.com/course/python-for-industry-40/?referralCode=D7925A2D76BA4C94CA4E) from [Industry 4.0 Academy](https://www.i40a.com).

Latos© copyright 2022. All Rights Reserved.

# Numerical Computing Libraries - Class



## Numpy

### Numpy array and selecting data

In [None]:
# import library
import numpy as np

In [None]:
my_list = [1, 2, 3]

In [None]:
# transform list into numpy array
my_array_1d = np.array(my_list)
my_array_1d

array([1, 2, 3])

In [None]:
# create an array from 0 to 9 with step 1 (the stop value 10 is excluded)
x = np.arange(0, 10, 1)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# create an array from 0 to 9 with 10 evenly spaced numbers
x = np.linspace(0, 9, 10)
x

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [None]:
my_matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [None]:
my_array_2d = np.array(my_matrix)
my_array_2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [None]:
# shape of the array
my_array_2d.shape

(3, 3)

In [None]:
# select element in second row and third column
my_array_2d[1][2]

6

In [None]:
# same element, selected with a single [row, column] index
my_array_2d[1, 2]

6

In [None]:
# slicing: rows from position 1 onward and columns from position 1 onward
my_array_2d[1:, 1:]

array([[5, 6],
       [8, 9]])

In [None]:
# returns True for all values where the condition is true
ind = my_array_2d > 5
ind

array([[False, False, False],
       [False, False,  True],
       [ True,  True,  True]])

In [None]:
# select only values where the condition is true
my_array_2d[ind]

array([6, 7, 8, 9])

In [None]:
# select the opposite: only values where the condition is false
my_array_2d[~ind]

array([1, 2, 3, 4, 5])

In [None]:
# replace all values where the condition is true by a given value
# work on a copy: assigning through the mask modifies the array in place, and
# my_array_2d is reused with its original values in the cells that follow
my_array_replaced = my_array_2d.copy()
my_array_replaced[ind] = 5
my_array_replaced

array([[1, 2, 3],
       [4, 5, 5],
       [5, 5, 5]])

In [None]:
np.ones((3, 3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [None]:
# array with normally distributed random values (standard normal)
np.random.randn(3, 3)

array([[-1.24298386,  0.46550139,  1.238581  ],
       [ 0.52571835,  0.16374902,  0.07541738],
       [-0.94310784, -0.79735328,  0.48767692]])

In [None]:
# array with uniformly distributed random values in [0, 1)
np.random.rand(3, 3)

array([[0.97482701, 0.40768465, 0.30173115],
       [0.71380032, 0.46380145, 0.12701674],
       [0.34973846, 0.52214612, 0.46813883]])

### Main operations


In [None]:
# add the lists, i.e., concatenate lists
my_list + my_list

[1, 2, 3, 1, 2, 3]

In [None]:
# add the arrays, the arithmetic operator will work element-wise
my_array_1d + my_array_1d

array([2, 4, 6])

In [None]:
# same for multiplication and other operations
my_array_1d * my_array_1d

array([1, 4, 9])

In [None]:
# arithmetic operation on the array with a scalar value:
# the operation is applied to each element
my_array_1d + 3

array([4, 5, 6])

In [None]:
# same for multiplication and other operations
my_array_1d / 3

array([0.33333333, 0.66666667, 1.        ])

In [None]:
# there are many functions built into Numpy
np.mean(my_array_2d)

5.0

In [None]:
np.std(my_array_2d)

2.581988897471611

In [None]:
# some methods too
my_array_2d.mean()

5.0

In [None]:
my_array_2d.sum()

45

In [None]:
my_array_2d.max()

9

In [None]:
np.exp(my_array_2d)

array([[2.71828183e+00, 7.38905610e+00, 2.00855369e+01],
       [5.45981500e+01, 1.48413159e+02, 4.03428793e+02],
       [1.09663316e+03, 2.98095799e+03, 8.10308393e+03]])

In [None]:
np.sin(my_array_2d)

array([[ 0.84147098,  0.90929743,  0.14112001],
       [-0.7568025 , -0.95892427, -0.2794155 ],
       [ 0.6569866 ,  0.98935825,  0.41211849]])

In [None]:
# for each column
np.mean(my_array_2d, axis=0) 

array([4., 5., 6.])

In [None]:
# for each row
np.mean(my_array_2d, axis=1)

array([2., 5., 8.])

## Pandas


### Series and DataFrames

In [None]:
import pandas as pd

In [None]:
# Series from list - index is set automatically
records = [1, 2, 3, 4]
ser = pd.Series(records)
ser

0    1
1    2
2    3
3    4
dtype: int64

In [None]:
# set index
index = ['a', 'b', 'c', 'd']
ser = pd.Series(records, index=index)
ser

a    1
b    2
c    3
d    4
dtype: int64

In [None]:
# Series from dictionary
records_dic = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(data=records_dic)
ser

a    1
b    2
c    3
dtype: int64

In [None]:
# DataFrame from dictionary
records_matrix = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
df = pd.DataFrame(data=records_matrix)
df

Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6


In [None]:
# DataFrame built from a NumPy array
# Fixing the seed makes the random draw reproducible, so everyone gets the same array
np.random.seed(123)
numpy_array = np.random.randn(4, 4)

column_labels = ['col1', 'col2', 'col3', 'col4']
row_labels = ['a', 'b', 'c', 'd']
df = pd.DataFrame(data=numpy_array, index=row_labels, columns=column_labels)
df

Unnamed: 0,col1,col2,col3,col4
a,-1.085631,0.997345,0.282978,-1.506295
b,-0.5786,1.651437,-2.426679,-0.428913
c,1.265936,-0.86674,-0.678886,-0.094709
d,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
# select values
df.values

array([[-1.0856306 ,  0.99734545,  0.2829785 , -1.50629471],
       [-0.57860025,  1.65143654, -2.42667924, -0.42891263],
       [ 1.26593626, -0.8667404 , -0.67888615, -0.09470897],
       [ 1.49138963, -0.638902  , -0.44398196, -0.43435128]])

In [None]:
# select index
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [None]:
# select columns
df.columns

Index(['col1', 'col2', 'col3', 'col4'], dtype='object')

### Selecting data

More about selecting data: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

In [None]:
df

Unnamed: 0,col1,col2,col3,col4
a,-1.085631,0.997345,0.282978,-1.506295
b,-0.5786,1.651437,-2.426679,-0.428913
c,1.265936,-0.86674,-0.678886,-0.094709
d,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
# select column with name 'col1'
df.col1

a   -1.085631
b   -0.578600
c    1.265936
d    1.491390
Name: col1, dtype: float64

In [None]:
# alternative option
df['col1']

a   -1.085631
b   -0.578600
c    1.265936
d    1.491390
Name: col1, dtype: float64

In [None]:
# select two columns by name
df[['col1', 'col2']]

Unnamed: 0,col1,col2
a,-1.085631,0.997345
b,-0.5786,1.651437
c,1.265936,-0.86674
d,1.49139,-0.638902


In [None]:
# select rows by position
df.iloc[:2]

Unnamed: 0,col1,col2,col3,col4
a,-1.085631,0.997345,0.282978,-1.506295
b,-0.5786,1.651437,-2.426679,-0.428913


In [None]:
# select specific rows by a list of positions
df.iloc[[1, 3]]

Unnamed: 0,col1,col2,col3,col4
b,-0.5786,1.651437,-2.426679,-0.428913
d,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
# select rows and columns by position
df.iloc[:2, :2]

Unnamed: 0,col1,col2
a,-1.085631,0.997345
b,-0.5786,1.651437


In [None]:
# replace selected values
df.iloc[:2, :2] = 0
df

Unnamed: 0,col1,col2,col3,col4
a,0.0,0.0,0.282978,-1.506295
b,0.0,0.0,-2.426679,-0.428913
c,1.265936,-0.86674,-0.678886,-0.094709
d,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
# find values based on condition
ind = df < 0
ind

Unnamed: 0,col1,col2,col3,col4
a,False,False,False,True
b,False,False,True,True
c,False,True,True,True
d,False,True,True,True


In [None]:
# replace values based on comparison
df[ind] = -1
df

Unnamed: 0,col1,col2,col3,col4
a,0.0,0.0,0.282978,-1.0
b,0.0,0.0,-1.0,-1.0
c,1.265936,-1.0,-1.0,-1.0
d,1.49139,-1.0,-1.0,-1.0


In [None]:
# use more than one comparison
ind = (df > 0) & (df < 2)
df[ind] = 1
df

Unnamed: 0,col1,col2,col3,col4
a,0.0,0.0,1.0,-1.0
b,0.0,0.0,-1.0,-1.0
c,1.0,-1.0,-1.0,-1.0
d,1.0,-1.0,-1.0,-1.0


In [None]:
# opposite (not): set every value outside the mask to 0
df[~ind] = 0
df

Unnamed: 0,col1,col2,col3,col4
a,0.0,0.0,1.0,0.0
b,0.0,0.0,0.0,0.0
c,1.0,0.0,0.0,0.0
d,1.0,0.0,0.0,0.0


In [None]:
# select data based on value
ind = df['col1'] == 0
ind 

a     True
b     True
c    False
d    False
Name: col1, dtype: bool

In [None]:
# working with large dataset
numpy_array = np.random.randn(1000, 10)

df = pd.DataFrame(numpy_array)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.205930,2.186786,1.004054,0.386186,0.737369,1.490732,-0.935834,1.175829,-1.253881,-0.637752
1,0.907105,-1.428681,-0.140069,-0.861755,-0.255619,-2.798589,-1.771533,-0.699877,0.927462,-0.173636
2,0.002846,0.688223,-0.879536,0.283627,-0.805367,-1.727669,-0.390900,0.573806,0.338589,-0.011830
3,2.392365,0.412912,0.978736,2.238143,-1.294085,-1.038788,1.743712,-0.798063,0.029683,1.069316
4,0.890706,1.754886,1.495644,1.069393,-0.772709,0.794863,0.314272,-1.326265,1.417299,0.807237
...,...,...,...,...,...,...,...,...,...,...
995,-0.114699,0.745163,-0.635988,0.848133,-0.585542,0.086091,-0.101158,-0.418401,0.490092,0.703430
996,1.397362,-1.419400,2.826198,0.793412,0.293470,0.652308,0.567850,1.478910,-0.400071,-1.667558
997,-1.350780,-0.112624,0.539086,-0.537706,-1.060740,-1.285079,1.071255,1.910933,0.649171,-0.597918
998,-1.397932,-0.472266,0.581964,0.970613,-1.240970,-0.312947,-0.848947,2.377953,0.657501,0.213087


In [None]:
# return the first 5 rows.
df.head()

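Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.20593,2.186786,1.004054,0.386186,0.737369,1.490732,-0.935834,1.175829,-1.253881,-0.637752
1,0.907105,-1.428681,-0.140069,-0.861755,-0.255619,-2.798589,-1.771533,-0.699877,0.927462,-0.173636
2,0.002846,0.688223,-0.879536,0.283627,-0.805367,-1.727669,-0.3909,0.573806,0.338589,-0.01183
3,2.392365,0.412912,0.978736,2.238143,-1.294085,-1.038788,1.743712,-0.798063,0.029683,1.069316
4,0.890706,1.754886,1.495644,1.069393,-0.772709,0.794863,0.314272,-1.326265,1.417299,0.807237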


In [None]:
# return the last 5 rows.
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
995,-0.114699,0.745163,-0.635988,0.848133,-0.585542,0.086091,-0.101158,-0.418401,0.490092,0.70343
996,1.397362,-1.4194,2.826198,0.793412,0.29347,0.652308,0.56785,1.47891,-0.400071,-1.667558
997,-1.35078,-0.112624,0.539086,-0.537706,-1.06074,-1.285079,1.071255,1.910933,0.649171,-0.597918
998,-1.397932,-0.472266,0.581964,0.970613,-1.24097,-0.312947,-0.848947,2.377953,0.657501,0.213087
999,-0.49097,-1.08151,0.004801,-0.360797,0.012019,0.929787,-0.177048,-1.728908,-0.265183,0.712684


### Loading data and basic data exploration
This dataset will be used frequently in our classes. The dataset we are using has been simplified from the original.

Original dataset: https://archive.ics.uci.edu/ml/datasets/Steel+Industry+Energy+Consumption+Dataset

more about Pandas IO: https://pandas.pydata.org/docs/user_guide/io.html

In [None]:
path = 'https://raw.githubusercontent.com/i40a/datasets/main/steel_plant/mods/1/Steel_industry_data.csv' 
df = pd.read_csv(path, index_col='Date', parse_dates=True)

df.head()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:15:00,3.17,2.95,0.0,0.0,Monday
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,Monday
2018-01-01 00:45:00,3.24,3.28,0.0,0.0,Monday
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,Monday
2018-01-01 01:15:00,3.82,4.5,0.0,0.0,Monday


In [None]:
df.shape

(35040, 5)

In [None]:
df['CO2'].head()

Date
2018-01-01 00:15:00    0.0
2018-01-01 00:30:00    0.0
2018-01-01 00:45:00    0.0
2018-01-01 01:00:00    0.0
2018-01-01 01:15:00    0.0
Name: CO2, dtype: float64

In [None]:
# check unique values in column
df['CO2'].unique()

array([0.  , 0.02, 0.03, 0.05, 0.06, 0.04, 0.01, 0.07])

In [None]:
# count values in column
df['CO2'].value_counts()

0.00    20990
0.02     4643
0.03     4261
0.04     1834
0.01     1512
0.05     1363
0.06      410
0.07       27
Name: CO2, dtype: int64

In [None]:
# pandas has many built in methods
df['CO2'].max()

0.07

In [None]:
# basic statistics
df.describe()

Unnamed: 0.1,Unnamed: 0,Usage,LagRP,LeadRP,CO2,LagPF,LeadPF
count,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0
mean,17519.5,27.386892,13.035384,3.870949,0.011524,80.578056,84.36787
std,10115.321053,33.44438,16.306,7.424463,0.016151,18.921322,30.456535
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8759.75,3.2,2.3,0.0,0.0,63.32,99.7
50%,17519.5,4.57,5.0,0.0,0.0,87.96,100.0
75%,26279.25,51.2375,22.64,2.09,0.02,99.0225,100.0
max,35039.0,157.18,96.91,27.76,0.07,100.0,100.0


In [None]:
# pairwise correlation of the numeric columns
# 'Day' holds text, so it is excluded explicitly: recent pandas versions raise
# an error instead of silently skipping non-numeric columns
df.select_dtypes(include='number').corr()

Unnamed: 0,Usage,LagRP,LeadRP,CO2
Usage,1.0,0.89615,-0.324922,0.98818
LagRP,0.89615,1.0,-0.405142,0.886948
LeadRP,-0.324922,-0.405142,1.0,-0.332777
CO2,0.98818,0.886948,-0.332777,1.0


In [None]:
# create new column based on operation
df['Usage (MWh)'] = df['Usage'] / 1000
df.head()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Day,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:15:00,3.17,2.95,0.0,0.0,Monday,0.00317
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,Monday,0.004
2018-01-01 00:45:00,3.24,3.28,0.0,0.0,Monday,0.00324
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,Monday,0.00331
2018-01-01 01:15:00,3.82,4.5,0.0,0.0,Monday,0.00382


In [None]:
# select only the rows that satisfy a condition (here: CO2 greater than 0)
index_keep = df['CO2'] > 0
df_clean = df[index_keep]
df_clean.shape

(14050, 6)

In [None]:
# save back to csv
df_clean.to_csv('clean_data.csv')

### Working with time series

In [None]:
df.index

DatetimeIndex(['2018-01-01 00:15:00', '2018-01-01 00:30:00',
               '2018-01-01 00:45:00', '2018-01-01 01:00:00',
               '2018-01-01 01:15:00', '2018-01-01 01:30:00',
               '2018-01-01 01:45:00', '2018-01-01 02:00:00',
               '2018-01-01 02:15:00', '2018-01-01 02:30:00',
               ...
               '2018-12-31 21:45:00', '2018-12-31 22:00:00',
               '2018-12-31 22:15:00', '2018-12-31 22:30:00',
               '2018-12-31 22:45:00', '2018-12-31 23:00:00',
               '2018-12-31 23:15:00', '2018-12-31 23:30:00',
               '2018-12-31 23:45:00', '2019-01-01 00:00:00'],
              dtype='datetime64[ns]', name='Date', length=35040, freq=None)

In [None]:
import datetime as dt # library to work with datetime

# select based on start date
start = dt.datetime(2018, 2, 1)

ind = df.index > start
df[ind].head()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Day,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-01 00:15:00,84.49,42.48,0.0,0.04,Thursday,0.08449
2018-02-01 00:30:00,77.22,42.23,0.0,0.04,Thursday,0.07722
2018-02-01 00:45:00,74.05,38.99,0.0,0.03,Thursday,0.07405
2018-02-01 01:00:00,88.96,46.04,0.0,0.04,Thursday,0.08896
2018-02-01 01:15:00,76.64,41.54,0.0,0.04,Thursday,0.07664


In [None]:
# resample with daily mean
# only numeric columns are aggregated: a mean is undefined for the text column 'Day',
# and recent pandas versions raise an error instead of dropping it silently
df.select_dtypes(include='number').resample("1D").mean().head()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01,3.667789,1.717474,9.989053,0.0,0.003668
2018-01-02,41.137917,16.446667,3.063125,0.0,0.041138
2018-01-03,37.094687,13.388542,2.766562,0.015938,0.037095
2018-01-04,51.850521,21.443438,2.339271,0.022604,0.051851
2018-01-05,48.790625,20.428542,2.689167,0.02125,0.048791


In [None]:
# resampling with monthly sum
# only numeric columns are aggregated, so the text column 'Day' is left out
# NOTE(review): pandas >= 2.2 prefers the alias "ME" (month end) over "M" - confirm target version
df.select_dtypes(include='number').resample("M").sum()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-31,126178.28,54426.49,11675.81,53.96,126.17828
2018-02-28,91552.06,35922.42,11723.7,39.27,91.55206
2018-03-31,80233.0,32020.5,12761.92,33.64,80.233
2018-04-30,78769.37,34737.68,12091.03,33.35,78.76937
2018-05-31,79059.53,38430.2,12764.51,33.51,79.05953
2018-06-30,65404.6,32895.12,12388.1,27.1,65.4046
2018-07-31,81674.49,39674.85,9867.89,35.02,81.67449
2018-08-31,68559.32,38203.5,11109.77,28.44,68.55932
2018-09-30,57883.28,33196.77,9617.35,23.92,57.88328
2018-10-31,84664.54,49595.46,7430.39,35.81,84.66454


In [None]:
# rolling window
# moving average in a one-hour window
# only numeric columns are used: a rolling mean cannot be computed on the text
# column 'Day', and recent pandas versions raise an error instead of dropping it
df_ma = df.select_dtypes(include='number').rolling('1h').mean()
df_ma.head()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:15:00,3.17,2.95,0.0,0.0,0.00317
2018-01-01 00:30:00,3.585,3.705,0.0,0.0,0.003585
2018-01-01 00:45:00,3.47,3.563333,0.0,0.0,0.00347
2018-01-01 01:00:00,3.43,3.5625,0.0,0.0,0.00343
2018-01-01 01:15:00,3.5925,3.95,0.0,0.0,0.003593


In [None]:
import plotly.graph_objects as go

# Create traces
# both traces use line_shape='hv', which draws the line as steps
fig = go.Figure()
# raw 'Usage' values
fig.add_trace(go.Scatter(x=df['Usage'].index, y=df['Usage'].values, mode='lines', name='Raw', line_shape='hv'))
# 'Usage' column of df_ma (the one-hour moving average)
fig.add_trace(go.Scatter(x=df_ma['Usage'].index, y=df_ma['Usage'].values, mode='lines', name='Moving Average', line_shape='hv'))

fig.show()

### Working with missing data

More details: https://pandas.pydata.org/docs/user_guide/missing_data.html

In [None]:
df_short = df.iloc[:10]
df_short

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Day,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:15:00,3.17,2.95,0.0,0.0,Monday,0.00317
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,Monday,0.004
2018-01-01 00:45:00,3.24,3.28,0.0,0.0,Monday,0.00324
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,Monday,0.00331
2018-01-01 01:15:00,3.82,4.5,0.0,0.0,Monday,0.00382
2018-01-01 01:30:00,3.28,3.56,0.0,0.0,Monday,0.00328
2018-01-01 01:45:00,3.6,4.14,0.0,0.0,Monday,0.0036
2018-01-01 02:00:00,3.6,4.28,0.0,0.0,Monday,0.0036
2018-01-01 02:15:00,3.28,3.64,0.0,0.0,Monday,0.00328
2018-01-01 02:30:00,3.78,4.72,0.0,0.0,Monday,0.00378


In [None]:
# resample to a 10-minute grid; bins that contain no reading become NaN rows
# only numeric columns are aggregated: a mean is undefined for the text column 'Day',
# and recent pandas versions raise an error instead of dropping it silently
df_short = df_short.select_dtypes(include='number').resample('10min').mean()
df_short

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:10:00,3.17,2.95,0.0,0.0,0.00317
2018-01-01 00:20:00,,,,,
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,0.004
2018-01-01 00:40:00,3.24,3.28,0.0,0.0,0.00324
2018-01-01 00:50:00,,,,,
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,0.00331
2018-01-01 01:10:00,3.82,4.5,0.0,0.0,0.00382
2018-01-01 01:20:00,,,,,
2018-01-01 01:30:00,3.28,3.56,0.0,0.0,0.00328
2018-01-01 01:40:00,3.6,4.14,0.0,0.0,0.0036


In [None]:
# drop rows with nan values
df_short.dropna()

Unnamed: 0_level_0,Unnamed: 0,Usage,LagRP,LeadRP,CO2,LagPF,LeadPF,Usage (MWh)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01 00:10:00,0.0,3.17,2.95,0.0,0.0,73.21,100.0,0.00317
2018-01-01 00:30:00,1.0,4.0,4.46,0.0,0.0,66.77,100.0,0.004
2018-01-01 00:40:00,2.0,3.24,3.28,0.0,0.0,70.28,100.0,0.00324
2018-01-01 01:00:00,3.0,3.31,3.56,0.0,0.0,68.09,100.0,0.00331
2018-01-01 01:10:00,4.0,3.82,4.5,0.0,0.0,64.72,100.0,0.00382
2018-01-01 01:30:00,5.0,3.28,3.56,0.0,0.0,67.76,100.0,0.00328
2018-01-01 01:40:00,6.0,3.6,4.14,0.0,0.0,65.62,100.0,0.0036
2018-01-01 02:00:00,7.0,3.6,4.28,0.0,0.0,64.37,100.0,0.0036
2018-01-01 02:10:00,8.0,3.28,3.64,0.0,0.0,66.94,100.0,0.00328
2018-01-01 02:30:00,9.0,3.78,4.72,0.0,0.0,62.51,100.0,0.00378


In [None]:
# fill nan by given value
df_short.fillna(value=0)

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:10:00,3.17,2.95,0.0,0.0,0.00317
2018-01-01 00:20:00,0.0,0.0,0.0,0.0,0.0
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,0.004
2018-01-01 00:40:00,3.24,3.28,0.0,0.0,0.00324
2018-01-01 00:50:00,0.0,0.0,0.0,0.0,0.0
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,0.00331
2018-01-01 01:10:00,3.82,4.5,0.0,0.0,0.00382
2018-01-01 01:20:00,0.0,0.0,0.0,0.0,0.0
2018-01-01 01:30:00,3.28,3.56,0.0,0.0,0.00328
2018-01-01 01:40:00,3.6,4.14,0.0,0.0,0.0036


In [None]:
# replace nan by the last valid value (forward fill)
# ffill() is used instead of fillna(method='pad'), which is deprecated in recent pandas releases
df_short.ffill()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:10:00,3.17,2.95,0.0,0.0,0.00317
2018-01-01 00:20:00,3.17,2.95,0.0,0.0,0.00317
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,0.004
2018-01-01 00:40:00,3.24,3.28,0.0,0.0,0.00324
2018-01-01 00:50:00,3.24,3.28,0.0,0.0,0.00324
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,0.00331
2018-01-01 01:10:00,3.82,4.5,0.0,0.0,0.00382
2018-01-01 01:20:00,3.82,4.5,0.0,0.0,0.00382
2018-01-01 01:30:00,3.28,3.56,0.0,0.0,0.00328
2018-01-01 01:40:00,3.6,4.14,0.0,0.0,0.0036


In [None]:
# replace nan by interpolated value
df_short.interpolate()

Unnamed: 0_level_0,Usage,LagRP,LeadRP,CO2,Usage (MWh)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:10:00,3.17,2.95,0.0,0.0,0.00317
2018-01-01 00:20:00,3.585,3.705,0.0,0.0,0.003585
2018-01-01 00:30:00,4.0,4.46,0.0,0.0,0.004
2018-01-01 00:40:00,3.24,3.28,0.0,0.0,0.00324
2018-01-01 00:50:00,3.275,3.42,0.0,0.0,0.003275
2018-01-01 01:00:00,3.31,3.56,0.0,0.0,0.00331
2018-01-01 01:10:00,3.82,4.5,0.0,0.0,0.00382
2018-01-01 01:20:00,3.55,4.03,0.0,0.0,0.00355
2018-01-01 01:30:00,3.28,3.56,0.0,0.0,0.00328
2018-01-01 01:40:00,3.6,4.14,0.0,0.0,0.0036
