## 3.1 値の演算

In [1]:
import pandas as pd
import numpy as np

def load_3_1(path='sample_3_1.csv'):
    """Read the section 3.1 sample data into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'sample_3_1.csv'
        Location of the CSV file. The default keeps existing ``load_3_1()``
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame whose row index is the first CSV column and whose ``val1`` and
        ``val2`` columns are read as float64.
    """
    return pd.read_csv(
        path,
        index_col=0,  # the first CSV column becomes the row index
        dtype={'val1': 'float64', 'val2': 'float64'},
    )

# Load the sample data and show it as the cell's output.
df = load_3_1()
df

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,148.083426,481.382983
2010-01-02,117.490787,466.259767
2010-01-03,147.009196,474.221712
2010-01-04,150.086981,493.239400
2010-01-05,162.703330,491.120699
...,...,...
2020-12-27,1029.031359,3521.016894
2020-12-28,1026.100656,3593.560824
2020-12-29,1002.146012,3424.869380
2020-12-30,1014.642041,3445.015116


In [2]:
# Augmented assignment on single cells selected with .iat[row, column].
# Column position 0 (val1):
df.iat[0, 0] += 2.0   # addition
df.iat[1, 0] -= 2.0   # subtraction
df.iat[2, 0] *= 2.0   # multiplication
df.iat[3, 0] /= 2.0   # true division

# Column position 1 (val2):
df.iat[0, 1] %= 2.0   # remainder
df.iat[1, 1] //= 2.0  # floor division
df.iat[2, 1] **= 2.0  # exponentiation

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,150.083426,1.382983
2010-01-02,115.490787,233.0
2010-01-03,294.018392,224886.232291
2010-01-04,75.043491,493.2394
2010-01-05,162.70333,491.120699


In [3]:
# Augmented assignment also applies element-wise to a whole column ...
df['val1'] *= 1000
# ... or to a positional slice: rows 1-3 of the column at position 1.
df.iloc[1:4, 1] /= 1000

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,150083.425892,1.382983
2010-01-02,115490.786953,0.233
2010-01-03,294018.392446,224.886232
2010-01-04,75043.490626,0.493239
2010-01-05,162703.330189,491.120699


In [4]:
# Reload the CSV so the following timing cell starts from unmodified values.
df = load_3_1()

In [5]:
%%timeit

# Row-by-row baseline: divide the column at position 0 one element at a time.
# %%timeit executes this body repeatedly, and every execution divides the
# same frame in place again.
for i in range(len(df)):
    df.iloc[i, 0] /= 1000

288 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Reload the CSV: the timed loop divided the first column in place.
df = load_3_1()

In [7]:
%%timeit

# Vectorized form: divide the whole column at position 0 in one operation.
# %%timeit repeats this in-place division on the same frame many times, so
# the column's values are not meaningful after this cell.
df.iloc[:, 0] /= 1000

385 µs ± 22.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
# NOTE: df is not reloaded after the preceding %%timeit cell, which executed
# `df.iloc[:, 0] /= 1000` thousands of times on this same frame. val1 has
# therefore already underflowed to 0.0 in every row before the assignments
# below run; only the val2 zeros (and nominally val1 in row 0) come from here.

# Assign a scalar to a positional slice (rows 1-3 of column position 1) ...
df.iloc[1:4, 1] = 0
# ... and to a single cell (row 0, column position 0).
df.iat[0, 0] = 0

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,0.0,481.382983
2010-01-02,0.0,0.0
2010-01-03,0.0,0.0
2010-01-04,0.0,0.0
2010-01-05,0.0,491.120699


In [9]:
# Overwrite rows 1-3 across both columns with a 3x2 block of new values.
new_rows = np.array([[1, 2], [3, 4], [5, 6]])
df.iloc[1:4, :] = new_rows

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,0.0,481.382983
2010-01-02,1.0,2.0
2010-01-03,3.0,4.0
2010-01-04,5.0,6.0
2010-01-05,0.0,491.120699


In [10]:
df = load_3_1()
# Comparing a column with a scalar yields a boolean Series on the same index.
boolean_mask = df['val1'] < 150

boolean_mask

date
2010-01-01     True
2010-01-02     True
2010-01-03     True
2010-01-04    False
2010-01-05    False
              ...  
2020-12-27    False
2020-12-28    False
2020-12-29    False
2020-12-30    False
2020-12-31    False
Name: val1, Length: 4018, dtype: bool

In [11]:
# mask(): rows where the condition is True are replaced by `other` in every
# column; all remaining rows are kept as they are.
df_mask = df.mask(boolean_mask, other=-1)

df_mask

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,-1.000000,-1.000000
2010-01-02,-1.000000,-1.000000
2010-01-03,-1.000000,-1.000000
2010-01-04,150.086981,493.239400
2010-01-05,162.703330,491.120699
...,...,...
2020-12-27,1029.031359,3521.016894
2020-12-28,1026.100656,3593.560824
2020-12-29,1002.146012,3424.869380
2020-12-30,1014.642041,3445.015116


In [12]:
# Without a replacement value, the masked rows become missing values (NaN).
df_mask = df.mask(boolean_mask)

df_mask

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,,
2010-01-02,,
2010-01-03,,
2010-01-04,150.086981,493.239400
2010-01-05,162.703330,491.120699
...,...,...
2020-12-27,1029.031359,3521.016894
2020-12-28,1026.100656,3593.560824
2020-12-29,1002.146012,3424.869380
2020-12-30,1014.642041,3445.015116


In [13]:
# where() is the inverse of mask(): rows where the condition is True are kept
# and all other rows are replaced. Same as df.mask(~boolean_mask, -1).
df_where = df.where(boolean_mask, other=-1)

df_where

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,148.083426,481.382983
2010-01-02,117.490787,466.259767
2010-01-03,147.009196,474.221712
2010-01-04,-1.000000,-1.000000
2010-01-05,-1.000000,-1.000000
...,...,...
2020-12-27,-1.000000,-1.000000
2020-12-28,-1.000000,-1.000000
2020-12-29,-1.000000,-1.000000
2020-12-30,-1.000000,-1.000000


In [14]:
df = load_3_1()

# New column built from the element-wise product of two existing columns.
df['feat1'] = df['val1'] * df['val2']

df.head()

Unnamed: 0_level_0,val1,val2,feat1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,148.083426,481.382983,71284.841343
2010-01-02,117.490787,466.259767,54781.226971
2010-01-03,147.009196,474.221712,69714.952737
2010-01-04,150.086981,493.2394,74028.812646
2010-01-05,162.70333,491.120699,79906.973278


In [15]:
# NumPy functions apply element-wise to a column: natural log of val2.
df['feat2'] = np.log(df['val2'])

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663
2010-01-02,117.490787,466.259767,54781.226971,6.144743
2010-01-03,147.009196,474.221712,69714.952737,6.161675
2010-01-04,150.086981,493.2394,74028.812646,6.200995
2010-01-05,162.70333,491.120699,79906.973278,6.19669


In [16]:
# Lagged copies of val1: each row receives the value from 1 (feat3) or
# 2 (feat4) rows earlier; leading rows without a predecessor are NaN.
df['feat3'] = df['val1'].shift(periods=1)
df['feat4'] = df['val1'].shift(periods=2)

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2,feat3,feat4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663,,
2010-01-02,117.490787,466.259767,54781.226971,6.144743,148.083426,
2010-01-03,147.009196,474.221712,69714.952737,6.161675,117.490787,148.083426
2010-01-04,150.086981,493.2394,74028.812646,6.200995,147.009196,117.490787
2010-01-05,162.70333,491.120699,79906.973278,6.19669,150.086981,147.009196


In [17]:
# Row-to-row change in val1. diff() is equivalent to
# df['val1'] - df['val1'].shift(1), so the first row is NaN.
df['feat5'] = df['val1'].diff()

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2,feat3,feat4,feat5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663,,,
2010-01-02,117.490787,466.259767,54781.226971,6.144743,148.083426,,-30.592639
2010-01-03,147.009196,474.221712,69714.952737,6.161675,117.490787,148.083426,29.518409
2010-01-04,150.086981,493.2394,74028.812646,6.200995,147.009196,117.490787,3.077785
2010-01-05,162.70333,491.120699,79906.973278,6.19669,150.086981,147.009196,12.616349


In [18]:
# Running total of val2, accumulated from the first row downwards.
df['feat6'] = df['val2'].cumsum()

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2,feat3,feat4,feat5,feat6
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663,,,,481.382983
2010-01-02,117.490787,466.259767,54781.226971,6.144743,148.083426,,-30.592639,947.642751
2010-01-03,147.009196,474.221712,69714.952737,6.161675,117.490787,148.083426,29.518409,1421.864463
2010-01-04,150.086981,493.2394,74028.812646,6.200995,147.009196,117.490787,3.077785,1915.103863
2010-01-05,162.70333,491.120699,79906.973278,6.19669,150.086981,147.009196,12.616349,2406.224562


In [19]:
df = load_3_1()

# Trailing 3-row moving average: the window ends at the current row, so the
# first two rows are NaN.
df['smooth_val1'] = df['val1'].rolling(window=3).mean()
# Centered 3-row moving average: the window covers one row before and one
# row after the current row, so the first row is NaN.
df['smooth_val2'] = df['val2'].rolling(window=3, center=True).mean()

df

Unnamed: 0_level_0,val1,val2,smooth_val1,smooth_val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,148.083426,481.382983,,
2010-01-02,117.490787,466.259767,,473.954821
2010-01-03,147.009196,474.221712,137.527803,477.906960
2010-01-04,150.086981,493.239400,138.195655,486.193937
2010-01-05,162.703330,491.120699,153.266503,491.101692
...,...,...,...,...
2020-12-27,1029.031359,3521.016894,1011.627288,3483.898095
2020-12-28,1026.100656,3593.560824,1019.780682,3513.149032
2020-12-29,1002.146012,3424.869380,1019.092676,3487.815107
2020-12-30,1014.642041,3445.015116,1014.296236,3454.476891


In [20]:
df = load_3_1()

# Column-wise summary statistics; each name becomes one row of the result.
summary_stats = ['sum', 'mean', 'median', 'min', 'max']
df_agg = df.agg(summary_stats)

df_agg

Unnamed: 0,val1,val2
sum,2310718.0,5013009.0
mean,575.0916,1247.638
median,573.4696,1052.429
min,38.6814,329.6911
max,1114.738,3593.561
