In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [3]:
pg_df = pd.DataFrame({'report_date':['2021-01-02','','2021-01-01', '2021-03-04'],
                      'dropoff_site': ['MRF', 'MRF', 'MRF','Organics'],
                      'load_weight': [1,3,2,6],
                      'load_type': ['Cheese', 'Mac', 'Green', 'Wholesale']})

In [4]:
# mean, median, mode calculate only rows of each group.
pg_df.loc[pg_df['dropoff_site'] == 'MRF',['dropoff_site','load_weight']].groupby(['dropoff_site']).agg(["mean", "median", ("mode", lambda x: x.mode().tolist())])

Unnamed: 0_level_0,load_weight,load_weight,load_weight
Unnamed: 0_level_1,mean,median,mode
dropoff_site,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
MRF,2.0,2.0,"[1, 2, 3]"


In [5]:
# groupby auto sort the column.
pg_df[['report_date','load_weight']].groupby(['report_date']).sum()

Unnamed: 0_level_0,load_weight
report_date,Unnamed: 1_level_1
,3
2021-01-01,2
2021-01-02,1
2021-03-04,6


In [6]:
pg_df.loc[pg_df['dropoff_site'] == 'MRF','load_weight']

0    1
1    3
2    2
Name: load_weight, dtype: int64

In [7]:
# What is quantile?
pg_df.loc[pg_df['dropoff_site'] == 'MRF','load_weight'].quantile([.25,.50,.75])

0.25    1.5
0.50    2.0
0.75    2.5
Name: load_weight, dtype: float64

In [8]:
# drop row with specific condition
pg_df.drop(pg_df.loc[(pg_df['dropoff_site'] == 'MRF') 
                     & (pg_df['load_type'] == 'Mac') 
                     & ((pg_df['load_weight'] > 3) | (pg_df['load_weight'] < 2))
                    ].index)

Unnamed: 0,report_date,dropoff_site,load_weight,load_type
0,2021-01-02,MRF,1,Cheese
1,,MRF,3,Mac
2,2021-01-01,MRF,2,Green
3,2021-03-04,Organics,6,Wholesale


In [9]:
# conditional
x = -0.6

if (x > 0.5) | (x < -0.5):
    print("skewed")

skewed


In [10]:
from enum import Enum
from typing import Literal

class STRATEGY(Enum):
    STANDARD_DEVIATION = 0
    IQR = 1

def getLowerUpperLimit(dropoff_site: str, load_type: str, strategy: STRATEGY) -> tuple[float, float]:
    if (strategy == STRATEGY.IQR):
        print("IQR")
    else: ...
    print("ok")
    return 10, 10

lower_limit, upper_limit = getLowerUpperLimit('MRF', 'Organics', STRATEGY.IQR)
lower_limit, upper_limit

IQR
ok


(10, 10)

In [11]:
pd.DataFrame([1,4,3,np.nan]).skew()

0   -0.93522
dtype: float64

# confidence interval

In [12]:
pg_df = pd.DataFrame([10,9,9,10,11,9,8])

In [13]:
pg_df.agg(['mean', 'std', 'var'])

Unnamed: 0,0
mean,9.428571
std,0.9759
var,0.952381


H0: μ <= 9.0

H1: μ > 9.0

Since our p-value is less than 0.05, so we can reject the null hypothesis and we can conclude that, the mean population is higher than 9.0

In [14]:
statistic, pvalue = stats.ttest_1samp(a = pg_df[0], popmean=9.0, alternative="greater")
statistic, pvalue

(1.1618950038622258, 0.14470161242339485)

In [15]:
# 2021-01-05 11:00:00+00:00
pg_df = pd.DataFrame({'load_time': ['2021-01-01 11:00:00+00:00','2021-01-02 11:00:00+00:00','2021-01-03 12:00:00+00:00']})

pg_df['load_time'] = pd.to_datetime(pg_df['load_time'])

pg_df['previous_load_time'] = pg_df['load_time'].shift()
pg_df['between_load_time'] = pg_df['load_time'] - pg_df['previous_load_time']
pg_df['between_load_time'].mean()

Timedelta('1 days 00:30:00')

In [16]:
pg_df = pd.Series([1,1,1,100])
mean, std = pg_df.agg(['mean', 'std'])
print(mean - 3 * std, mean + 3 * std)

q1, q3 = pg_df.quantile([.25,.75])
iqr = q3 - q1
print(q1 - iqr * 1.5, q3 + iqr * 1.5)

-122.75 174.25
-36.125 62.875


In [108]:
pg_df = pd.DataFrame({'report_date': ['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04'],
                      'load_weight': [2,3,4,1]})
# w = f(t)
# v = dw = f'(t)
#     --
#     dt
y = pg_df.loc[:,['report_date', 'load_weight']].groupby('report_date').sum()

y['cumulative_load_weight'] = y['load_weight'].cumsum()

dx = [0]

for i in range(1, len(pg_df.index)):
    dx.append(dx[i-1] + i)
    
v = np.diff(y['cumulative_load_weight']) / np.diff(dx)

w_besok = y.iloc[-1]['load_weight'] + v[-1] * 1
print("whari ini: {0:,.2f}, wbesok: {1:,.2f}".format(y.iloc[-1]['load_weight'], w_besok))

whari ini: 1.00, wbesok: 1.33
