# FMP daily prices: summary statistics

This notebook explores the summary statistics of the entire consolidated FMP daily prices dataset.

# Imports

In [1]:
import os

import pandas as pd

In [2]:
# Location of the consolidated FMP daily prices CSV.
# Set the FMP_DAILY_PRICES_PATH environment variable to point the notebook at a
# copy of the file on another machine; when it is unset the original local path
# is used, so existing runs behave exactly as before.
INPUT_PATH = os.environ.get(
    "FMP_DAILY_PRICES_PATH",
    r"C:\Users\mushj\Downloads\PROCESSED FINANCE DATA\FMP\FMP_daily_prices.csv",
)

In [3]:
%%time
df = pd.read_csv(INPUT_PATH)

CPU times: total: 1min 20s
Wall time: 2min 13s


# Dataset-level summary

In [4]:
# preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,symbol,date,close,volume
0,.ALPHAUSD,2022-03-16,170.57726,75733.0
1,.ALPHAUSD,2022-03-17,174.45331,8536.0
2,.ALPHAUSD,2022-03-18,157.09172,12569.0
3,.ALPHAUSD,2022-03-21,152.50084,1414.0
4,.ALPHAUSD,2022-03-22,154.08307,15990.0


In [5]:
%%time
print("Number of symbols:", df.symbol.nunique())
print("Number of dates:", df.date.nunique())

Number of symbols: 89366
Number of dates: 5364
CPU times: total: 15.7 s
Wall time: 18.5 s


In [6]:
%%time
# summary of NAs
df.isna().mean()

CPU times: total: 13.3 s
Wall time: 21 s


symbol    2.591257e-06
date      0.000000e+00
close     0.000000e+00
volume    2.745152e-07
dtype: float64

# Symbol-level metrics

In [8]:
%%time
# NAs by symbol
na_summary = df.groupby('symbol').agg(lambda x: x.isna().mean())

CPU times: total: 57.3 s
Wall time: 1min 38s


In [10]:
# keep only the symbols where at least one column has a non-zero NA share
has_missing = (na_summary[["date", "close", "volume"]] > 0).any(axis=1)
na_summary[has_missing]

Unnamed: 0_level_0,date,close,volume
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AD.MI,0.0,0.0,0.019242


In [15]:
%%time
# compute date range and counts by symbol
len_summary = (
    df.groupby('symbol')
    .agg({'date': [lambda x: x.iloc[0], lambda x: x.iloc[-1], len]})
)

CPU times: total: 34.5 s
Wall time: 54 s


In [19]:
# range of the per-symbol start dates (column 0) and end dates (column 1)
start_dates = len_summary.iloc[:, 0]
end_dates = len_summary.iloc[:, 1]
print("Earliest start date:", start_dates.min())
print("Latest start date:", start_dates.max())
print("Earliest end date:", end_dates.min())
print("Latest end date:", end_dates.max())

Earliest start date: 2005-01-03
Latest start date: 2024-12-26
Earliest end date: 2005-09-09
Latest end date: 2024-12-31


In [21]:
# distribution of the number of rows available per symbol
rows_per_symbol = len_summary[('date', 'len')]
rows_per_symbol.describe()

count    89366.000000
mean      2690.320905
std       1764.895878
min          1.000000
25%        978.000000
50%       2535.000000
75%       4615.000000
max       5217.000000
Name: (date, len), dtype: float64