# Summary 

This notebook explores the summary statistics of the subset (top 1000+ companies) of the consolidated FMP daily prices dataset.

# Imports

In [1]:
import numpy as np
import pandas as pd
import pickle

from datetime import datetime

In [2]:
INPUT_PATH = "C:/Users/mushj/Downloads/PROCESSED FINANCE DATA/FMP"
OUTPUT_PATH = INPUT_PATH

In [3]:
%%time
df = pd.read_csv(INPUT_PATH+'/FMP_daily_prices_top1k.csv')

CPU times: total: 1.05 s
Wall time: 1.06 s


# Summary

In [4]:
df.head()

Unnamed: 0,symbol,date,close,volume
0,A,2005-01-03,16.09,3587208.0
1,A,2005-01-04,15.66,3978002.0
2,A,2005-01-05,15.66,4139634.0
3,A,2005-01-06,15.31,3353443.0
4,A,2005-01-07,15.3,2786175.0


In [5]:
%%time
print("Number of symbols:", df.symbol.nunique())
print("Number of dates:", df.date.nunique())

Number of symbols: 1010
Number of dates: 5033
CPU times: total: 234 ms
Wall time: 246 ms


In [6]:
%%time
# summary of NAs
df.isna().mean()

CPU times: total: 156 ms
Wall time: 179 ms


symbol    0.0
date      0.0
close     0.0
volume    0.0
dtype: float64

# Symbol-level metrics

In [7]:
%%time
# NAs by symbol
na_summary = df.groupby('symbol').agg(lambda x: x.isna().mean())

CPU times: total: 562 ms
Wall time: 589 ms


In [8]:
# show symbols with NAs
na_summary.query("(date > 0) | (close > 0) | (volume > 0)")

Unnamed: 0_level_0,date,close,volume
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [9]:
%%time
# compute date range and counts by symbol
len_summary = (
    df.groupby('symbol')
    .agg({'date': [lambda x: x.iloc[0], lambda x: x.iloc[-1], len]})
)
len_summary.columns = ['start', 'end', 'len']

CPU times: total: 344 ms
Wall time: 358 ms


In [10]:
print("Earliest start date:", len_summary['start'].min())
print("Latest start date:", len_summary['end'].max())
print("Earliest end date:", len_summary['start'].min())
print("Latest end date:", len_summary['end'].max())

Earliest start date: 2005-01-03
Latest start date: 2024-12-31
Earliest end date: 2005-01-03
Latest end date: 2024-12-31


In [11]:
len_summary.head(10)

Unnamed: 0_level_0,start,end,len
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2005-01-03,2024-12-31,5033
AA,2005-01-03,2024-12-31,5033
AAL,2005-09-27,2024-12-31,4848
AAON,2005-01-03,2024-12-31,5033
AAP,2005-01-03,2024-12-31,5033
AAPL,2005-01-03,2024-12-31,5033
ABBV,2012-12-10,2024-12-31,3035
ABNB,2020-12-10,2024-12-31,1020
ABT,2005-01-03,2024-12-31,5033
ACGL,2005-01-03,2024-12-31,5033


## Validations

In [12]:
# check that all dates are business days
dates = pd.date_range("2005-01-03", "2024-12-31", freq='B')
dates = [datetime.strftime(i, format='%Y-%m-%d') for i in dates]
df.query("date not in @dates").shape[0] == 0

True

In [13]:
# check for excluded business days in a sample symbol
# holidays, non-trading days etc.
aapl_dates = set(df.query("symbol == 'AAPL'").date)
diff = set(dates).difference(aapl_dates)
diff_md = [i[5:] for i in diff] # get month-day values
pd.DataFrame(np.unique(diff_md, return_counts=True)).T

Unnamed: 0,0,1
0,01-01,13
1,01-02,5
2,01-15,3
3,01-16,4
4,01-17,3
...,...,...
57,11-28,3
58,12-05,1
59,12-24,2
60,12-25,14


## Number of rows (length)

In [14]:
# distribution of symbol data length
len_summary['len'].describe(percentiles=np.arange(0,1.1,0.1))

count    1010.000000
mean     4088.551485
std      1487.501763
min        47.000000
0%         47.000000
10%      1368.500000
20%      2588.200000
30%      3827.700000
40%      5033.000000
50%      5033.000000
60%      5033.000000
70%      5033.000000
80%      5033.000000
90%      5033.000000
100%     5033.000000
max      5033.000000
Name: len, dtype: float64

In [15]:
# lower-tail distribution of symbol data length
len_summary['len'].describe(percentiles=np.arange(0,0.1,0.01))

count    1010.000000
mean     4088.551485
std      1487.501763
min        47.000000
0%         47.000000
1%        200.070000
2%        364.040000
3%        791.430000
4%        867.000000
5%        931.000000
6%       1018.540000
7%       1072.520000
8%       1135.160000
9%       1231.730000
50%      5033.000000
max      5033.000000
Name: len, dtype: float64

## Business days in date range (span)

In [16]:
%%time
# compute number of business days in date range of each symbol (span)
len_summary['span'] = len_summary.apply(lambda x: len(pd.date_range(x['start'], x['end'], freq='B')), axis=1)

# percentage of available data of total business days
len_summary['%'] = len_summary['len'] / len_summary['span']

CPU times: total: 1min 17s
Wall time: 1min 18s


In [17]:
# distribution of symbol span
len_summary['span'].describe(percentiles=np.arange(0,1.1,0.1))

count    1010.000000
mean     4247.683168
std      1534.920065
min        49.000000
0%         49.000000
10%      1449.400000
20%      2683.200000
30%      4017.800000
40%      5217.000000
50%      5217.000000
60%      5217.000000
70%      5217.000000
80%      5217.000000
90%      5217.000000
100%     5217.000000
max      5217.000000
Name: span, dtype: float64

In [18]:
# lower-tail distribution of symbol span
len_summary['span'].describe(percentiles=np.arange(0,0.1,0.01))

count    1010.000000
mean     4247.683168
std      1534.920065
min        49.000000
0%         49.000000
1%        229.900000
2%        401.540000
3%        829.270000
4%        899.360000
5%        966.800000
6%       1059.540000
7%       1116.890000
8%       1184.880000
9%       1317.860000
50%      5217.000000
max      5217.000000
Name: span, dtype: float64

In [19]:
# distribution of symbol % of available data (of all business days)
len_summary['%'].describe(percentiles=np.arange(0,1.1,0.1))

count    1010.000000
mean        0.962133
std         0.038505
min         0.039469
0%          0.039469
10%         0.964244
20%         0.964529
30%         0.964731
40%         0.964731
50%         0.964731
60%         0.964731
70%         0.964731
80%         0.964731
90%         0.964731
100%        0.973684
max         0.973684
Name: %, dtype: float64

In [20]:
# lower-tail distribution of symbol % available data
len_summary['%'].describe(percentiles=np.arange(0,0.1,0.01))

count    1010.000000
mean        0.962133
std         0.038505
min         0.039469
0%          0.039469
1%          0.962453
2%          0.962904
3%          0.963411
4%          0.963675
5%          0.963866
6%          0.963999
7%          0.964095
8%          0.964164
9%          0.964195
50%         0.964731
max         0.973684
Name: %, dtype: float64

# Remove low-data symbols

In [21]:
# get list of symbols that excludes bottom percentiles of availability (this ensures continuity in daily data)
# keep symbols with at least 365 days of data that spans at least two years
len_summary2 = (
    len_summary
    .query('(`%` >= 0.96) & (len >= 365) & (span >= 730)')
)
keep_list = list(len_summary2.index)

# store list in local storage
with open(OUTPUT_PATH+'/top1k_subset', 'wb') as f:
    pickle.dump(keep_list, f)

In [22]:
len_summary2

Unnamed: 0_level_0,start,end,len,span,%
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2005-01-03,2024-12-31,5033,5217,0.964731
AA,2005-01-03,2024-12-31,5033,5217,0.964731
AAL,2005-09-27,2024-12-31,4848,5026,0.964584
AAON,2005-01-03,2024-12-31,5033,5217,0.964731
AAP,2005-01-03,2024-12-31,5033,5217,0.964731
...,...,...,...,...,...
ZI,2020-06-04,2024-12-31,1152,1194,0.964824
ZION,2005-01-03,2024-12-31,5033,5217,0.964731
ZM,2019-04-18,2024-12-31,1436,1489,0.964406
ZS,2018-03-16,2024-12-31,1710,1773,0.964467


In [23]:
print("Earliest start date:", len_summary2['start'].min())
print("Latest start date:", len_summary2['end'].max())
print("Earliest end date:", len_summary2['start'].min())
print("Latest end date:", len_summary2['end'].max())

Earliest start date: 2005-01-03
Latest start date: 2024-12-31
Earliest end date: 2005-01-03
Latest end date: 2024-12-31


In [24]:
# distribution of symbol data length
len_summary2['len'].describe(percentiles=np.arange(0,1.1,0.1))

count     978.000000
mean     4198.877301
std      1364.299266
min       742.000000
0%        742.000000
10%      1675.500000
20%      2823.800000
30%      4418.900000
40%      5033.000000
50%      5033.000000
60%      5033.000000
70%      5033.000000
80%      5033.000000
90%      5033.000000
100%     5033.000000
max      5033.000000
Name: len, dtype: float64

In [25]:
# distribution of symbol span
len_summary2['span'].describe(percentiles=np.arange(0,1.1,0.1))

count     978.000000
mean     4352.625767
std      1413.886614
min       770.000000
0%        770.000000
10%      1737.500000
20%      2927.800000
30%      4579.900000
40%      5217.000000
50%      5217.000000
60%      5217.000000
70%      5217.000000
80%      5217.000000
90%      5217.000000
100%     5217.000000
max      5217.000000
Name: span, dtype: float64