# Summary 

This notebook explores summary statistics for a subset (the top 1000+ companies) of the consolidated FMP daily prices dataset.

# Imports

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

In [2]:
# Directory holding the processed FMP files. Defaults to the original local
# location; set the FMP_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
INPUT_PATH = os.environ.get(
    "FMP_DATA_DIR",
    "C:/Users/mushj/Downloads/PROCESSED FINANCE DATA/FMP",
)
# Outputs are written next to the inputs.
OUTPUT_PATH = INPUT_PATH

In [3]:
%%time
# Load the top-1k daily price file that lives under INPUT_PATH
df = pd.read_csv(f"{INPUT_PATH}/FMP_daily_prices_top1k.csv")

CPU times: total: 1.02 s
Wall time: 1.04 s


# Dataset-level summary

In [4]:
# Preview the first five rows of the loaded prices
df.head(n=5)

Unnamed: 0,symbol,date,close,volume
0,A,2005-01-03,16.09,3587208.0
1,A,2005-01-04,15.66,3978002.0
2,A,2005-01-05,15.66,4139634.0
3,A,2005-01-06,15.31,3353443.0
4,A,2005-01-07,15.3,2786175.0


In [5]:
%%time
# Distinct tickers and distinct calendar dates present in the file
print("Number of symbols:", df['symbol'].nunique())
print("Number of dates:", df['date'].nunique())

Number of symbols: 1010
Number of dates: 5033
CPU times: total: 188 ms
Wall time: 242 ms


In [6]:
%%time
# Fraction of missing values in each column (0.0 means fully populated)
df.isna().mean(axis=0)

CPU times: total: 141 ms
Wall time: 176 ms


symbol    0.0
date      0.0
close     0.0
volume    0.0
dtype: float64

# Symbol-level metrics

In [7]:
%%time
# Per-symbol fraction of missing values in every non-key column:
# flag missing cells first, then average the flags within each symbol.
na_summary = df.set_index('symbol').isna().groupby(level='symbol').mean()

CPU times: total: 453 ms
Wall time: 561 ms


In [8]:
# Keep only the symbols where at least one column has a non-zero missing share
na_summary[na_summary[['date', 'close', 'volume']].gt(0).any(axis=1)]

Unnamed: 0_level_0,date,close,volume
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [9]:
%%time
# Date range and row count per symbol.
# min/max are used instead of positional first/last so the result does not
# depend on the rows being sorted by date within each symbol; the comparison
# is lexicographic, which orders ISO 'YYYY-MM-DD' strings chronologically.
# 'size' counts every row of the group, exactly like len().
len_summary = df.groupby('symbol')['date'].agg(
    start='min',
    end='max',
    len='size',
)

CPU times: total: 312 ms
Wall time: 318 ms


In [10]:
# Spread of first-observation ('start') and last-observation ('end') dates
# across symbols. Each label is paired with the matching column/aggregate:
# start -> min/max of 'start', end -> min/max of 'end'.
print("Earliest start date:", len_summary['start'].min())
print("Latest start date:", len_summary['start'].max())
print("Earliest end date:", len_summary['end'].min())
print("Latest end date:", len_summary['end'].max())

Earliest start date: 2005-01-03
Latest start date: 2024-10-24
Earliest end date: 2024-12-27
Latest end date: 2024-12-31


In [17]:
# Deciles (0% through 100%) of the number of rows available per symbol
len_summary['len'].describe(percentiles=np.linspace(0, 1, 11))

count    1010.000000
mean     4088.551485
std      1487.501763
min        47.000000
0%         47.000000
10%      1368.500000
20%      2588.200000
30%      3827.700000
40%      5033.000000
50%      5033.000000
60%      5033.000000
70%      5033.000000
80%      5033.000000
90%      5033.000000
100%     5033.000000
max      5033.000000
Name: (date, len), dtype: float64

In [26]:
# Number of business days (freq 'B': Mon-Fri, holidays are not removed)
# between each symbol's first and last date, both endpoints included.
len_summary['span'] = [
    len(pd.bdate_range(first_day, last_day))
    for first_day, last_day in zip(len_summary['start'], len_summary['end'])
]

# Share of those business days for which the symbol actually has a row
len_summary['%'] = len_summary['len'].div(len_summary['span'])

In [29]:
# Deciles (0% through 100%) of the per-symbol data-availability ratio
len_summary['%'].describe(percentiles=np.linspace(0, 1, 11))

count    1010.000000
mean        0.962133
std         0.038505
min         0.039469
0%          0.039469
10%         0.964244
20%         0.964529
30%         0.964731
40%         0.964731
50%         0.964731
60%         0.964731
70%         0.964731
80%         0.964731
90%         0.964731
100%        0.973684
max         0.973684
Name: %, dtype: float64

In [30]:
# Zoom in on the bottom of the availability ratio: percentiles 0% to 9% in 1% steps
len_summary.loc[:, '%'].describe(percentiles=np.arange(0, 0.1, 0.01))

count    1010.000000
mean        0.962133
std         0.038505
min         0.039469
0%          0.039469
1%          0.962453
2%          0.962904
3%          0.963411
4%          0.963675
5%          0.963866
6%          0.963999
7%          0.964095
8%          0.964164
9%          0.964195
50%         0.964731
max         0.973684
Name: %, dtype: float64

In [42]:
# Symbols whose availability ratio is at least 0.96, i.e. everything except
# the sparse lower tail seen in the percentile tables.
keep_list = len_summary.index[len_summary['%'] >= 0.96].tolist()

# Persist the retained symbols as a pickle next to the input data
with open(f"{OUTPUT_PATH}/top1k_subset", 'wb') as sink:
    pickle.dump(keep_list, sink)