In [21]:
from pathlib import Path
import pandas as pd
import numpy as np
import polars as pl
import duckdb
import os
import pandera as pa
import io

# Both experiment folders are siblings under the same processed-tables directory.
_PROCESSED_TABLES_DIR = r"E:\app_data\dropbox_13f_files\processed_tables"

# Folder searched (recursively) for the per-filing CSV files read below.
FROM_EXPERIMENT_CSV_IN = Path(_PROCESSED_TABLES_DIR + "\\TR_02_EXP_SELECT_CIK_CSV")
# Presumably the destination for the Parquet output -- not used in the cells shown here.
TO_EXPERIMENT_PARQUET = Path(_PROCESSED_TABLES_DIR + "\\TR_03_EXP_SELECT_CIK_PARQUET")

In [104]:

# Output columns, in the order every downstream cell expects them.
columns = ['cik', 'cusip8', 'cusip9', 'rdate', 'fdate', 'value', 'shares',
           'address', 'form', 'shrsOrPrnAmt', 'putCall', 'nameOfIssuer', 'titleOfClass', 'type', 'dsource']

# Read-time dtypes handed to polars. Identifier-like fields (cusip*, dates) are
# read as strings so the CSV reader does not reinterpret them as numbers.
dtypes = {'cusip8': str, 'cusip9': str, 'titleOfClass': str, 'form': str,
          'putCall': str, 'shrsOrPrnAmt': str, 'value': float, 'shares': float,
          'nameOfIssuer': str, 'cik': pl.Int64, 'address': str, 'type': str, 'num5': str,
          'deviation': str, 'shrout': str, 'num3': str, 'num2': str, 'num6': str, 'num7': str, 'num4': str,
          'votingAuthority': str, 'in_universe': str, 'prc': str, 'split': str,
          'investmentDiscretion': str, 'rdate': str, 'fdate': str, 'dsource': str}

# Final pandas dtypes applied after the polars -> pandas conversion.
pd_dtypes = {'cusip8': str, 'cusip9': str, 'titleOfClass': str, 'form': 'category', 'putCall': 'category',
             'shrsOrPrnAmt': 'category', 'value': 'Int64', 'shares': 'Int64', 'type': 'category', 'nameOfIssuer': str,
             'cik': 'int64', 'address': 'category', 'dsource': 'category'}

# Glob (relative to FROM_EXPERIMENT_CSV_IN) selecting the filing(s) to load.
FILING_GLOB = "*0000036104-03-000131.csv"


def _load_filing_csv(file):
    """Read one filing CSV into a polars frame laid out as ``columns``.

    Only the columns of ``columns`` that exist in the file are read; the
    remaining ones are added as typed null columns. ``rdate``/``fdate`` are
    parsed from ``%Y%m%d`` strings, the CUSIP columns are upper-cased and
    ``dsource`` is set to the constant ``'dropbox'``.

    Parameters
    ----------
    file : path-like
        CSV file to read.

    Returns
    -------
    pl.DataFrame
        Frame with exactly the columns listed in ``columns``, in that order.
    """
    schema = pl.scan_csv(file).schema
    # Keep the canonical order instead of the arbitrary order of a set intersection.
    read_cols = [col for col in columns if col in schema]
    frame = pl.read_csv(file, columns=read_cols, dtypes=dtypes)

    # Add every expected-but-absent column as a null column of its declared dtype,
    # in a single with_columns call (with_column, singular, is deprecated).
    missing = [col for col in columns if col not in frame.columns]
    if missing:
        frame = frame.with_columns(
            [pl.lit(None, dtype=dtypes[col]).alias(col) for col in missing]
        )

    return frame.with_columns([
        pl.col("rdate").str.strptime(pl.Date, fmt="%Y%m%d"),
        pl.col("fdate").str.strptime(pl.Date, fmt="%Y%m%d"),
        pl.col("cusip8").str.to_uppercase(),
        pl.col("cusip9").str.to_uppercase(),
        pl.lit('dropbox').alias('dsource'),
    ]).select(columns)


# Load every matching file. Previously each iteration overwrote the frame, so
# only the last match survived and an empty match ended in a NameError.
filing_frames = [_load_filing_csv(file)
                 for file in sorted(FROM_EXPERIMENT_CSV_IN.rglob(FILING_GLOB))]
if not filing_frames:
    raise FileNotFoundError(
        f"no file matching {FILING_GLOB!r} under {FROM_EXPERIMENT_CSV_IN}"
    )

# `df` is the pandas frame that the validation cells below operate on.
df = pl.concat(filing_frames).to_pandas().astype(pd_dtypes)


In [97]:
# df.info()

In [111]:
# Show the two rows (index labels 24 and 80) whose CUSIP values appear in the
# validation failure cases further down.
df.loc[df.index.isin([24, 80])].head(7)

# 0     26       1957109
# 1    183     718154107
# 2   1369     297659104
# 3   4145     362320103
# 4   4358      16962105
# 5   4373      67543101
# 6   4383      90078109

Unnamed: 0,cik,cusip8,cusip9,rdate,fdate,value,shares,address,form,shrsOrPrnAmt,putCall,nameOfIssuer,titleOfClass,type,dsource
24,36104,1850000000000,1.8500000000000002e+108,2003-09-30,2003-11-14,8,3000,36104/0000036104-03-000131.txt,13F-HR,,,,,fwf,dropbox
80,36104,7610000000000,7.61e+110,2003-09-30,2003-11-14,1,100,36104/0000036104-03-000131.txt,13F-HR,,,,,fwf,dropbox


In [99]:

# Dtype-only schema: checks that the key columns of `df` carry the pandas
# dtypes assigned by the load cell; no value-level checks.
basic_types_schema = pa.DataFrameSchema({
    "cik": pa.Column('int64'),
    "cusip8": pa.Column(str),
    "cusip9": pa.Column(str),
    "rdate": pa.Column("datetime64"),
    "fdate": pa.Column("datetime64"),
    # `value` is cast to the nullable 'Int64' extension dtype by pd_dtypes,
    # so the schema declares that dtype instead of numpy 'int64'.
    "value": pa.Column("Int64"),
})

In [None]:
# Run the dtype-only schema against the canonical column subset of `df`.
basic_types_schema.validate(df.loc[:, columns])

In [76]:
# Schema requiring the nullable "Int64" dtype for `cik`.
# NOTE(review): pd_dtypes casts `cik` to numpy 'int64', so this schema is
# presumably meant to demonstrate a dtype mismatch -- confirm.
bad_types_schema = pa.DataFrameSchema(
    columns={
        "cik": pa.Column("Int64"),
        "fdate": pa.Column("datetime64"),
    }
)



In [None]:
# Validate the canonical column subset of `df` against the mismatched schema.
bad_types_schema.validate(df.loc[:, columns])

In [118]:
# Pandera also allows validating value ranges for numerical columns
# value_range_schema = pa.DataFrameSchema({
#     "LotArea": pa.Column(int, pa.Check(lambda s: s <= 1000000), nullable=False),
#     "YearBuilt": pa.Column(int, pa.Check.in_range(1800, 2022)),
# })

# Dtype checks plus value-level checks on the CUSIP and value columns.
value_range_schema = pa.DataFrameSchema({
    "cik": pa.Column('int64'),
    # NOTE(review): the bound is 9 although the column is named cusip8 -- confirm.
    "cusip8": pa.Column(str, pa.Check(lambda s: s.str.len() <= 9)),
    # Both cusip9 checks sit on ONE Column: a dict literal keeps only the last
    # value for a repeated key, so a second "cusip9" entry silently drops the first.
    # regex=False matches the characters literally ("+" alone is not a valid regex);
    # na=False keeps the mask boolean if a value is missing.
    # Presumably these are meant to reject CUSIPs that were mangled into
    # scientific notation (e.g. 1.8500000000000002e+108), so a valid value must
    # NOT contain a sign character -- confirm intent.
    "cusip9": pa.Column(str, [
        pa.Check(lambda s: ~s.str.contains("+", regex=False, na=False)),
        pa.Check(lambda s: ~s.str.contains("-", regex=False, na=False)),
    ]),
    "rdate": pa.Column("datetime64"),
    "fdate": pa.Column("datetime64"),
    # `value` is cast to the nullable 'Int64' extension dtype by pd_dtypes.
    "value": pa.Column("Int64", pa.Check(lambda s: s <= 1000000), nullable=False),
})
value_range_schema.validate(df[columns])

SchemaError: <Schema Column(name=cusip8, type=DataType(str))> failed element-wise validator 0:
<Check <lambda>>
failure cases:
    index     failure_case
0      24    1850000000000
1      80    7610000000000
2     149       1.853E+023
3     204   25530000000000
4     212   26860000000000
..    ...              ...
68   4013  903330000000000
69   4073  913590000000000
70   4142  923430000000000
71   4194  929250000000000
72   4263      9.5988E+024

[73 rows x 2 columns]

In [107]:
# Small inline CSV sample: a header row followed by three data rows, with a
# leading and a trailing newline.
txt = "\n".join([
    "",
    "AB,CD,EF, JJ",
    "foo,20160101,a,23",
    "foo,20160102,a,34",
    "foo,20160103,a,56",
    "",
])
# Bare lambda expression: it is never called, it is only the cell's displayed value.
lambda txt: len(txt) <= 10
# Unfinished scratch work -- parsing the sample with polars:
# data = pl.read_csv(io.StringIO(txt))
# data = data.

<function __main__.<lambda>(txt)>

In [52]:
# Alternative pandas dtype mapping, keyed by column name.
# NOTE(review): this rebinds `dtypes`, the name the CSV-loading cell uses for its
# polars read-time dtypes -- re-running that cell after this one would pick up
# this mapping instead.
dtypes = dict(
    ID=str,
    accessionNumber=str,
    cikManager="Int64",
    periodOfReport="datetime64[ns]",
    report_Quarter="Int64",
    report_Year="Int64",
    submissionType=str,
    isAmendment=bool,
    amendmentType=str,
    filedAsOfDate="datetime64[ns]",
    entryTotal="Int64",
    valueTotal="float64",
    cusip=str,
    nameOfIssuer=str,
    titleOfClass=str,
    sharesValue="float64",
    sharesHeldAtEndOfQtr="Int64",
    securityType=str,
    putCall=str,
    xml_flag=str,
    created_at="datetime64[ns]",
    edgar_path=str,
)