In [1]:
# Initialise findspark first; the pyspark import in the next cell comes after this call.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Get the active SparkSession (or start one) for the application named 'abc'.
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

### Read in the merged dataset

In [4]:
# Load the merged annual-statement dataset from its Parquet location on S3.
df = spark.read.parquet('s3://sec-finc/annual_statement_data_v1/')

In [5]:
# Preview the first 10 rows to check the columns and sample values.
df.show(10)

+-------+--------------------+------------+--------------------+-----------+--------------------------+------------+--------------------+--------------------+----------+----------+------------+------------+-------------+--------+----+---------------------+--------------+--------------------------+------------+-------------+------------+---------+
|    cik|        company_name|assigned_sic|accession_number_int|filing_date|document_fiscal_year_focus|datapoint_id|      datapoint_name|     datapoint_label|start_date|  end_date|period_month|string_value|numeric_value|decimals|unit|parent_datapoint_name|statement_type|report_section_description|     version|segment_label|segment_hash|footnotes|
+-------+--------------------+------------+--------------------+-----------+--------------------------+------------+--------------------+--------------------+----------+----------+------------+------------+-------------+--------+----+---------------------+--------------+--------------------------+----

In [8]:
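# NOTE(review): disabled write of `df` to a separate "v3" Parquet path; delete this cell if the export is no longer needed.
#df.write.parquet("s3://sec-finc/annual_statement_data_v3")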

In [6]:
# Mark df for caching; it backs the SQL view that the queries below read from.
df.cache()

DataFrame[cik: int, company_name: string, assigned_sic: int, accession_number_int: bigint, filing_date: date, document_fiscal_year_focus: int, datapoint_id: bigint, datapoint_name: string, datapoint_label: string, start_date: date, end_date: date, period_month: bigint, string_value: string, numeric_value: double, decimals: int, unit: string, parent_datapoint_name: string, statement_type: string, report_section_description: string, version: string, segment_label: string, segment_hash: string, footnotes: string]

### Select the 48 features we need

In [7]:
# Expose the full dataset to Spark SQL under the view name 'SelectedDF'.
# Despite the name, no filtering has been applied at this point.
df.createOrReplaceTempView('SelectedDF')

In [8]:
# Keep only the rows whose datapoint_name is one of the 48 selected tags.
# The literals must match the column values exactly: a trailing space inside
# the quotes (e.g. 'CostOfRevenue ') makes that tag match no rows, so the
# names below are written without any surrounding whitespace.
SelectedDF1 = spark.sql('''
    SELECT *
    FROM SelectedDF
    WHERE datapoint_name in ('CashAndCashEquivalentsAtCarryingValue', 'NetIncomeLoss', 'OperatingIncomeLoss', 'Revenues',
    'SalesRevenueNet', 'CostOfRevenue', 'EarningsPerShareBasic', 'EarningsPerShareDiluted',
    'NetCashProvidedByUsedInOperatingActivities', 'NetCashProvidedByUsedInFinancingActivities',
    'NetCashProvidedByUsedInInvestingActivities', 'NetCashProvidedByUsedInOperatingActivitiesContinuingOperations',
    'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations', 'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
    'ShareBasedCompensation', 'PaymentsToAcquirePropertyPlantAndEquipment', 'OperatingExpenses', 'GeneralAndAdministrativeExpense',
    'SellingGeneralAndAdministrativeExpense', 'SellingAndMarketingExpense', 'IncomeTaxesPaid', 'ResearchAndDevelopmentExpense',
    'PaymentsForRepurchaseOfCommonStock', 'CostOfGoodsSold', 'CostOfGoodsAndServicesSold', 'CostOfServices', 'RepaymentsOfLongTermDebt',
    'PaymentsToAcquireBusinessesNetOfCashAcquired', 'PaymentsOfDividendsCommonStock', 'PaymentsOfDividends', 'LaborAndRelatedExpense',
    'PaymentsOfFinancingCosts', 'IncreaseDecreaseInAccountsReceivable', 'AccountsReceivableNetCurrent', 'IncreaseDecreaseInInventories',
    'IncreaseDecreaseInAccruedLiabilities', 'IncreaseDecreaseInAccountsPayable', 'LiabilitiesCurrent', 'Liabilities',
    'AccountsPayableCurrent', 'StockholdersEquity', 'Assets', 'AssetsCurrent', 'GainLossOnDispositionOfAssets',
    'CommonStockValue', 'PreferredStockValue', 'Goodwill', 'PropertyPlantAndEquipmentNet')
''')

In [9]:
# Preview 10 rows of the filtered result.
SelectedDF1.show(10)

+-------+--------------------+------------+--------------------+-----------+--------------------------+------------+--------------------+--------------------+----------+----------+------------+------------+-------------+--------+----+---------------------+--------------+--------------------------+------------+-------------+------------+---------+
|    cik|        company_name|assigned_sic|accession_number_int|filing_date|document_fiscal_year_focus|datapoint_id|      datapoint_name|     datapoint_label|start_date|  end_date|period_month|string_value|numeric_value|decimals|unit|parent_datapoint_name|statement_type|report_section_description|     version|segment_label|segment_hash|footnotes|
+-------+--------------------+------------+--------------------+-----------+--------------------------+------------+--------------------+--------------------+----------+----------+------------+------------+-------------+--------+----+---------------------+--------------+--------------------------+----

### Save the dataset with the selected features

In [3]:
# Persist the filtered rows as Parquet. `SelectedDF` is only the name of a SQL
# temp view, not a Python variable, so the write must go through `SelectedDF1`,
# the DataFrame returned by the feature-selection query.
# NOTE(review): no save mode is set; confirm how this should behave when the
# target path already exists before re-running.
SelectedDF1.write.parquet('s3://sec-finc/uploaded/')

### Check the final number of companies

In [16]:
# Count the distinct companies remaining after feature selection.
# The filtered DataFrame is `SelectedDF1` (no Python variable named
# `SelectedDF` exists), and it is registered under its own view name so the
# unfiltered 'SelectedDF' view is not replaced.
SelectedDF1.createOrReplaceTempView('SelectedDF1')
comp = spark.sql('''
SELECT COUNT (DISTINCT company_name)
FROM SelectedDF1
''')

In [17]:
# Display the distinct-company count.
comp.show()

+----------------------------+
|count(DISTINCT company_name)|
+----------------------------+
|                        8293|
+----------------------------+

