In [1]:
import pandas as pd
import numpy as np
import nltk
import os
from pathlib2 import Path
import re
import shutil

In [2]:
# Display the kernel's current working directory.
os.getcwd()

'/Volumes/GoogleDrive/My Drive/Jotham/Personal Docs/ML for finance/SEC Sentiment Analysis/sec-sentiment/code'

In [3]:
def get_project_dir():
    """
    Locate the ``sec-sentiment`` project directory on the Google Drive mount.

    Two layouts are probed, both anchored at the root of the drive that holds
    the current working directory:

    * macOS:   ``<root>/Volumes/GoogleDrive/My Drive/...``
    * Windows: ``<root>/My Drive/...``

    Returns:
        Path: the first candidate that exists as a directory. When neither
        exists, the macOS path is returned unchanged so that callers fail
        with an ordinary "no such file or directory" error naming that path.
    """
    # Joining with '/' discards the cwd and keeps only its root/drive.
    drive_root = Path.cwd() / '/'
    project_subpath = Path(
        'My Drive', 'Jotham', 'Personal Docs', 'ML for finance',
        'SEC Sentiment Analysis', 'sec-sentiment',
    )
    mac_dir = drive_root / 'Volumes' / 'GoogleDrive' / project_subpath
    windows_dir = drive_root / project_subpath

    # Constructing a path never raises, so the platform cannot be detected
    # with try/except; the filesystem has to be checked explicitly.
    for candidate in (mac_dir, windows_dir):
        if candidate.is_dir():
            return candidate

    return mac_dir

In [4]:
# Display the kernel's current working directory.
# NOTE(review): same statement as the os.getcwd() cell above; candidate for removal.
os.getcwd()

'/Volumes/GoogleDrive/My Drive/Jotham/Personal Docs/ML for finance/SEC Sentiment Analysis/sec-sentiment/code'

In [5]:
def _extract_filing_text(data, filing_type):
    """
    Return the cleaned plain text of the ``filing_type`` document in ``data``,
    or ``None`` when ``data`` contains no ``<TYPE>filing_type ... </TEXT>`` span.
    """
    # Isolate the first document of the requested type (case-sensitive match).
    section = re.search("<TYPE>{}.*?</TEXT>".format(filing_type), data, flags=re.DOTALL)
    if section is None:
        return None
    text = section.group(0)

    # Drop the SGML header fields; each one runs up to the next '<'.
    # Flags are passed via ``flags=`` because an inline ``(?i)`` that is not
    # at the very start of a pattern is an error on Python 3.11+.
    for header_tag in ('TYPE', 'SEQUENCE', 'FILENAME', 'DESCRIPTION'):
        text = re.sub("<{}>.*?(?=<)".format(header_tag), '', text, flags=re.IGNORECASE)

    # Drop <head> and <table> elements together with their content.
    text = re.sub("<head>.*?</head>", '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub("<table.*?</table>", '', text, flags=re.IGNORECASE | re.DOTALL)

    # Prefix every "Item" heading with a marker so sections can be found later.
    # NOTE(review): the marker is 'Â°Item' (not '°Item'); kept byte-for-byte
    # because downstream consumers are not visible here - confirm before changing.
    text = re.sub("> +Item|>Item|^Item", ">Â°Item", text,
                  flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)

    # Replace every remaining tag with a space.
    text = re.sub("<.*?>", " ", text, flags=re.DOTALL)

    # Replace HTML character entities such as &nbsp; or &#160; with a space.
    text = re.sub("&(.{2,6});", " ", text)

    # Collapse runs of spaces into one.
    return re.sub(" +", " ", text)


def clean_filing(input_filename, filing_type, output_filename):
    """
    Clean a 10-K or 10-Q filing and write the result to a text file.

    The filing is flattened onto one line, the document of the requested type
    is extracted, header fields, ``<head>``/``<table>`` elements, HTML tags and
    character entities are removed, and "Item" headings are marked.

    All arguments are strings:
    input_filename: name of the file to be cleaned
    filing_type: either 10-K or 10-Q
    output_filename: name of the output file; '.txt' is appended to it

    Returns None. Cleaning is best-effort: if the filing type is not found or
    the output cannot be produced, a message is printed and no exception is
    raised. Errors while opening or reading ``input_filename`` do propagate.
    """
    # Read the whole filing and replace line breaks with spaces.
    with open(input_filename, 'r') as f:
        data = f.read().replace('\n', ' ')

    try:
        data_processed = _extract_filing_text(data, filing_type)
        if data_processed is None:
            print('{} could not be cleaned. Exception: {}'.format(
                input_filename,
                'no <TYPE>{} ... </TEXT> section found'.format(filing_type)))
            return

        with open(output_filename + '.txt', 'w') as output:
            output.write(data_processed)

    # Exception rather than BaseException, so Ctrl-C / SystemExit still stop a batch run.
    except Exception as e:
        print('{} could not be cleaned. Exception: {}'.format(input_filename, e))

In [6]:
def _quarter_for_filing_month(month):
    """Return the quarter label for a 10-Q filed in ``month``: 1-5 is Q1, 6-8 is Q2, anything else Q3."""
    if 1 <= month <= 5:
        return 'Q1'
    if 6 <= month <= 8:
        return 'Q2'
    return 'Q3'


def clean_all_filings():
    """
    Clean all filings in the sec-filings-downloaded directory.

    For every company sub-directory:
      1. each raw file whose name ends in '10-K' or '10-Q' is cleaned into
         'cleaned_<name>.txt' next to it (names starting with 'clean' are skipped);
      2. each 'cleaned_<YYYY-MM-DD>...10-Q.txt' file is renamed to
         'cleaned_<quarter>_<YYYY-MM-DD>_10-Q.txt', the quarter being derived
         from the month in the file name.

    Side effect: changes the process working directory (to the downloads
    directory, then to each company directory in turn) and leaves it there.
    """
    project_dir = get_project_dir()
    downloads_dir = os.path.join(project_dir, 'sec-filings-downloaded')
    os.chdir(downloads_dir)

    for company in os.listdir():
        company_dir = os.path.join(downloads_dir, company)
        # Stray files in the downloads folder (e.g. .DS_Store) are not companies.
        if not os.path.isdir(company_dir):
            continue
        os.chdir(company_dir)  # file names below are relative to this directory

        print('***Cleaning: {}***'.format(company))
        for file in os.listdir():
            if file.startswith('clean'):
                continue
            if file.endswith('10-K'):
                filing_type = '10-K'
            elif file.endswith('10-Q'):
                filing_type = '10-Q'
            else:
                continue
            clean_filing(input_filename=file, filing_type=filing_type,
                         output_filename='cleaned_' + str(file))
            print('{} filing cleaned'.format(file))

        # Add the filing quarter to the cleaned 10-Q file names. Only names that
        # still start with 'cleaned_<date>' are touched, so files renamed by a
        # previous run ('cleaned_Q1_...') are left alone instead of being
        # re-sliced at fixed offsets.
        for file in os.listdir():
            if not file.endswith('10-Q.txt'):
                continue
            dated = re.match(r'cleaned_(\d{4}-(\d{2})-\d{2})', file)
            if dated is None:
                continue
            filing_date = dated.group(1)
            filing_quarter = _quarter_for_filing_month(int(dated.group(2)))

            # os.replace overwrites an existing target on every platform,
            # which keeps a re-run from failing on Windows.
            os.replace(file, 'cleaned_' + filing_quarter + '_' + filing_date + '_' + '10-Q.txt')

                
            

In [7]:
def move_files_to_10k_10q_folders():
    """
    Move cleaned filings to the appropriate folders in each company directory.

    Inside every company sub-directory of sec-filings-downloaded, the folders
    'cleaned_10-K' and 'cleaned_10-Q' are created if missing; files whose names
    start with 'cleaned' and end in '10-K.txt' / '10-Q.txt' are moved into the
    matching folder.

    Side effect: changes the process working directory (to the downloads
    directory, then to each company directory in turn) and leaves it there.
    """
    project_dir = get_project_dir()
    downloads_dir = os.path.join(project_dir, 'sec-filings-downloaded')
    os.chdir(downloads_dir)

    for company in os.listdir():
        company_dir = os.path.join(downloads_dir, company)
        # Stray files in the downloads folder (e.g. .DS_Store) are not companies.
        if not os.path.isdir(company_dir):
            continue
        os.chdir(company_dir)

        # File-name suffix -> destination folder for that filing type.
        destinations = {
            '10-K.txt': os.path.join(company_dir, 'cleaned_10-K'),
            '10-Q.txt': os.path.join(company_dir, 'cleaned_10-Q'),
        }
        for folder in destinations.values():
            os.makedirs(folder, exist_ok=True)

        for file in os.listdir():
            if not file.startswith('cleaned'):
                continue
            for suffix, folder in destinations.items():
                if file.endswith(suffix):
                    shutil.move(os.path.join(company_dir, file), os.path.join(folder, file))
                    break

In [8]:
# Clean every downloaded filing; progress is printed per company and per file.
clean_all_filings()

***Cleaning: MEDALLION FINANCIAL CORP***
***Cleaning: OPTICAL CABLE CORP***
2018-12-19_10-K filing cleaned
2018-09-11_10-Q filing cleaned




2018-06-11_10-Q filing cleaned
2018-03-13_10-Q filing cleaned
2017-09-12_10-Q filing cleaned
2017-12-20_10-K filing cleaned
2017-06-13_10-Q filing cleaned
2016-06-07_10-Q filing cleaned
2016-03-14_10-Q filing cleaned
2016-01-28_10-K filing cleaned
2016-12-20_10-K filing cleaned
2017-03-08_10-Q filing cleaned
2016-09-13_10-Q filing cleaned
***Cleaning: NICHOLAS FINANCIAL INC***
2018-11-14_10-Q filing cleaned
2018-08-14_10-Q filing cleaned
2018-02-09_10-Q filing cleaned
2018-06-27_10-K filing cleaned
2017-08-09_10-Q filing cleaned
2017-11-09_10-Q filing cleaned
2017-06-14_10-K filing cleaned
2017-02-09_10-Q filing cleaned
2016-11-09_10-Q filing cleaned
2016-08-09_10-Q filing cleaned
2016-06-14_10-K filing cleaned
2016-02-09_10-Q filing cleaned
***Cleaning: CORE LABORATORIES N V (1)***
2018-10-25_10-Q filing cleaned
2018-07-27_10-Q filing cleaned
2018-02-12_10-K filing cleaned
2018-04-27_10-Q filing cleaned
2017-07-26_10-Q filing cleaned
2017-10-25_10-Q filing cleaned
2017-04-21_10-Q fili

In [9]:
# Sort the cleaned files into each company's cleaned_10-K / cleaned_10-Q folders.
move_files_to_10k_10q_folders()