# Libraries

In [1]:
import os
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
import re
import plotly.plotly as py
import plotly.graph_objs as go
import requests
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import nltk
from nltk import word_tokenize, everygrams
from nltk.util import ngrams

# Obtain data

## Function Definitions

In [2]:
def single_read_in_text(file_path,file_name):
    '''
    Load one index csv and attach the cleaned text of every filing it lists.

    input:
    file_path: folder that contains the csv file e.g. location of 21centfoxinc_sec_files.csv
    file_name: name of the file e.g 21centfoxinc_sec_files.csv
               (the file must have a 'sec_full_path' column with one url per row)

    output:
    a single pandas dataframe with all the original columns from the input file
    + 'text': the cleaned text downloaded from the row's url
    + 'file_name': the file_name argument, repeated on every row
    One output row per input row, in the input order.

    cleaning applied to every downloaded document:
    1. remove html tags (BeautifulSoup with the lxml parser)
    2. break the text at line breaks
    3. strip each line and replace each run of tabs with one space
    4. drop the empty lines and join the remaining ones with '%%'

    raises:
    whatever pd.read_csv / urlopen / decode raise (missing file, unreachable url,
    document that is not valid utf8); nothing is caught here.

    dev:
    1) can add more output columns for features
    2) can further clean the data
    '''

    ## read in single csv to pandas
    individule_csv = pd.read_csv(os.path.join(file_path, file_name))

    ## download and clean every distinct url exactly once
    clean_text_by_url = {}
    for url in individule_csv['sec_full_path']:
        if url in clean_text_by_url:
            continue
        print(url)

        ## the with-block closes the connection even when read/decode fails
        with request.urlopen(url) as response:
            raw_text = response.read().decode('utf8')

        stripped_lines = (re.sub('[\t]+', ' ', line.strip())
                          for line in BeautifulSoup(raw_text, "lxml").text.split('\n'))
        clean_text_by_url[url] = '%%'.join(filter(None, stripped_lines))
        ## more columns, features, data cleanings can be put here

    ## attach the text row by row instead of merging on the url:
    ## a merge multiplies the rows of any url that is listed more than once,
    ## and building the right-hand frame fails when the csv has no rows
    merged_df = individule_csv.copy()
    merged_df['text'] = [clean_text_by_url[url] for url in merged_df['sec_full_path']]

    ## add one more column to indicate the file name
    merged_df['file_name'] = file_name
    return merged_df
def folder_read_in_text(folder_path, ext = '.csv'):
    '''
    Run single_read_in_text on every matching file of a folder and stack the results.

    input:
    folder_path: path of the folder that holds the input files e.g. the folder containing 21centfoxinc_sec_files.csv
    ext: extension of the files that are interested, default to be .csv

    output:
    a single pandas dataframe with all the original columns from all the input files inside the folder
    + a column for the 'cleaned' data
    + a column for the file name
    Files are processed in sorted name order and the result gets a fresh 0..n-1 index.
    An empty dataframe is returned when no file in the folder has the requested extension.

    Utilize the single_read_in_text function
    '''
    # Select only files with the ext extension; sorted because os.listdir
    # returns the names in arbitrary order and the row order should be repeatable
    txt_files = sorted(i for i in os.listdir(folder_path) if os.path.splitext(i)[1] == ext)

    ## Utilize the single_read_in_text function to process data
    frames = [single_read_in_text(file_path = folder_path, file_name = i) for i in txt_files]

    ## pd.concat raises on an empty list, so keep returning an empty frame in that case
    if not frames:
        return pd.DataFrame()

    ## one concat at the end replaces DataFrame.append inside the loop
    ## (append was removed in pandas 2.0 and re-copied all rows on every iteration)
    return pd.concat(frames, ignore_index=True)

## Function Call

In [None]:
# Download and clean every filing listed in the index csv files of the Inputs folder.
# For a single file, single_read_in_text(<folder>, '21centfoxinc_sec_files.csv') can be called directly.
working_file = folder_read_in_text(
    'C:/Users/li haoran/Documents/GitHub/Capstone_Vanguard_NLP_Prediction/Inputs',
    '.csv',
)

## Write Output

In [None]:
# Persist the scraped filings as a '|'-separated file so the later sections can start from disk.
# index=False keeps the positional row index out of the file; when it is written,
# pd.read_csv returns it as an extra 'Unnamed: 0' column.
working_file.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data.csv', sep='|', encoding='utf-8', index=False)

# Further processing 

## Read documents

In [3]:
# Load the '|'-separated, utf-8 encoded file of scraped filings back into a dataframe.
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(
    file_path,
    encoding='utf-8',
    sep='|',
)

## Further clean the data

In [4]:
# Vocabulary taken from the NLTK 'words' corpus (the corpus has to be installed locally).
words = set(nltk.corpus.words.words())

# Keep only the tokens whose lower-cased form is in the vocabulary.
# fillna(''): pd.read_csv turns an empty 'text' field into NaN (a float), which the
# tokenizer cannot process; an empty string simply yields no tokens.
cleaned_data['only_eng_words'] = cleaned_data['text'].fillna('').apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) \
                                      if w.lower() in words))

# Every n-gram of length 1 to 4 over the filtered text, each one joined with single spaces.
# NOTE(review): the column is called 'evy_gram_1_3' but max_len is 4 - confirm which of
# the two is intended before relying on the column name.
cleaned_data['evy_gram_1_3'] =cleaned_data['only_eng_words'].apply(lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 4)])

In [7]:
# Per input file: number of non-null doccount rows ('count') and their total ('sum').
(
    cleaned_data[['file_name', 'doccount']]
    .groupby(['file_name'])
    .agg(['count', 'sum'])
    .reset_index()
)

Unnamed: 0_level_0,file_name,doccount,doccount
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,21centfoxinc_sec_files.csv,1048,3978.0
1,attinc_sec_files.csv,555,3383.0
2,cbscorp_sec_files.csv,385,2578.0
3,comcastcorp_sec_files.csv,277,2735.0
4,verizoncom_sec_files.csv,525,3319.0


## Write Output

In [None]:
# Save the enriched frame as a '|'-separated file.
# index=False: the row index carries no information here, and writing it adds one more
# 'Unnamed: 0' column each time the file is read back and saved again.
cleaned_data.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated.csv', sep='|', encoding='utf-8', index=False)

In [9]:
# Write one '|'-separated file per input file; the output name is
# 'cleaned_data_updated_' followed by the input file name.
# A single loop replaces five copies of the same statement.
# index=False: the filtered rows keep their labels from the full frame, which would
# otherwise be written out and read back as an 'Unnamed: 0' column.
for company_file in ['21centfoxinc_sec_files.csv',
                     'attinc_sec_files.csv',
                     'cbscorp_sec_files.csv',
                     'comcastcorp_sec_files.csv',
                     'verizoncom_sec_files.csv']:
    cleaned_data[cleaned_data['file_name']==company_file]\
    .to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_' + company_file,
            sep='|', encoding='utf-8', index=False)

# Analysis

## Read documents

In [None]:
## Full data: every input file, as written by the "Obtain data" section
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(
    file_path,
    encoding='utf-8',
    sep='|',
)

In [2]:
## Verizon only: cleaned_data_updated_verizoncom_sec_files.csv
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_verizoncom_sec_files.csv'
cleaned_verizon_data = pd.read_csv(
    file_path,
    encoding='utf-8',
    sep='|',
)

## EDA

In [None]:
# Per input file: number of non-null doccount rows ('count') and their total ('sum').
table1 = (
    cleaned_data[['file_name', 'doccount']]
    .groupby(['file_name'])
    .agg(['count', 'sum'])
    .reset_index()
)
# list() of a dataframe gives its column labels; after agg they are two-level tuples
print(list(table1))
table1

In [None]:
# One bar per input file for the number of filings ...
trace1 = go.Bar(x = table1['file_name'],
                y = table1['doccount']['count'],
                name = 'filing count')

# ... and one for the total number of documents in those filings.
trace2 = go.Bar(x = table1['file_name'],
                y = table1['doccount']['sum'],
                name = 'document count')

data = [trace1,trace2]


# Title and axis labels describe what is plotted (counts per input file);
# the previous texts talked about emails and percentages.
iplot({
    "data":data,
    "layout":go.Layout(title="Number of filings and documents per input file",
                       xaxis={'title':'Input file'},
                       yaxis={'title':'Count'})})

In [None]:
# Same count / sum of doccount, broken down by input file and form type.
table2 = (
    cleaned_data[['file_name', 'form', 'doccount']]
    .groupby(['file_name', 'form'])
    .agg(['count', 'sum'])
    .sort_values(by=['file_name'])
    .reset_index()
)
# list() of a dataframe gives its column labels; after agg they are two-level tuples
print(list(table2))
table2

## Extract Features

Notes:
Parsing the financial tables in older SEC filings is very hard; more recent filings are easier to parse.

https://github.com/ragraw26/Edgar-COMPANY-FILINGS-Web-Scrapping-Data-Analysis/blob/master/Data%20Scrapping/Team_5_Part1_Report.pdf

https://www.sec.gov/cgi-bin/viewer?action=view&cik=732712&accession_number=0001193125-10-041685&xbrl_type=v

https://www.codeproject.com/Articles/1227268/Accessing-Financial-Reports-in-the-EDGAR-Database

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all-next-and-find-next

In [12]:
# URL of the first 8-K filing in the Verizon data.
# .iloc[0] takes the first matching row by position; a plain [0] is a label lookup
# and raises KeyError whenever the row labelled 0 is not an 8-K.
cleaned_verizon_data.loc[cleaned_verizon_data['form']=='8-K', 'sec_full_path'].iloc[0]

'https://www.sec.gov/Archives/edgar/data/732712/0000950133-94-000018.txt'

In [4]:
# Number of Verizon filings per form type.
(
    cleaned_verizon_data[['file_name', 'form']]
    .groupby(['form'])
    .agg(['count'])
    .reset_index()
)

Unnamed: 0_level_0,form,file_name
Unnamed: 0_level_1,Unnamed: 1_level_1,count
0,10-K,21
1,10-Q,74
2,8-K,430


In [173]:
# One filing used as the test case for table parsing (test_html holds its URL).
test_html = 'https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-000587.txt'
# the with-block closes the connection once the body has been read, also on errors
with request.urlopen(test_html) as response:
    test_file = response.read().decode('utf8')

In [161]:
# Pattern for a lower-case <table> tag followed, after optional whitespace, by <caption>.
# Two earlier attempts used r'\table\b' with re.I | re.X; in a regex \t is a tab,
# so they looked for a tab followed by "able" rather than the word "table".
# No flags are set: the match is case-sensitive.
r = re.compile(r'\<table\>\s*\<caption\>', flags=0)

In [243]:
# Parse the downloaded filing with the 'html.parser' backend and collect every <table> element.
soup = BeautifulSoup(test_file, features='html.parser')
table = soup.find_all(name='table')

In [255]:
# Show the fourth <table> element found in the filing (index 3), markup included.
table[3]

<table>
<caption> 
                                                                                          Held
        Name                  Age                      Office                             Since
        ----                  ---                      ------                             -----
<s> <c> <c> <c>  
Raymond W. Smith...........   56  Chairman of the Board and Chief Executive Officer        1989
James G. Cullen............   51  President                                                1993
William O. Albertini.......   50  Vice President and Chief Financial Officer               1991
Joseph T. Ambrozy..........   54  Vice President - Strategic Planning                      1992
Lawrence T. Babbio, Jr.....   49  Chairman, President and Chief Executive Officer,         1991
                                        Bell Atlantic Enterprises International, Inc.
P. Alan Bulliner...........   50  Vice President - Corporate Secretary and Counsel         1992  
Barbara L. C

In [252]:
# The same element as one plain string: get_text() returns the text without the tags.
table[3].get_text()

'\n \n                                                                                          Held\n        Name                  Age                      Office                             Since\n        ----                  ---                      ------                             -----\n     \nRaymond W. Smith...........   56  Chairman of the Board and Chief Executive Officer        1989\nJames G. Cullen............   51  President                                                1993\nWilliam O. Albertini.......   50  Vice President and Chief Financial Officer               1991\nJoseph T. Ambrozy..........   54  Vice President - Strategic Planning                      1992\nLawrence T. Babbio, Jr.....   49  Chairman, President and Chief Executive Officer,         1991\n                                        Bell Atlantic Enterprises International, Inc.\nP. Alan Bulliner...........   50  Vice President - Corporate Secretary and Counsel         1992  \nBarbara L. Connor.........

In [None]:
# Generic <tr>/<th>/<td> table reader.
# NOTE(review): `html` is not defined in any visible cell, and this cell rebinds the
# names `soup` and `table` - confirm the intended input before running it.
# The parser is named explicitly; without it BeautifulSoup picks whichever parser is
# installed, so the result depends on the environment.
soup = BeautifulSoup(html, 'html.parser')
table = soup.find("table", attrs={"class":"details"})

# The first tr contains the field names.
headings = [th.get_text() for th in table.find("tr").find_all("th")]

# One list of (heading, cell text) pairs per data row.
# list(...) materializes the pairs: a bare zip object is a one-shot iterator in
# Python 3 and would be empty after the first pass over it.
datasets = []
for row in table.find_all("tr")[1:]:
    dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
    datasets.append(dataset)

In [6]:
##8-k
test_html = 'https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-000587.txt'
# the with-block closes the connection once the body has been read, also on errors
with request.urlopen(test_html) as response:
    test_file = response.read().decode('utf8')