# Libraries

In [64]:
import os
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
import re
import plotly.plotly as py
import plotly.graph_objs as go
import requests
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import nltk
from nltk import word_tokenize, everygrams
from nltk.util import ngrams

# Obtain data

## Function Definitions

In [None]:
def single_read_in_text(file_path,file_name):
    '''
    input:
    file_path: path of a individule csv file e.g. location of 21centfoxinc_sec_files.csv
    file_name: name of the file e.g 21centfoxinc_sec_files.csv
    
    output:
    a sigle pandas dataframe with all the original columns from the input file + a column for the 'cleaned' data
    
    dev:
    1) can add more output columns for features
    2) can further clean the data 
    
    '''
    
    ## read in single csv to pandas 
    individule_csv = pd.read_csv(file_path+"/"+file_name)
    raw_texts = []
    clean_texts = []
    
    ## create a connection with the url link and readin the raw file
    for url in individule_csv['sec_full_path']:
        print(url)
        raw_texts.append([url,request.urlopen(url).read().decode('utf8')])
    
    ## clean the raw file by:
    ## 1. remove html tags 
    ## 2. break the text by "\n"
    ## 3. remove the spaces in the front of and after each "\n"
    
    for raw_file in raw_texts:
        
        
        
        clean_texts.append([raw_file[0],
                            '%%'.join(list(filter(None,
                                                 [re.sub('[\t]+', ' ', i.strip()) for 
                                                  i in BeautifulSoup(raw_file[1], "lxml").text.split('\n')])))])
        ## more columns, features, data cleanings can be put here
    
    ## merge back to the original read in dataframe 
    clean_texts_df = pd.DataFrame(clean_texts)
    clean_texts_df.columns = ['sec_full_path', 'text']
    merged_df = pd.merge(left = individule_csv, right = clean_texts_df, on = 'sec_full_path')
    
    ## add one more column to indicate the file name 
    merged_df['file_name'] = file_name
    return merged_df

def folder_read_in_text(folder_path, ext = '.csv'):
    '''
    input: 
    folder_path: path of a individule csv file e.g. location of 21centfoxinc_sec_files.csv
    ext: extension of the files that are interested, default to be .csv 
    
    output:
    a sigle pandas dataframe with all the original columns from all the input files inside the folder
    + a column for the 'cleaned' data
    + a column for the file name 
    
    Utilize the single_read_in_text function 
    '''
    file = []
    direc = folder_path 

    # Select only files with the ext extension
    txt_files = [i for i in os.listdir(direc) if os.path.splitext(i)[1] == ext]
    temp_df = pd.DataFrame()
    
    ## Utilize the single_read_in_text function to process data

    for i in txt_files:
        temp_df = temp_df.append(single_read_in_text(file_path = folder_path,file_name = i), ignore_index=True)
    return temp_df

## Function Call

In [None]:
#single_read_in_text('C:/Users/li haoran/Desktop/New folder','21centfoxinc_sec_files.csv')
working_file = folder_read_in_text(folder_path='C:/Users/li haoran/Documents/GitHub/Capstone_Vanguard_NLP_Prediction/Inputs',
                                   ext = '.csv')

## Write Output

In [None]:
working_file.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data.csv', sep='|', encoding='utf-8')

# Further processing 

## Read documents

In [78]:
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(file_path, sep='|', encoding='utf-8')

## Further clean the data

In [87]:
words = set(nltk.corpus.words.words())
cleaned_data['only_eng_words'] = cleaned_data['text'].apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) \
                                      if w.lower() in words))
cleaned_data['evy_gram_1_3'] =cleaned_data['only_eng_words'].apply(lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 4)])

In [88]:
cleaned_data

Unnamed: 0.1,Unnamed: 0,fdate,cik,findexdate,form,coname,fsize,doccount,sec_full_path,text,file_name,only_eng_words,evy_gram_1_3
0,0,2004-11-24,1308161,2004-11-24,8-K,NEWS CORP,5871753.0,3.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
1,1,2004-12-03,1308161,2004-12-03,8-K,NEWS CORP,277725.0,5.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
2,2,2004-12-23,1308161,2004-12-23,8-K,NEWS CORP,925535.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
3,3,2005-01-10,1308161,2005-01-10,8-K,NEWS CORP,38265.0,3.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
4,4,2005-01-27,1308161,2005-01-27,8-K,NEWS CORP,25006.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
5,5,2005-02-02,1308161,2005-02-02,8-K,NEWS CORP,318491.0,4.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
6,6,2005-02-03,1308161,2005-02-03,8-K,NEWS CORP,239034.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
7,7,2005-02-04,1308161,2005-02-04,10-Q,NEWS CORP,1518464.0,8.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
8,8,2005-02-08,1308161,2005-02-08,8-K,NEWS CORP,26164.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."
9,9,2005-02-22,1308161,2005-02-22,8-K,NEWS CORP,40749.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"[BEGIN, PRIVACY, ENHANCED, MESSAGE, Type, CLEA..."


## Write Output

In [89]:
cleaned_data.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated.csv', sep='|', encoding='utf-8')

# Analysis

## Read documents

In [None]:
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(file_path, sep='|', encoding='utf-8')

## EDA

In [41]:
table1 = cleaned_data[['file_name', 'doccount']]\
.groupby(['file_name'])\
.agg(['count','sum'])\
.reset_index()
print(list(table1))
table1

[('file_name', ''), ('doccount', 'count'), ('doccount', 'sum')]


Unnamed: 0_level_0,file_name,doccount,doccount
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,21centfoxinc_sec_files.csv,1048,3978.0
1,attinc_sec_files.csv,555,3383.0
2,cbscorp_sec_files.csv,385,2578.0
3,comcastcorp_sec_files.csv,277,2735.0
4,verizoncom_sec_files.csv,525,3319.0


In [52]:
trace1 = go.Bar(x = table1['file_name'], 
                y = table1['doccount']['count'],
                name = 'filing count')

trace2 = go.Bar(x = table1['file_name'],
                y = table1['doccount']['sum'],
                name = 'document count')

data = [trace1,trace2]


iplot({
    "data":data,
    "layout":go.Layout(title="Distribution of number of emails received", 
                       xaxis={'title':'Num of emails'}, 
                       yaxis={'title':'Percentage of people received %'})})

In [53]:
table2 = cleaned_data[['file_name','form','doccount']]\
.groupby(['file_name','form'])\
.agg(['count','sum'])\
.sort_values(by=['file_name'])\
.reset_index()
print(list(table2))
table2

[('file_name', ''), ('form', ''), ('doccount', 'count'), ('doccount', 'sum')]


Unnamed: 0_level_0,file_name,form,doccount,doccount
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,sum
0,21centfoxinc_sec_files.csv,10-K,14,656.0
1,21centfoxinc_sec_files.csv,10-Q,41,1062.0
2,21centfoxinc_sec_files.csv,8-K,993,2260.0
3,attinc_sec_files.csv,10-K,21,888.0
4,attinc_sec_files.csv,10-Q,74,1056.0
5,attinc_sec_files.csv,8-K,460,1439.0
6,cbscorp_sec_files.csv,10-K,20,694.0
7,cbscorp_sec_files.csv,10-Q,74,1144.0
8,cbscorp_sec_files.csv,8-K,291,740.0
9,comcastcorp_sec_files.csv,10-K,16,845.0


In [54]:
cleaned_data

Unnamed: 0.1,Unnamed: 0,fdate,cik,findexdate,form,coname,fsize,doccount,sec_full_path,text,file_name
0,0,2004-11-24,1308161,2004-11-24,8-K,NEWS CORP,5871753.0,3.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
1,1,2004-12-03,1308161,2004-12-03,8-K,NEWS CORP,277725.0,5.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
2,2,2004-12-23,1308161,2004-12-23,8-K,NEWS CORP,925535.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
3,3,2005-01-10,1308161,2005-01-10,8-K,NEWS CORP,38265.0,3.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
4,4,2005-01-27,1308161,2005-01-27,8-K,NEWS CORP,25006.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
5,5,2005-02-02,1308161,2005-02-02,8-K,NEWS CORP,318491.0,4.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
6,6,2005-02-03,1308161,2005-02-03,8-K,NEWS CORP,239034.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
7,7,2005-02-04,1308161,2005-02-04,10-Q,NEWS CORP,1518464.0,8.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
8,8,2005-02-08,1308161,2005-02-08,8-K,NEWS CORP,26164.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
9,9,2005-02-22,1308161,2005-02-22,8-K,NEWS CORP,40749.0,2.0,https://www.sec.gov/Archives/edgar/data/130816...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,21centfoxinc_sec_files.csv
