In [48]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pandas as pd
import zipfile
import requests
from io import BytesIO
import logging

# Remove all handlers attached to the root logger so that basicConfig() below
# takes effect even if logging was already configured in this kernel.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Set up logging to a file; filemode='w' overwrites the log on every run.
logging.basicConfig(filename='missing_data_log.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s%(levelname)-8s%(message)s')

# Open and parse the EDGAR Log File Data Set index page.
edgarLogUrl = 'https://www.sec.gov/dera/data/edgar-log-file-data-set.html'
edgarLogPage = BeautifulSoup(urlopen(edgarLogUrl), 'lxml')
logging.info('Opening and parse EDGAR Log File Data Set html.')

# Follow the link whose text equals the requested year and parse that page.
year = '2003'
try:
    for link in edgarLogPage.find_all('a'):
        if link.text == year:
            edgarLogFiles = BeautifulSoup(urlopen('https://www.sec.gov' + link.get('href')), 'lxml')
            logging.info('Get log file html of specified year.')
            break
    else:
        # A loop that finds nothing does not raise by itself; fail explicitly so
        # the cells below never run against an undefined `edgarLogFiles`.
        raise LookupError('no link with text %r found on %s' % (year, edgarLogUrl))
except Exception:
    # Lazy %-formatting: logging takes the message template plus its arguments.
    logging.warning('####Error, no %s log file data set found！', year)
    raise

# Collect the archive of the first day of each month (link text ends in "01.zip").
# Sorting by link text gives chronological order independent of the order in
# which the page happens to list the files.
firstOfMonthLinks = [
    (link.text, link.get('href'))
    for link in edgarLogFiles.find_all('a')
    if re.match(r'.*01\.zip$', link.text)
]
monthList = [href for _, href in sorted(firstOfMonthLinks, key=lambda pair: pair[0])]
logging.info('Get url of each month.')

# Download one month's zip and parse the CSV it contains.
monthIndex = 9  # zero-based position in monthList; October when all twelve months are listed
if monthIndex >= len(monthList):
    raise IndexError('expected more than %d monthly archives for %s, found %d'
                     % (monthIndex, year, len(monthList)))
content = requests.get(monthList[monthIndex])
content.raise_for_status()  # do not try to unzip an HTTP error page
with zipfile.ZipFile(BytesIO(content.content)) as zf:
    csvNames = [name for name in zf.namelist() if re.match(r'.*\.csv$', name)]
    if not csvNames:
        raise ValueError('no .csv member found in %s' % monthList[monthIndex])
    # If the archive holds several CSV files the last one is used, as before.
    with zf.open(csvNames[-1]) as csvFile:
        df = pd.read_csv(csvFile)
logging.info('Download zip and parse csv file')

# Fill 'unknown' for missing data in column 'browser'.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column, which is chained assignment and may not update `df`.
df['browser'] = df['browser'].fillna('unknown')
logging.info("Fill 'unknown' for missing data in column 'browser'")

# Fill the column mean for missing data in column 'size'.
df['size'] = df['size'].fillna(df['size'].mean())
logging.info("Fill average for missing data in column 'size'")



http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr1/log20030101.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr1/log20030201.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr1/log20030301.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr2/log20030401.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr2/log20030501.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr2/log20030601.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr3/log20030701.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr3/log20030801.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr3/log20030901.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr4/log20031001.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr4/log20031101.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr4/log20031201.zip


In [20]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ip,date,time,zone,cik,accession,extention,code,size,idx,norefer,noagent,find,crawler,browser
0,129.105.133.bcf,2003-10-01,00:00:23,400.0,900405.0,0000950134-02-001349,.txt,200.0,7702.0,0.0,1.0,0.0,0.0,1.0,unknown
1,129.105.133.bcf,2003-10-01,00:00:23,400.0,891024.0,0001045969-02-000262,.txt,200.0,8675.0,0.0,1.0,0.0,0.0,1.0,unknown
2,129.105.133.bcf,2003-10-01,00:00:47,400.0,893949.0,0001047469-02-002139,.txt,200.0,7377.0,0.0,1.0,0.0,0.0,1.0,unknown
3,164.164.89.djf,2003-10-01,00:01:37,400.0,802681.0,0001181431-03-024733,-index.htm,200.0,2726.0,1.0,0.0,0.0,1.0,0.0,win
4,12.163.71.fdh,2003-10-01,00:01:38,400.0,54058.0,0000897069-03-000996,-index.htm,200.0,2379.0,1.0,0.0,0.0,1.0,0.0,win


In [21]:
# Number of missing values in each column.
df.isna().sum()

ip           0
date         0
time         0
zone         0
cik          0
accession    0
extention    0
code         0
size         0
idx          0
norefer      0
noagent      0
find         0
crawler      0
browser      0
dtype: int64

In [31]:
# Summary statistics of the numeric columns.
# NOTE(review): in the recorded output the 75th percentile of `size` equals its
# mean (140697.3). This looks like an artifact of filling missing sizes with the
# column mean — confirm before relying on `size` quantiles.
df.describe()

Unnamed: 0,zone,cik,code,size,idx,norefer,noagent,find,crawler
count,119840.0,119840.0,119840.0,119840.0,119840.0,119840.0,119840.0,119840.0,119840.0
mean,400.0,808558.9,215.585614,140697.3,0.394351,0.310664,0.014603,3.342874,0.04269
std,0.0,392961.1,37.675826,488324.6,0.488713,0.462767,0.119957,3.906975,0.202159
min,400.0,20.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,400.0,732712.0,200.0,3044.0,0.0,0.0,0.0,0.0,0.0
50%,400.0,921503.0,200.0,9021.5,0.0,0.0,0.0,1.0,0.0
75%,400.0,1089567.0,200.0,140697.3,1.0,1.0,0.0,9.0,0.0
max,400.0,1265723.0,416.0,64288350.0,1.0,1.0,1.0,10.0,1.0


In [24]:
import numpy as np
def outliers_iqr(ys, iqr_multiplier=1.5):
    """Locate values lying outside the interquartile-range fences.

    A value is flagged when it is below ``Q1 - iqr_multiplier * IQR`` or above
    ``Q3 + iqr_multiplier * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of ``ys`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    ys : array-like
        Numeric data (e.g. a NumPy array or a pandas Series).
    iqr_multiplier : float, optional
        Width of the fences in units of the IQR. Defaults to 1.5.

    Returns
    -------
    tuple of numpy.ndarray
        Positional indices of the flagged values, in the form returned by
        ``np.where`` (one index array per dimension).

    Notes
    -----
    NaNs are ignored when the quartiles are computed and are never flagged.
    With plain ``np.percentile`` a single NaN turns both quartiles into NaN,
    which silently disables the detection.
    """
    values = np.asarray(ys)
    if values.size == 0:
        # No data means no outliers; skip the percentile call on an empty array.
        return np.where(np.zeros(values.shape, dtype=bool))
    quartile_1, quartile_3 = np.nanpercentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * iqr_multiplier)
    upper_bound = quartile_3 + (iqr * iqr_multiplier)
    return np.where((values > upper_bound) | (values < lower_bound))
# Indices of IQR outliers in `zone`. In the recorded describe() output `zone` is
# 400.0 at every quantile (std 0.0), so the IQR is 0 and no row is flagged.
outliers_iqr(df['zone'])

(array([], dtype=int64),)

In [49]:
# outliers_iqr returns the tuple produced by np.where; print each index array in it.
zone_outlier_indices = outliers_iqr(df['zone'])
for index_array in zone_outlier_indices:
    print(index_array)

[]


In [37]:
# NOTE(review): same call as in cell In [24] with the same (empty) result; this
# duplicate cell can probably be removed.
outliers_iqr(df['zone'])

(array([], dtype=int64),)

In [51]:
# 25th and 75th percentiles of `zone`; only the lower quartile is printed.
zone_quartiles = np.percentile(df['zone'], [25, 75])
quartile_1, quartile_3 = zone_quartiles
print(quartile_1)

400.0
