In [0]:
from google.colab import drive
drive.mount('/content/drive')

## Data Preprocessing
The cells below preprocess the data.

Prerequisites for the data processing to work.

1) Create a folder in you google drive final-project/raw/<quartername>
  
2) Download the zip fles from the following URL
  https://www.sec.gov/dera/data/financial-statement-data-sets.html
  
   For the purpose of this project I have used the last 4 quarters.
  
##Step 1.  Extract the raw data files.

In [0]:
import zipfile

with zipfile.ZipFile('/content/drive/My Drive/final-project/raw/2018q3_notes.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/drive/My Drive/final-project/raw/extracted/2018q3')
with zipfile.ZipFile('/content/drive/My Drive/final-project/raw/2018q4_notes.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/drive/My Drive/final-project/raw/extracted/2018q4')
with zipfile.ZipFile('/content/drive/My Drive/final-project/raw/2019q1_notes.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/drive/My Drive/final-project/raw/extracted/2019q1')
with zipfile.ZipFile('/content/drive/My Drive/final-project/raw/2019q2_notes.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/drive/My Drive/final-project/raw/extracted/2019q2')

In [0]:
with open("/content/drive/My Drive/final-project/raw/extracted/2018q3/txt.tsv") as myfile:
    head = [next(myfile) for x in range(10)]
print(head)

['adsh\ttag\tversion\tddate\tqtrs\tiprx\tlang\tdcml\tdurp\tdatp\tdimh\tdimn\tcoreg\tescaped\tsrclen\ttxtlen\tfootnote\tfootlen\tcontext\tvalue\n', '0001213900-18-008637\tIncomeTaxPreferentialRateDescription\t0001213900-18-008637\t20171231\t4\t0\ten-US\t32767\t0.01095891\t0.0\t0xf7f18cbab3149e06b113a7cc4ac14319\t1\t\t0\t357\t357\t\t0\tContext_FYE_01_Jan_2017T00_00_00_TO_31_Dec_2017T00_00_00_IncomeTaxAuthorityAxis_BorqsBeijingMember\tBORQS Beijing was qualified for a High and New Technology Enterprises ("HNTE") since 2012 and is eligible for a 15% preferential tax rate from 2012 to 2014. In July 2015, BORQS Beijing obtained a new HNTE certificate, which will expire in July 2018. For the years ended December 31, 2015, 2016 and 2017, BORQS Beijing enjoyed a preferential tax rate of 15%.\n', '0001213900-18-008637\tIncomeTaxPreferentialRateDescription\t0001213900-18-008637\t20171231\t4\t0\ten-US\t32767\t0.01095891\t0.0\t0xf9d8f3c6bb34f4007d5655734f5ee93a\t1\t\t0\t605\t605\t\t0\tContext_FYE_0

In [0]:
import pandas as pd

#folders = ['2018q3/','2018q4/','2019q1/','2019q2/']
folders = ['2018q3/']
extract_folder = '/content/drive/My Drive/final-project/raw/extracted/'
context_file = 'txt.tsv'

filenames = []
for folder in folders:
  filename = extract_folder + folder + context_file
  filenames.append(filename)
print(filenames)
  
dfs = pd.concat([pd.read_csv(f, sep='\t') for f in filenames], ignore_index = True)
    
print(dfs.columns.values)

['/content/drive/My Drive/final-project/raw/extracted/2018q3/txt.tsv']
['adsh' 'tag' 'version' 'ddate' 'qtrs' 'iprx' 'lang' 'dcml' 'durp' 'datp'
 'dimh' 'dimn' 'coreg' 'escaped' 'srclen' 'txtlen' 'footnote' 'footlen'
 'context' 'value']


## Step 2 gather some statistics on the data to further cleanse and reduce noise

In [0]:
##print(dfs.loc[: , "value"])
#Fetch wordcount for each content value
dfs['word_count'] = dfs['value'].apply(lambda x: len(str(x).split(" ")))
dfs[['value','word_count']].head()

Unnamed: 0,value,word_count
0,BORQS Beijing was qualified for a High and New...,62
1,Yuantel Telecom was qualified for a High and N...,102
2,(l) Impairment of long-lived assets: The Compa...,240
3,"For the years ended December 31, 2015 2016 201...",42
4,"Three Months Ended March 31, 2017 2018 US$ US$...",34


In [0]:
##Descriptive statistics of word counts
dfs.word_count.describe()

count    455141.000000
mean         96.609286
std         120.663369
min           1.000000
25%           1.000000
50%          40.000000
75%         167.000000
max         666.000000
Name: word_count, dtype: float64

In [0]:
#Identify common words
import pandas
#Identify common words
freq = pandas.Series(''.join(map(str,dfs['value'])).split()).value_counts()[:20]
freq

the        1874070
of         1466463
$          1254879
and        1206969
to          698295
in          591267
)           498098
for         440268
2018        361920
a           356613
June        350656
30,         345073
The         343280
as          295407
on          271064
2017        269819
are         249429
Company     245354
is          239585
or          208053
dtype: int64

In [0]:
#Identify uncommon words
freq1 =  pandas.Series(''.join(map(str,dfs['value'])).split()).value_counts()[:-20]
freq1

# Pre Processing the text.
Now that we have the basic stats lets do some pre processing to remove noise and normalize the data.Data components that are redundant to the core text analytics can be considered as noise.

In [0]:
import nltk
import re
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer 


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
##Creating a list of stop words and adding custom stopwords
stop_words = set(stopwords.words("english"))
##Creating a list of custom stopwords
new_words = ['Jan', 'Janurary', 'Feb', 'February', 'March', 'April', 'May', 'Jun', 'June', 'July',
            'Aug', 'August', 'Sept', 'September', 'Oct', 'October', 'Nov', 'November', 'Dec', 'December', 
             'Month', 'Ended', 'Ending', 'Three', 'Period']
stop_words = stop_words.union(new_words)

In [0]:
#Build the corpus and save it.
corpus = []
for i in range(0, 5000):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', str(dfs['value'][i]))
    
    #Convert to lowercase
    text = text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    ##Convert to list from string
    text = text.split()
    
    ##Stemming
    ps=PorterStemmer()
    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in  
            stop_words]
    text = [ word for word in text if len(word) > 3 ]
    if len(text) > 0:
      text = " ".join(text)
      corpus.append(text)

In [0]:
#View corpus item
corpus[:100]

['borqs beijing qualified high technology enterprise hnte since eligible preferential rate july borqs beijing obtained hnte certificate expire july year ended december borqs beijing enjoyed preferential rate',
 'yuantel telecom qualified high technology enterprise hnte since eligible preferential rate october yuantel telecom obtained hnte certificate expired october yuantel telecom successfully renewed hnte certificate december effective term three year accordance income enterprise awarded hnte status enjoy reduced rate year ended december yuantel telecom enjoyed preferential rate',
 'impairment long lived asset company periodically review estimated useful life depreciable asset change useful life made prospective basis unless factor indicate carrying amount asset recoverable impairment write necessary however company review long lived asset impairment event change circumstance indicate carrying amount asset recoverable impairment loss would recognized estimated undiscounted future cas

In [0]:
## Write corpus to a file.

In [0]:
## Write data cleansed to the file.
with open('/content/drive/My Drive/final-project/corpus/corpus.txt', 'w+') as f:
    for line in corpus:
        f.write("%s\n" % line)