# Libraries

In [1]:
import os
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
import re
import plotly.plotly as py
import plotly.graph_objs as go
import requests
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import nltk
from nltk import word_tokenize, everygrams
from nltk.util import ngrams

# Obtain data

## Function Definitions

In [None]:
def _clean_filing_text(raw_html):
    '''
    Strip the html markup from one downloaded filing and flatten it to a single string.

    input:
    raw_html: the decoded text of one downloaded filing

    output:
    the non-empty, stripped lines of the document text joined by "%%",
    with every run of tabs inside a line replaced by one space
    '''
    lines = BeautifulSoup(raw_html, "lxml").text.split('\n')
    stripped_lines = (re.sub('[\t]+', ' ', line.strip()) for line in lines)
    ## filter(None, ...) drops the lines that are empty after stripping
    return '%%'.join(filter(None, stripped_lines))


def single_read_in_text(file_path,file_name):
    '''
    Download and clean the text behind every url listed in one csv file.

    input:
    file_path: path of the folder holding an individual csv file e.g. location of 21centfoxinc_sec_files.csv
    file_name: name of the file e.g 21centfoxinc_sec_files.csv
               (the csv must contain a 'sec_full_path' column with the urls to download)
    
    output:
    a single pandas dataframe with all the original columns from the input file
    + a 'text' column for the 'cleaned' data
    + a 'file_name' column holding file_name
    
    dev:
    1) can add more output columns for features
    2) can further clean the data 
    
    '''
    
    ## read in single csv to pandas 
    filings = pd.read_csv(os.path.join(file_path, file_name))
    clean_texts = []
    
    ## download each distinct url once: a url repeated in the csv would otherwise be
    ## fetched again and would multiply the rows in the merge below
    for url in filings['sec_full_path'].drop_duplicates():
        print(url)
        ## the with block closes the connection as soon as the body is read
        with request.urlopen(url) as response:
            raw_text = response.read().decode('utf8')
        
        ## clean the raw file by:
        ## 1. remove html tags 
        ## 2. break the text by "\n"
        ## 3. remove the spaces in the front of and after each "\n"
        clean_texts.append([url, _clean_filing_text(raw_text)])
        ## more columns, features, data cleanings can be put here
    
    ## merge back to the original read in dataframe 
    ## (naming the columns in the constructor also works when the csv has no rows)
    clean_texts_df = pd.DataFrame(clean_texts, columns = ['sec_full_path', 'text'])
    merged_df = pd.merge(left = filings, right = clean_texts_df, on = 'sec_full_path')
    
    ## add one more column to indicate the file name 
    merged_df['file_name'] = file_name
    return merged_df
def folder_read_in_text(folder_path, ext = '.csv'):
    '''
    Run single_read_in_text on every matching file in a folder and stack the results.

    input: 
    folder_path: path of the folder holding the csv files e.g. location of 21centfoxinc_sec_files.csv
    ext: extension of the files that are interested, default to be .csv 
    
    output:
    a single pandas dataframe with all the original columns from all the input files inside the folder
    + a column for the 'cleaned' data
    + a column for the file name 
    (an empty dataframe when no file in the folder has the ext extension)
    
    Utilize the single_read_in_text function 
    '''
    # Select only files with the ext extension; sorted() makes the row order
    # reproducible because os.listdir returns names in arbitrary order
    matching_files = sorted(name for name in os.listdir(folder_path)
                            if os.path.splitext(name)[1] == ext)
    
    ## Utilize the single_read_in_text function to process data
    frames = [single_read_in_text(file_path = folder_path, file_name = name)
              for name in matching_files]
    
    ## pd.concat raises on an empty list, so keep returning an empty dataframe in that case
    if not frames:
        return pd.DataFrame()
    
    ## a single concat replaces DataFrame.append in a loop, which was removed in
    ## pandas 2.0 and copied the accumulated frame on every iteration
    return pd.concat(frames, ignore_index=True)

## Function Call

In [None]:
# Single-file variant, kept for reference:
# single_read_in_text('C:/Users/li haoran/Desktop/New folder','21centfoxinc_sec_files.csv')

# Process every .csv file in the Inputs folder into one dataframe
working_file = folder_read_in_text(
    folder_path='C:/Users/li haoran/Documents/GitHub/Capstone_Vanguard_NLP_Prediction/Inputs',
    ext='.csv',
)

## Write Output

In [None]:
# Persist the downloaded + cleaned filings as a pipe-delimited file.
# index=False: the row index carries no information, and writing it makes every later
# read_csv of this file pick up a spurious 'Unnamed: 0' column.
working_file.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data.csv',
                    sep='|', encoding='utf-8', index=False)

# Further processing 

## Read documents

In [2]:
# Reload the pipe-delimited file of cleaned filings written by the "Obtain data" step
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(
    file_path,
    sep='|',
    encoding='utf-8',
)

## Further clean the data

In [None]:
# Vocabulary from the nltk 'words' corpus, used to keep only recognised English tokens
words = set(nltk.corpus.words.words())

# An empty 'text' value is read back from the csv as NaN, which the tokenizer cannot
# handle; treat it as an empty string so such rows yield '' instead of raising.
cleaned_data['only_eng_words'] = cleaned_data['text'].fillna('').apply(
    lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words))

# Every n-gram of the retained words, each joined into one space-separated string.
# NOTE(review): the column name says 1-3 but the maximum n-gram length passed is 4 -
# confirm which one is intended before renaming the column or changing the bound.
cleaned_data['evy_gram_1_3'] = cleaned_data['only_eng_words'].apply(
    lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 4)])

In [None]:
# Show the dataframe as the cell output to inspect the columns added above
cleaned_data

## Write Output

In [None]:
# Persist the dataframe with the extra text features as a pipe-delimited file.
# index=False: writing the row index would add one more 'Unnamed: 0' column each time
# the file is read back and written again.
cleaned_data.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated.csv',
                    sep='|', encoding='utf-8', index=False)

# Analysis

## Read documents

In [None]:
# Load the cleaned filings for the analysis cells below.
# NOTE(review): this reads cleaned_data.csv, not cleaned_data_updated.csv, so the
# 'only_eng_words' / 'evy_gram_1_3' columns are not present here - confirm this is intended.
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(
    file_path,
    sep='|',
    encoding='utf-8',
)

## EDA

In [None]:
# Per input file: how many non-null 'doccount' entries there are ('count')
# and what they add up to ('sum')
table1 = (
    cleaned_data[['file_name', 'doccount']]
    .groupby(['file_name'])
    .agg(['count', 'sum'])
    .reset_index()
)

# The aggregation produces two-level column labels; print them for reference
print(table1.columns.tolist())
table1

In [None]:
# One bar series per aggregate computed in table1, both plotted against the input file name
trace1 = go.Bar(x = table1['file_name'], 
                y = table1['doccount']['count'],
                name = 'filing count')

trace2 = go.Bar(x = table1['file_name'],
                y = table1['doccount']['sum'],
                name = 'document count')

data = [trace1,trace2]

# The title and axis labels describe what is plotted (counts per input file);
# the previous wording referred to emails and did not match the data.
iplot({
    "data":data,
    "layout":go.Layout(title="Filing and document counts per input file", 
                       xaxis={'title':'Input file'}, 
                       yaxis={'title':'Count'})})

In [None]:
# Per input file and form type: how many non-null 'doccount' entries there are ('count')
# and what they add up to ('sum'), ordered by file name
table2 = (
    cleaned_data[['file_name', 'form', 'doccount']]
    .groupby(['file_name', 'form'])
    .agg(['count', 'sum'])
    .sort_values(by=['file_name'])
    .reset_index()
)

# The aggregation produces two-level column labels; print them for reference
print(table2.columns.tolist())
table2

In [None]:
# Show the full dataframe as the cell output for a final visual check
cleaned_data