In [3]:
import glob
from tika import parser
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import FreqDist
from langdetect import detect
import pandas as pd
import string

In [6]:
# document tokenization after text pre-preprocessing to differentiate types then token based on type

input_path = 'C:\\test'
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

d = pd.DataFrame(columns=['document', 'sentences', 'words', 'pos'])
freq = FreqDist()

In [7]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    parsed = parser.from_file(inputfile)
    # Extract the text content from the parsed file
    psd = parsed["content"]
    # Convert double newlines into single newlines
    psd.replace('\n\n', '\n')
    return psd

In [8]:
def tokenmakerwords(inputfile):
    # Create tokens
    tokens = word_tokenize(inputfile)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    stripped = [w.strip(string.punctuation) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    text = nltk.Text(words)
    return text

In [9]:
# Language filter
def filterlanguage(inputfile):
    if detect(inputfile) != 'en':
        return True
    return False

In [10]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    dataframe['words'] = dataframe['sentences'].apply(word_tokenize)
    dataframe['words'] = dataframe['words'].apply(lambda x: [item.lower() for item in x])
    dataframe['words'] = dataframe['words'].apply(lambda x: [item.strip(string.punctuation) for item in x])
    dataframe['words'] = dataframe['words'].apply(lambda x: [item for item in x if item.isalpha()])
    dataframe['words'] = dataframe['words'].apply(lambda x: [item for item in x if item not in stop_words])
    dataframe['pos'] = dataframe['words'].apply(nltk.pos_tag)
    return dataframe

In [None]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    fname = os.path.splitext(filename)[0]
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Language detection algorithm is non - deterministic, which means that if you try to run it on a text which is
    # either too short or too ambiguous, you might get different results every time you run it
    if filterlanguage(parsed):
        continue

    tokenised = tokenmakerwords(parsed)
    fdist = nltk.FreqDist(tokenised)
    freq += fdist


    # Ignore any documents with <50 words
    if len(tokenised) < 100:
        continue

    # Sentence fragments
    sentences = sent_tokenize(parsed)

    # Build up dataframe
    temp = pd.Series([filename,sentences])
    print(temp)
    d = d.append(temp,ignore_index=True)
    print(d.head())
#     pd.concat([d, pd.DataFrame([[filename, sentences]])])

    # check for output folder and build if it doesn't exist
    # if not os.path.exists(input_path + '\\output\\' + fname):
    #     os.makedirs(input_path + '\\output\\' + fname)
    # # write out the text extracted by tika
    # f = open(input_path + '\\output\\' + '\\' + fname + '\\' + os.path.splitext(filename)[0] + '.txt', 'wb')
    # f.write(parsed.encode('utf-8').strip())
    # f.close()

031918comments2.authcheckdam.pdf
0                     031918comments2.authcheckdam.pdf
1    [\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
dtype: object
  document sentences words  pos  \
0      NaN       NaN   NaN  NaN   
1      NaN       NaN   NaN  NaN   
2      NaN       NaN   NaN  NaN   
3      NaN       NaN   NaN  NaN   
4      NaN       NaN   NaN  NaN   

                                                   0    1  
0                   031918comments2.authcheckdam.pdf  NaN  
1  [\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...  NaN  
2                 881961_CHECKLIST-2014_rev62714.pdf  NaN  
3  [\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...  NaN  
4                              DomesticWireFunds.pdf  NaN  
881961_CHECKLIST-2014_rev62714.pdf
0                   881961_CHECKLIST-2014_rev62714.pdf
1    [\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
dtype: object
  document sentences words  pos  \
0      NaN       NaN   NaN  NaN   
1      NaN       NaN   NaN  NaN   
2      N

In [20]:
d.head()

Unnamed: 0,document,sentences,words,pos


In [None]:
# Word tokenize the sentences, cleanup, parts of speech tagging
wordtokens(doc)


# TODO - clean up \n lines

01,-,Good,bank,statement.pdf
01,-,Good,bank,statement


2018-08-15 10:23:38,965 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


031918comments2.authcheckdam.pdf
031918comments2.authcheckdam
881961_CHECKLIST-2014_rev62714.pdf
881961_CHECKLIST-2014_rev62714
bank-reconciliation-example.pdf
bank-reconciliation-example
Bishop_Book_4_eBook.pdf
Bishop_Book_4_eBook
britain_mag_media_pack.pdf
britain_mag_media_pack
c07Chemicalreactions_WEB.pdf
c07Chemicalreactions_WEB
cassandra_thedefinitiveguide.pdf
cassandra_thedefinitiveguide
children result( Individula and together ) v1 7-3-16.docx
children result( Individula and together ) v1 7-3-16
Correct bank statement.pdf
Correct bank statement
D3S_EN.pdf
D3S_EN
datascienceatthecommandline.pdf
datascienceatthecommandline
dis5790_parrainage_mmf_a5_4.pdf
dis5790_parrainage_mmf_a5_4
DomesticWireFunds.pdf
DomesticWireFunds
DTM_AprMay_2018.pdf
DTM_AprMay_2018
dubai 1 2.pdf
dubai 1 2
Early social interaction project for childen with autism   begining in the second year of life (1) 2.pdf
Early social interaction project for childen with autism   begining in the second year of life (1)

In [2]:
import pandas as pd
d = pd.DataFrame(columns=['document', 'sentences', 'words', 'pos'])

