In [1]:
import glob
import os
import re
import string
from collections import defaultdict

import nltk
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import RegexpTagger
from nltk.tag import UnigramTagger
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tika import parser

In [2]:
# Regex fallback tagger: patterns are tried in order and the first one that
# matches a token supplies its tag.
regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers (literal decimal point)
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
# Unigram tagger trained on the tagged Brown "news" sentences; it backs off to
# the regex tagger above.
brown_train = brown.tagged_sents(categories='news')
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)

In [3]:
# Define location of files and keywords - TODO parameterise these
# Two stemmers are kept side by side: Snowball (used by tokenize_and_stem) and
# Porter (used by wordtokens).
stemmer = SnowballStemmer('english')
pstemmer = PorterStemmer()

# Folder scanned by the main loop
input_path = 'C:\\test'
# NLTK English stop words, as a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Terms the scoring functions look for (matching is case-sensitive)
keywords = ['IS', 'terrorism', 'bomb', 'is', 'the', 'consortium']
# (word, tag) pairs for the keywords, raw and after Porter / Snowball stemming.
# NOTE(review): these tags come from the Brown-trained unigram tagger, while
# wordtokens() tags documents with nltk.pos_tag; the two appear to use different
# tagsets (e.g. 'AT' vs 'DT' for articles), so some pairs may never match in
# scoringpos() - confirm.
# NOTE(review): stemkeywords and snowstemkeywords are not referenced by any cell
# visible in this notebook.
poskeywords = unigram_tagger.tag(keywords)
stemkeywords = unigram_tagger.tag([pstemmer.stem(t) for t in keywords])
snowstemkeywords = unigram_tagger.tag([stemmer.stem(t) for t in keywords])

# Set up Dataframe (one row per accepted document, filled by the main loop)
d = pd.DataFrame()

# Create a list to use for clustering (raw text of each accepted document)
doclist = []

In [4]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    """Extract the text of a file with Tika and collapse all whitespace runs.

    Args:
        inputfile: Path of the file to parse.

    Returns:
        The extracted text with every run of whitespace replaced by a single
        space. An empty string is returned when Tika yields no text content
        (for example an image-only PDF), instead of failing inside ``re.sub``.
    """
    parsed = parser.from_file(inputfile)
    # "content" may be missing or None when nothing could be extracted
    content = parsed.get("content") or ""
    return re.sub(r'\s+', ' ', content)

In [5]:
# Return NLTK text from the document - used to filter out short documents but may
# also be used for further processing in future dev
def tokenmakerwords(inputfile):
    """Tokenise document text into cleansed, lower-cased words.

    Args:
        inputfile: The document text (a string, despite the parameter name).

    Returns:
        An ``nltk.Text`` of the tokens that remain after lower-casing,
        stripping leading/trailing punctuation, dropping non-alphabetic
        tokens and dropping stop words.
    """
    words = []
    for token in word_tokenize(inputfile):
        word = token.lower().strip(string.punctuation)
        # keep purely alphabetic tokens that are not stop words
        if word.isalpha() and word not in stop_words:
            words.append(word)
    return nltk.Text(words)

In [6]:
# Language filter
def filterlanguage(inputfile):
    """Tell whether a document should be skipped because it is not English.

    Args:
        inputfile: The document text (a string, despite the parameter name).

    Returns:
        True when the text is empty, when its language cannot be detected, or
        when the detected language is not ``'en'``; False for English text.
    """
    # Nothing to detect on: treat as "not English" so the caller skips it
    if not inputfile or not inputfile.strip():
        return True
    try:
        return detect(inputfile) != 'en'
    except LangDetectException:
        # raised when the text has no usable features (e.g. digits only)
        return True

In [43]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    """Add token, POS, cleansed-word and stem columns to ``dataframe`` in place.

    Reads the ``sentences`` column (a list of sentence strings per row) and
    writes, in this order: ``words``, ``pos``, ``allwords``, ``mfreq``,
    ``poslist``, ``mfreqpos``, ``stemwords`` and ``mfreqstem``.

    Returns:
        The same dataframe object that was passed in.
    """
    def flatten(per_sentence):
        # one flat list out of a list of per-sentence lists
        return [entry for sentence in per_sentence for entry in sentence]

    def is_content_word(token):
        return token.isalpha() and token not in stop_words

    def cleansed_words(per_sentence):
        # strip surrounding punctuation and lower-case before filtering
        normalised = (token.strip(string.punctuation).lower() for token in flatten(per_sentence))
        return [token for token in normalised if is_content_word(token)]

    def cleansed_stems(per_sentence):
        # Porter-stem the raw tokens, then apply the same filter
        stems = (pstemmer.stem(token) for token in flatten(per_sentence))
        return [stem for stem in stems if is_content_word(stem)]

    def tokenise_sentences(sentences):
        return [word_tokenize(sentence) for sentence in sentences]

    def tag_sentences(per_sentence):
        return [nltk.pos_tag(tokens) for tokens in per_sentence]

    dataframe['words'] = dataframe['sentences'].apply(tokenise_sentences)
    dataframe['pos'] = dataframe['words'].apply(tag_sentences)

    dataframe['allwords'] = dataframe['words'].apply(cleansed_words)
    dataframe['mfreq'] = dataframe['allwords'].apply(nltk.FreqDist)

    dataframe['poslist'] = dataframe['pos'].apply(flatten)
    dataframe['mfreqpos'] = dataframe['poslist'].apply(nltk.FreqDist)

    dataframe['stemwords'] = dataframe['words'].apply(cleansed_stems)
    dataframe['mfreqstem'] = dataframe['stemwords'].apply(nltk.FreqDist)

    return dataframe

In [8]:
# Score documents based on cleansed dataset - so should discount stopwords and be sensible
def scoring(dataframe, terms=None, weight=0.75):
    """Add a weighted keyword-frequency score to each document, in place.

    For every term, each row's ``score`` is increased by ``weight`` times the
    term's count in that row's ``mfreq`` frequency table. Matching is exact
    (case-sensitive) against the cleansed words. A summary of which documents
    matched which term is printed.

    Args:
        dataframe: Frame with ``document`` and ``mfreq`` columns. A ``score``
            column is created (as 0.0) when missing; an existing one is kept
            and converted to float so fractional increments are stored.
        terms: Iterable of terms to look for. Defaults to the notebook-level
            ``keywords`` list.
        weight: Multiplier applied to each term count (default 0.75).

    Returns:
        The same dataframe object that was passed in.
    """
    if terms is None:
        terms = keywords

    # Scores are fractional, so the column must be float rather than int
    if 'score' in dataframe.columns:
        dataframe['score'] = dataframe['score'].astype(float)
    else:
        dataframe['score'] = 0.0

    word_matches = defaultdict(list)
    for word in terms:
        for idx, row in dataframe.iterrows():
            # frequency-table lookup instead of scanning the whole word list
            count = row['mfreq'].get(word, 0)
            if not count:
                continue
            dataframe.loc[idx, 'score'] += count * weight
            if row['document'] not in word_matches[word]:
                word_matches[word].append(row['document'])
    print('\n')
    print('The following keyword hits occurred:')

    for key, val in word_matches.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

    return dataframe

In [23]:
# Score documents based on pos
def scoringpos(dataframe, tagged_terms=None):
    """Add a part-of-speech-aware keyword score to each document, in place.

    For every ``(word, tag)`` pair, each row's ``score`` is increased by the
    number of times that exact pair occurs in the row's ``mfreqpos``
    frequency table. Matching is exact on both word (case-sensitive) and tag.
    A summary of which documents matched which word is printed.

    Args:
        dataframe: Frame with ``document`` and ``mfreqpos`` columns. A
            ``score`` column is created (as 0.0) when missing; an existing
            one is kept and converted to float.
        tagged_terms: Iterable of ``(word, tag)`` pairs. Defaults to the
            notebook-level ``poskeywords`` list.

    Returns:
        The same dataframe object that was passed in.
    """
    if tagged_terms is None:
        tagged_terms = poskeywords

    if 'score' in dataframe.columns:
        dataframe['score'] = dataframe['score'].astype(float)
    else:
        dataframe['score'] = 0.0

    word_matches = defaultdict(list)
    for (w1, t1) in tagged_terms:
        for idx, row in dataframe.iterrows():
            # frequency-table lookup instead of scanning the whole tagged list
            count = row['mfreqpos'].get((w1, t1), 0)
            if not count:
                continue
            dataframe.loc[idx, 'score'] += count
            if row['document'] not in word_matches[w1]:
                word_matches[w1].append(row['document'])
    print('\n')
    print('The following keyword hits occurred:')

    for key, val in word_matches.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

    return dataframe

In [10]:
# Find keywords using POS
def contextkeywords(dataframe, keyword='IS', tag='NNP'):
    """Print every tagged sentence that contains ``keyword`` with POS ``tag``.

    Each matching sentence is printed once, even when the keyword occurs in
    it several times.

    Args:
        dataframe: Frame whose ``pos`` column holds, per row, a list of
            sentences, each a list of ``(word, tag)`` pairs.
        keyword: Word to look for (case-sensitive). Defaults to ``'IS'``.
        tag: Part-of-speech tag the word must carry. Defaults to ``'NNP'``.

    Returns:
        The same dataframe object that was passed in.
    """
    print('\n')
    print('Here are the keywords in context: ')
    # Search for the keyword with the requested tag (by default IS as a noun)
    for _, row in dataframe.iterrows():
        for tagged_sentence in row['pos']:
            if any(w1 == keyword and t1 == tag for (w1, t1) in tagged_sentence):
                print(tagged_sentence)
                print('\n')

    return dataframe

In [11]:
# Sort using a dirty model
def dirtyscoring(dataframe, terms=None):
    """Score documents on the raw (uncleansed) tokens, in place.

    Resets ``score2`` to 0, flattens the per-sentence ``words`` column into
    ``w2``, builds its frequency table in ``mfreq2`` and adds each term's raw
    count to ``score2``. Matching is exact (case-sensitive). A summary of
    which documents matched which term is printed.

    Args:
        dataframe: Frame with ``document`` and ``words`` columns.
        terms: Iterable of terms to look for. Defaults to the notebook-level
            ``keywords`` list.

    Returns:
        The same dataframe object that was passed in.
    """
    if terms is None:
        terms = keywords

    dataframe['score2'] = 0
    dataframe['w2'] = dataframe['words'].apply(
        lambda per_sentence: [token for sentence in per_sentence for token in sentence])
    dataframe['mfreq2'] = dataframe['w2'].apply(nltk.FreqDist)

    word_matches = defaultdict(list)
    for word in terms:
        for idx, row in dataframe.iterrows():
            # frequency-table lookup instead of scanning the whole token list
            count = row['mfreq2'].get(word, 0)
            if not count:
                continue
            dataframe.loc[idx, 'score2'] += count
            if row['document'] not in word_matches[word]:
                word_matches[word].append(row['document'])
    print('\n')
    print('The following keyword hits occurred in the uncleansed data:')

    for key, val in word_matches.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

    return dataframe

In [12]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their Snowball stems.

    Tokens that contain no ASCII letter (numbers, bare punctuation) are
    dropped before stemming.
    """
    candidates = (
        token
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
    )
    return [stemmer.stem(token) for token in candidates if re.search('[a-zA-Z]', token)]

In [13]:
# Cluster documents and demonstrate prediction
# TODO - calculate ideal k value
def clustering(documents, true_k=5, random_state=None):
    """Cluster documents with TF-IDF + K-Means and print a small prediction demo.

    Prints the ten highest-weighted terms of each cluster centroid, then the
    cluster predicted for one "bad" and one "good" example sentence.

    Args:
        documents: List of document texts to cluster.
        true_k: Requested number of clusters (default 5). It is capped at the
            number of documents, since K-Means cannot fit more clusters than
            samples.
        random_state: Optional seed passed to ``KMeans`` for reproducible
            clusters. The default ``None`` leaves the run unseeded.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=0.2, use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    # Vectorise the documents that were passed in, not a notebook-level list
    X = vectorizer.fit_transform(documents)

    n_clusters = min(true_k, X.shape[0])
    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1,
                   random_state=random_state)
    model.fit(X)

    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    # get_feature_names_out is the current scikit-learn name; fall back to the
    # older get_feature_names where it does not exist
    get_terms = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    terms = get_terms()
    for i in range(n_clusters):
        # one line per cluster: the label followed by its top terms
        top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :10])
        print("Cluster %d:" % i + top_terms)

    print("\n")
    print("Prediction")

    Y = vectorizer.transform(["this is a document about islamic state "
                              "and terrorists and bombs IS jihad terrorism isil"])
    prediction = model.predict(Y)
    print("A document with 'bad' terms would be in:")
    print(prediction)

    Y = vectorizer.transform(["completely innocent text just about kittens and puppies"])
    prediction = model.predict(Y)
    print("A document with 'good' terms would be in:")
    print(prediction)

In [14]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic of a fitted model with its highest-weighted features.

    Args:
        model: Object exposing ``components_``, one weight vector per topic.
        feature_names: Sequence mapping a feature index to its name.
        no_top_words: How many of the highest-weighted features to print.
    """
    topic_idx = 0
    for weights in model.components_:
        # indices ordered from highest to lowest weight, truncated to the top N
        ranked = weights.argsort()[::-1][:no_top_words]
        print("Topic %d:" % topic_idx)
        print(" ".join(feature_names[i] for i in ranked))
        topic_idx += 1

In [15]:
def nmflda(documentlist, no_topics=5, no_top_words=10, no_features=1000):
    """Fit NMF and LDA topic models on the documents and print their topics.

    Args:
        documentlist: List of document texts.
        no_topics: Number of topics for both models (default 5).
        no_top_words: Number of words printed per topic (default 10).
        no_features: Vocabulary size cap for both vectorizers (default 1000).
    """
    def feature_names_of(vectorizer):
        # get_feature_names_out is the current scikit-learn name; fall back to
        # the older get_feature_names where it does not exist
        getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
        return getter()

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(documentlist)
    tfidf_feature_names = feature_names_of(tfidf_vectorizer)

    # LDA can only use raw term counts because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(documentlist)
    tf_feature_names = feature_names_of(tf_vectorizer)

    # Run NMF
    # NOTE(review): newer scikit-learn releases replace `alpha` with
    # `alpha_W`/`alpha_H` - confirm against the installed version.
    nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

    # Run LDA (n_components is the supported spelling of the former n_topics)
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0).fit(tf)

    print("NMF Topics: ")
    display_topics(nmf, tfidf_feature_names, no_top_words)
    print("LDA Topics: ")
    display_topics(lda, tf_feature_names, no_top_words)

In [16]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
print('Starting processing - the following files have been processed:')

# Start from a clean slate so that re-running this cell does not duplicate documents
doclist.clear()
doc_names = []
doc_sentences = []

for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Language detection algorithm is non - deterministic, which means that if you try to run it on a text which is
    # either too short or too ambiguous, you might get different results every time you run it
    if filterlanguage(parsed):
        continue

    tokenised = tokenmakerwords(parsed)

    # Ignore any documents with fewer than 100 cleansed words
    if len(tokenised) < 100:
        continue

    # Create doclist for use in topic modelling
    doclist.append(parsed)
    # Record the file name together with its sentence fragments
    doc_names.append(filename)
    doc_sentences.append(sent_tokenize(parsed))

# Build the dataframe in one go. This avoids row-by-row DataFrame.append (slow,
# and removed in pandas 2) and still yields the two named columns when no
# document was accepted.
d = pd.DataFrame({'document': doc_names, 'sentences': doc_sentences})

Starting processing - the following files have been processed:
031918comments2.authcheckdam.pdf
881961_CHECKLIST-2014_rev62714.pdf
cassandra_thedefinitiveguide.pdf
children result( Individula and together ) v1 7-3-16.docx
datascienceatthecommandline.pdf
DomesticWireFunds.pdf
dubai 1 2.pdf
Early social interaction project for childen with autism   begining in the second year of life (1) 2.pdf
eng[1].htm
Kaplan, Andreas - Users of the world, unite.pdf
Kuwait job.docx
learningspark.pdf
Memes-and-the-evolution-of-religion-We-need-memetics-too.pdf
MSAB_License_Management_Brazilian Portuguese.pdf
MSAB_License_Management_Chinese.pdf
MSAB_License_Management_English.pdf
MSAB_License_Management_French.pdf
MSAB_License_Management_German.pdf
MSAB_License_Management_Japanese.pdf
MSAB_License_Management_Russian.pdf
MSAB_License_Management_Spanish.pdf
MSAB_License_Management_Turkish.pdf
Order Confirmation.pdf
Orderconf.pdf
P857_ImportantInformation-TermsAndConditions.pdf
Patient+Type+2+opt-out+letter

In [36]:
# Word tokenize the sentences, cleanup, parts of speech tagging
wordtokens(d)
# Start every document at 0.0: a float column, because scoring() adds
# fractional (0.75-weighted) counts to it
d['score'] = 0.0
d.head()

TypeError: unhashable type: 'list'

In [20]:
# Add scoring: increases d['score'] in place from the cleansed keyword counts
# and prints which documents each keyword was found in
# TODO - use POS/stemming to make better counts of words, deal with cases
scoring(d)



The following keyword hits occurred:
Keyword: terrorism. Found in these documents: 
['Religion-Security-Global-Uncertainties.pdf', 'START_AM2014_QuickFireTwo.pdf', 'START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']
Keyword: bomb. Found in these documents: 
['Religion-Security-Global-Uncertainties.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf']
Keyword: consortium. Found in these documents: 
['START_AM2014_QuickFireTwo.pdf', 'START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']


Unnamed: 0,document,sentences,words,pos,allwords,mfreq,poslist,mfreqpos,stemwords,stempos,stemposlist,mfreqstempos,mfreqstem,score,scorepos
0,031918comments2.authcheckdam.pdf,[ Section of Taxation Suite 400 1050 Connectic...,"[[Section, of, Taxation, Suite, 400, 1050, Con...","[[(Section, NN), (of, IN), (Taxation, NNP), (S...","[section, taxation, suite, connecticut, avenue...","{'section': 25, 'taxation': 9, 'suite': 1, 'co...","[(Section, NN), (of, IN), (Taxation, NNP), (Su...","{('Section', 'NN'): 12, ('of', 'IN'): 188, ('T...","[section, of, taxat, suit, 400, 1050, connecti...","[[(s, NN), (e, NN), (c, VBP), (t, NN), (i, NN)...","[(s, NN), (e, NN), (c, VBP), (t, NN), (i, NN),...","{('s', 'NN'): 832, ('e', 'NN'): 2246, ('c', 'V...","{'section': 25, 'of': 188, 'taxat': 9, 'suit':...",0.0,0
1,881961_CHECKLIST-2014_rev62714.pdf,[ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...,"[[CHECKLIST-2014_rev62714, ORDER, CONFIRMATION...","[[(CHECKLIST-2014_rev62714, JJ), (ORDER, NNP),...","[order, confirmation, checklistorder, confirma...","{'order': 11, 'confirmation': 2, 'checklistord...","[(CHECKLIST-2014_rev62714, JJ), (ORDER, NNP), ...","{('CHECKLIST-2014_rev62714', 'JJ'): 1, ('ORDER...","[checklist-2014_rev62714, order, confirm, chec...","[[(c, NNS), (h, VBP), (e, JJ), (c, NNS), (k, V...","[(c, NNS), (h, VBP), (e, JJ), (c, NNS), (k, VB...","{('c', 'NNS'): 40, ('h', 'VBP'): 16, ('e', 'JJ...","{'checklist-2014_rev62714': 1, 'order': 12, 'c...",0.0,0
2,cassandra_thedefinitiveguide.pdf,[ Cassandra: The Definitive Guide Jeff Carpent...,"[[Cassandra, :, The, Definitive, Guide, Jeff, ...","[[(Cassandra, NN), (:, :), (The, DT), (Definit...","[cassandra, definitive, guide, jeff, carpenter...","{'cassandra': 1309, 'definitive': 11, 'guide':...","[(Cassandra, NN), (:, :), (The, DT), (Definiti...","{('Cassandra', 'NN'): 2, (':', ':'): 1430, ('T...","[cassandra, :, the, definit, guid, jeff, carpe...","[[(c, VB), (a, DT), (s, JJ), (s, NN), (a, DT),...","[(c, VB), (a, DT), (s, JJ), (s, NN), (a, DT), ...","{('c', 'VB'): 3639, ('a', 'DT'): 49160, ('s', ...","{'cassandra': 1304, ':': 1430, 'the': 6499, 'd...",0.0,0
3,children result( Individula and together ) v1 ...,[ The impact of adult interactive style on the...,"[[The, impact, of, adult, interactive, style, ...","[[(The, DT), (impact, NN), (of, IN), (adult, N...","[impact, adult, interactive, style, spontaneou...","{'impact': 12, 'adult': 5, 'interactive': 5, '...","[(The, DT), (impact, NN), (of, IN), (adult, NN...","{('The', 'DT'): 45, ('impact', 'NN'): 12, ('of...","[the, impact, of, adult, interact, style, on, ...","[[(t, NN), (h, NN), (e, NN)], [(i, JJ), (m, VB...","[(t, NN), (h, NN), (e, NN), (i, JJ), (m, VBP),...","{('t', 'NN'): 3124, ('h', 'NN'): 1022, ('e', '...","{'the': 297, 'impact': 12, 'of': 279, 'adult':...",0.0,0
4,datascienceatthecommandline.pdf,[ Data Science at the Command Line DATA /DATA ...,"[[Data, Science, at, the, Command, Line, DATA,...","[[(Data, NNP), (Science, NN), (at, IN), (the, ...","[data, science, command, line, data, data, sci...","{'data': 738, 'science': 176, 'command': 342, ...","[(Data, NNP), (Science, NN), (at, IN), (the, D...","{('Data', 'NNP'): 224, ('Science', 'NN'): 5, (...","[data, scienc, at, the, command, line, data, /...","[[(d, VB), (a, DT), (t, NN), (a, DT)], [(s, JJ...","[(d, VB), (a, DT), (t, NN), (a, DT), (s, JJ), ...","{('d', 'VB'): 1077, ('a', 'DT'): 18743, ('t', ...","{'data': 735, 'scienc': 175, 'at': 153, 'the':...",0.0,0
5,DomesticWireFunds.pdf,[ ►►►►►PLEASE PRINT◄◄◄◄◄ WIRE TRANSFER PAYMENT...,"[[►►►►►PLEASE, PRINT◄◄◄◄◄, WIRE, TRANSFER, PAY...","[[(►►►►►PLEASE, NN), (PRINT◄◄◄◄◄, NNP), (WIRE,...","[wire, transfer, payment, order, confirmation,...","{'wire': 1, 'transfer': 3, 'payment': 3, 'orde...","[(►►►►►PLEASE, NN), (PRINT◄◄◄◄◄, NNP), (WIRE, ...","{('►►►►►PLEASE', 'NN'): 1, ('PRINT◄◄◄◄◄', 'NNP...","[►►►►►pleas, print◄◄◄◄◄, wire, transfer, payme...","[[(►, JJ), (►, NNP), (►, NNP), (►, NNP), (►, N...","[(►, JJ), (►, NNP), (►, NNP), (►, NNP), (►, NN...","{('►', 'JJ'): 1, ('►', 'NNP'): 4, ('p', 'NN'):...","{'►►►►►pleas': 1, 'print◄◄◄◄◄': 1, 'wire': 1, ...",0.0,0
6,dubai 1 2.pdf,[ Arab Social Media Report: Civil Movements: T...,"[[Arab, Social, Media, Report, :, Civil, Movem...","[[(Arab, JJ), (Social, NNP), (Media, NNP), (Re...","[arab, social, media, report, civil, movements...","{'arab': 121, 'social': 84, 'media': 65, 'repo...","[(Arab, JJ), (Social, NNP), (Media, NNP), (Rep...","{('Arab', 'JJ'): 96, ('Social', 'NNP'): 25, ('...","[arab, social, media, report, :, civil, moveme...","[[(a, DT), (r, NN), (a, DT), (b, NN)], [(s, NN...","[(a, DT), (r, NN), (a, DT), (b, NN), (s, NN), ...","{('a', 'DT'): 3916, ('r', 'NN'): 2959, ('b', '...","{'arab': 142, 'social': 84, 'media': 65, 'repo...",0.0,0
7,Early social interaction project for childen w...,"[ 03., Wetherby p67 TECSE 26:2 67–82 (2006) 67...","[[03, .], [Wetherby, p67, TECSE, 26:2, 67–82, ...","[[(03, CD), (., .)], [(Wetherby, NNP), (p67, N...","[wetherby, tecse, early, social, interaction, ...","{'wetherby': 31, 'tecse': 1, 'early': 76, 'soc...","[(03, CD), (., .), (Wetherby, NNP), (p67, NN),...","{('03', 'CD'): 1, ('.', '.'): 538, ('Wetherby'...","[03, ., wetherbi, p67, tecs, 26:2, 67–82, (, 2...","[[(0, CD), (3, CD)], [(., .)], [(w, NN), (e, N...","[(0, CD), (3, CD), (., .), (w, NN), (e, NN), (...","{('0', 'CD'): 475, ('3', 'CD'): 223, ('.', '.'...","{'03': 1, '.': 538, 'wetherbi': 31, 'p67': 1, ...",0.0,0
8,eng[1].htm,"[ The Secrets of the Qur’an’s Miracles, Websit...","[[The, Secrets, of, the, Qur, ’, an, ’, s, Mir...","[[(The, DT), (Secrets, NNS), (of, IN), (the, D...","[secrets, qur, miracles, website, abduldaem, s...","{'secrets': 11, 'qur': 8, 'miracles': 10, 'web...","[(The, DT), (Secrets, NNS), (of, IN), (the, DT...","{('The', 'DT'): 33, ('Secrets', 'NNS'): 2, ('o...","[the, secret, of, the, qur, ’, an, ’, s, mirac...","[[(t, NN), (h, NN), (e, NN)], [(s, NN), (e, NN...","[(t, NN), (h, NN), (e, NN), (s, NN), (e, NN), ...","{('t', 'NN'): 344, ('h', 'NN'): 178, ('e', 'NN...","{'the': 81, 'secret': 14, 'of': 50, 'qur': 8, ...",0.0,0
9,"Kaplan, Andreas - Users of the world, unite.pdf","[ Author's personal copy Users of the world, u...","[[Author, 's, personal, copy, Users, of, the, ...","[[(Author, NN), ('s, POS), (personal, JJ), (co...","[author, personal, copy, users, world, unite, ...","{'author': 12, 'personal': 22, 'copy': 11, 'us...","[(Author, NN), ('s, POS), (personal, JJ), (cop...","{('Author', 'NN'): 2, (''s', 'POS'): 10, ('per...","[author, 's, person, copi, user, of, the, worl...","[[(a, DT), (u, JJ), (t, NN), (h, NN), (o, NN),...","[(a, DT), (u, JJ), (t, NN), (h, NN), (o, NN), ...","{('a', 'DT'): 2770, ('u', 'JJ'): 758, ('t', 'N...","{'author': 12, ''s': 10, 'person': 28, 'copi':...",0.0,0


In [21]:
# Add the POS-aware keyword counts to d['score'] (on top of what scoring() added)
scoringpos(d)



The following keyword hits occurred:
Keyword: terrorism. Found in these documents: 
['Religion-Security-Global-Uncertainties.pdf', 'START_AM2014_QuickFireTwo.pdf', 'START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']
Keyword: bomb. Found in these documents: 
['Religion-Security-Global-Uncertainties.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf']


Unnamed: 0,document,sentences,words,pos,allwords,mfreq,poslist,mfreqpos,stemwords,stempos,stemposlist,mfreqstempos,mfreqstem,score,scorepos
0,031918comments2.authcheckdam.pdf,[ Section of Taxation Suite 400 1050 Connectic...,"[[Section, of, Taxation, Suite, 400, 1050, Con...","[[(Section, NN), (of, IN), (Taxation, NNP), (S...","[section, taxation, suite, connecticut, avenue...","{'section': 25, 'taxation': 9, 'suite': 1, 'co...","[(Section, NN), (of, IN), (Taxation, NNP), (Su...","{('Section', 'NN'): 12, ('of', 'IN'): 188, ('T...","[section, of, taxat, suit, 400, 1050, connecti...","[[(s, NN), (e, NN), (c, VBP), (t, NN), (i, NN)...","[(s, NN), (e, NN), (c, VBP), (t, NN), (i, NN),...","{('s', 'NN'): 832, ('e', 'NN'): 2246, ('c', 'V...","{'section': 25, 'of': 188, 'taxat': 9, 'suit':...",0.0,0
1,881961_CHECKLIST-2014_rev62714.pdf,[ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...,"[[CHECKLIST-2014_rev62714, ORDER, CONFIRMATION...","[[(CHECKLIST-2014_rev62714, JJ), (ORDER, NNP),...","[order, confirmation, checklistorder, confirma...","{'order': 11, 'confirmation': 2, 'checklistord...","[(CHECKLIST-2014_rev62714, JJ), (ORDER, NNP), ...","{('CHECKLIST-2014_rev62714', 'JJ'): 1, ('ORDER...","[checklist-2014_rev62714, order, confirm, chec...","[[(c, NNS), (h, VBP), (e, JJ), (c, NNS), (k, V...","[(c, NNS), (h, VBP), (e, JJ), (c, NNS), (k, VB...","{('c', 'NNS'): 40, ('h', 'VBP'): 16, ('e', 'JJ...","{'checklist-2014_rev62714': 1, 'order': 12, 'c...",0.0,0
2,cassandra_thedefinitiveguide.pdf,[ Cassandra: The Definitive Guide Jeff Carpent...,"[[Cassandra, :, The, Definitive, Guide, Jeff, ...","[[(Cassandra, NN), (:, :), (The, DT), (Definit...","[cassandra, definitive, guide, jeff, carpenter...","{'cassandra': 1309, 'definitive': 11, 'guide':...","[(Cassandra, NN), (:, :), (The, DT), (Definiti...","{('Cassandra', 'NN'): 2, (':', ':'): 1430, ('T...","[cassandra, :, the, definit, guid, jeff, carpe...","[[(c, VB), (a, DT), (s, JJ), (s, NN), (a, DT),...","[(c, VB), (a, DT), (s, JJ), (s, NN), (a, DT), ...","{('c', 'VB'): 3639, ('a', 'DT'): 49160, ('s', ...","{'cassandra': 1304, ':': 1430, 'the': 6499, 'd...",0.0,0
3,children result( Individula and together ) v1 ...,[ The impact of adult interactive style on the...,"[[The, impact, of, adult, interactive, style, ...","[[(The, DT), (impact, NN), (of, IN), (adult, N...","[impact, adult, interactive, style, spontaneou...","{'impact': 12, 'adult': 5, 'interactive': 5, '...","[(The, DT), (impact, NN), (of, IN), (adult, NN...","{('The', 'DT'): 45, ('impact', 'NN'): 12, ('of...","[the, impact, of, adult, interact, style, on, ...","[[(t, NN), (h, NN), (e, NN)], [(i, JJ), (m, VB...","[(t, NN), (h, NN), (e, NN), (i, JJ), (m, VBP),...","{('t', 'NN'): 3124, ('h', 'NN'): 1022, ('e', '...","{'the': 297, 'impact': 12, 'of': 279, 'adult':...",0.0,0
4,datascienceatthecommandline.pdf,[ Data Science at the Command Line DATA /DATA ...,"[[Data, Science, at, the, Command, Line, DATA,...","[[(Data, NNP), (Science, NN), (at, IN), (the, ...","[data, science, command, line, data, data, sci...","{'data': 738, 'science': 176, 'command': 342, ...","[(Data, NNP), (Science, NN), (at, IN), (the, D...","{('Data', 'NNP'): 224, ('Science', 'NN'): 5, (...","[data, scienc, at, the, command, line, data, /...","[[(d, VB), (a, DT), (t, NN), (a, DT)], [(s, JJ...","[(d, VB), (a, DT), (t, NN), (a, DT), (s, JJ), ...","{('d', 'VB'): 1077, ('a', 'DT'): 18743, ('t', ...","{'data': 735, 'scienc': 175, 'at': 153, 'the':...",0.0,0
5,DomesticWireFunds.pdf,[ ►►►►►PLEASE PRINT◄◄◄◄◄ WIRE TRANSFER PAYMENT...,"[[►►►►►PLEASE, PRINT◄◄◄◄◄, WIRE, TRANSFER, PAY...","[[(►►►►►PLEASE, NN), (PRINT◄◄◄◄◄, NNP), (WIRE,...","[wire, transfer, payment, order, confirmation,...","{'wire': 1, 'transfer': 3, 'payment': 3, 'orde...","[(►►►►►PLEASE, NN), (PRINT◄◄◄◄◄, NNP), (WIRE, ...","{('►►►►►PLEASE', 'NN'): 1, ('PRINT◄◄◄◄◄', 'NNP...","[►►►►►pleas, print◄◄◄◄◄, wire, transfer, payme...","[[(►, JJ), (►, NNP), (►, NNP), (►, NNP), (►, N...","[(►, JJ), (►, NNP), (►, NNP), (►, NNP), (►, NN...","{('►', 'JJ'): 1, ('►', 'NNP'): 4, ('p', 'NN'):...","{'►►►►►pleas': 1, 'print◄◄◄◄◄': 1, 'wire': 1, ...",0.0,0
6,dubai 1 2.pdf,[ Arab Social Media Report: Civil Movements: T...,"[[Arab, Social, Media, Report, :, Civil, Movem...","[[(Arab, JJ), (Social, NNP), (Media, NNP), (Re...","[arab, social, media, report, civil, movements...","{'arab': 121, 'social': 84, 'media': 65, 'repo...","[(Arab, JJ), (Social, NNP), (Media, NNP), (Rep...","{('Arab', 'JJ'): 96, ('Social', 'NNP'): 25, ('...","[arab, social, media, report, :, civil, moveme...","[[(a, DT), (r, NN), (a, DT), (b, NN)], [(s, NN...","[(a, DT), (r, NN), (a, DT), (b, NN), (s, NN), ...","{('a', 'DT'): 3916, ('r', 'NN'): 2959, ('b', '...","{'arab': 142, 'social': 84, 'media': 65, 'repo...",0.0,0
7,Early social interaction project for childen w...,"[ 03., Wetherby p67 TECSE 26:2 67–82 (2006) 67...","[[03, .], [Wetherby, p67, TECSE, 26:2, 67–82, ...","[[(03, CD), (., .)], [(Wetherby, NNP), (p67, N...","[wetherby, tecse, early, social, interaction, ...","{'wetherby': 31, 'tecse': 1, 'early': 76, 'soc...","[(03, CD), (., .), (Wetherby, NNP), (p67, NN),...","{('03', 'CD'): 1, ('.', '.'): 538, ('Wetherby'...","[03, ., wetherbi, p67, tecs, 26:2, 67–82, (, 2...","[[(0, CD), (3, CD)], [(., .)], [(w, NN), (e, N...","[(0, CD), (3, CD), (., .), (w, NN), (e, NN), (...","{('0', 'CD'): 475, ('3', 'CD'): 223, ('.', '.'...","{'03': 1, '.': 538, 'wetherbi': 31, 'p67': 1, ...",0.0,0
8,eng[1].htm,"[ The Secrets of the Qur’an’s Miracles, Websit...","[[The, Secrets, of, the, Qur, ’, an, ’, s, Mir...","[[(The, DT), (Secrets, NNS), (of, IN), (the, D...","[secrets, qur, miracles, website, abduldaem, s...","{'secrets': 11, 'qur': 8, 'miracles': 10, 'web...","[(The, DT), (Secrets, NNS), (of, IN), (the, DT...","{('The', 'DT'): 33, ('Secrets', 'NNS'): 2, ('o...","[the, secret, of, the, qur, ’, an, ’, s, mirac...","[[(t, NN), (h, NN), (e, NN)], [(s, NN), (e, NN...","[(t, NN), (h, NN), (e, NN), (s, NN), (e, NN), ...","{('t', 'NN'): 344, ('h', 'NN'): 178, ('e', 'NN...","{'the': 81, 'secret': 14, 'of': 50, 'qur': 8, ...",0.0,0
9,"Kaplan, Andreas - Users of the world, unite.pdf","[ Author's personal copy Users of the world, u...","[[Author, 's, personal, copy, Users, of, the, ...","[[(Author, NN), ('s, POS), (personal, JJ), (co...","[author, personal, copy, users, world, unite, ...","{'author': 12, 'personal': 22, 'copy': 11, 'us...","[(Author, NN), ('s, POS), (personal, JJ), (cop...","{('Author', 'NN'): 2, (''s', 'POS'): 10, ('per...","[author, 's, person, copi, user, of, the, worl...","[[(a, DT), (u, JJ), (t, NN), (h, NN), (o, NN),...","[(a, DT), (u, JJ), (t, NN), (h, NN), (o, NN), ...","{('a', 'DT'): 2770, ('u', 'JJ'): 758, ('t', 'N...","{'author': 12, ''s': 10, 'person': 28, 'copi':...",0.0,0


In [None]:
# Find words in context with POS: prints the tagged sentences that contain the keyword
contextkeywords(d)

In [None]:
# Sort by scoring, highest-scoring documents first
d = d.sort_values('score', ascending=False)

In [None]:
# Print sorted documents (document name and score only)
print('\n')
print('Here are the scores based on cleansed data:')
print(d[['document', 'score']])

In [None]:
# Score on the raw (uncleansed) tokens; this fills d['score2']
dirtyscoring(d)

# Re-sort by the uncleansed score, highest first, and show name and score only
d = d.sort_values('score2', ascending=False)
print('\n')
print('Here are the scores based on uncleansed data:')
print(d[['document', 'score2']])

In [None]:
# Print results of K Means Cluster and prediction modelling
# (doclist holds the raw text of every document accepted by the main loop)
clustering(doclist)

# Print results of NMF vs LDA topic modelling
nmflda(doclist)