In [1]:
import glob
from tika import parser
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from langdetect import detect
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.stem import PorterStemmer
import spacy
import en_core_web_sm  # or any other model you downloaded via spacy download or pip

nlp = en_core_web_sm.load()

In [2]:
pstemmer = PorterStemmer()

input_path = 'C:\\t2'
stop_words = set(stopwords.words('english'))
keywords = ['IS', 'terrorism', 'bomb', 'is', 'the', 'consortium']
filterkeywords = [w for w in keywords if w not in stop_words]
poskeywords = nltk.pos_tag(filterkeywords)

# If the first keyword is a verb, move it and reparse the list
if poskeywords[0][1] == 'VBZ':
    filterkeywords.insert(1, filterkeywords.pop(0))
    poskeywords = nltk.pos_tag(filterkeywords)

stemkeywords = nltk.pos_tag([pstemmer.stem(t) for t in filterkeywords])


# Set up Dataframe
d = pd.DataFrame()

# Create a list to use for clustering
doclist = []
word_matches = defaultdict(list)
globalents = []

In [3]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    parsed = parser.from_file(inputfile)
    # Extract the text content from the parsed file
    psd = parsed["content"]
    return re.sub(r'\s+', ' ', psd)

In [4]:
# Language filter
def filterlanguage(inputfile):
    if detect(inputfile) != 'en':
        return True
    return False

In [5]:
# Get parts of speech from SpaCy
def pos(x):
    return [(token.text, token.tag_) for token in x]


def spacy_pos(x):
    pos_sent = []
    for sentence in x:
        processed_spacy = nlp(sentence)
        for ent in processed_spacy.ents:
            globalents.append((ent.text, ent.label_))
        pos_sent.append(pos(processed_spacy))
    return pos_sent

In [6]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    dataframe['words'] = (dataframe['sentences'].apply(lambda x: [word_tokenize(item) for item in x]))
    dataframe['pos'] = dataframe['sentences'].map(spacy_pos)
    dataframe['allwords'] = dataframe['words'].apply(lambda x: [item.strip(string.punctuation).lower()
                                                                for sublist in x for item in sublist])
    dataframe['allwords'] = (dataframe['allwords'].apply(lambda x: [item for item in x if item.isalpha()
                                                                    and item not in stop_words]))
    dataframe['mfreq'] = dataframe['allwords'].apply(nltk.FreqDist)
    dataframe['poslist'] = dataframe['pos'].apply(lambda x: [item for sublist in x for item in sublist])
    dataframe['mfreqpos'] = dataframe['poslist'].apply(nltk.FreqDist)
    dataframe['stemwords'] = dataframe['words'].apply(lambda x: [pstemmer.stem(item) for sublist in x
                                                                 for item in sublist])
    dataframe['stemwords'] = (dataframe['stemwords'].apply(lambda x: [item for item in x if item.isalpha()
                                                                      and item not in stop_words]))
    dataframe['mfreqstem'] = dataframe['stemwords'].apply(nltk.FreqDist)

    return dataframe

In [7]:
# Score documents based on cleansed dataset - so should discount stopwords and be sensible
def scoring(dataframe, list):
    for word in keywords:
        for idx, row in dataframe.iterrows():
            if word in row['allwords']:
                if not row['document'] in list[word]:
                    list[word].append(row['document'])
                    dataframe.loc[idx, 'score'] += (row['mfreq'][word] * 0.75)
    return dataframe

In [8]:
# Score documents based on pos - should be most exact match
def scoringpos(dataframe, list):
    for (w1, t1) in poskeywords:
        for idx, row in dataframe.iterrows():
            if (w1, t1) in row['poslist']:
                if not row['document'] in list[w1]:
                    list[w1].append(row['document'])
                    dataframe.loc[idx, 'score'] += row['mfreqpos'][(w1, t1)]
    return dataframe

In [9]:
# Score documents based on cleansed dataset - so should discount stopwords and be sensible
def scoringstem(dataframe, list):
    for word in stemkeywords:
        for idx, row in dataframe.iterrows():
            if word in row['stemwords']:
                if not row['document'] in list[word]:
                    list[word].append(row['document'])
                    dataframe.loc[idx, 'score'] += (row['mfreqstem'][word] * 0.5)
    return dataframe

In [10]:
# Find keywords using POS
def contextkeywords(dataframe):
    print('\n')
    print('Here are the exact keyword matches in context: ')
    for (w1, t1) in poskeywords:
        for idx, row in dataframe.iterrows():
            for index, r in enumerate(row['pos']):
                if (w1, t1) in r:
                    print(row['document'] + ' - ' + ' '.join(row['words'][index]))
    return dataframe

In [11]:
# Sort using a dirty model
def dirtyscoring(dataframe):
    dataframe['score2'] = 0
    dataframe['w2'] = dataframe['words'].apply(lambda x: [item for sublist in x for item in sublist])
    dataframe['mfreq2'] = dataframe['w2'].apply(nltk.FreqDist)

    word_matches = defaultdict(list)
    for word in keywords:
        for idx, row in dataframe.iterrows():
            if word in row['w2']:
                dataframe.loc[idx, 'score2'] += row['mfreq2'][word]
                if not row['document'] in word_matches[word]:
                    word_matches[word].append(row['document'])
    print('\n')
    print('The following keyword hits occurred in the uncleansed data:')

    for key, val in word_matches.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

    return dataframe

In [12]:
def printkeywordmatches(list):
    for key, val in list.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

In [13]:
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [pstemmer.stem(t) for t in filtered_tokens]
    return stems

In [14]:
# Cluster documents and demonstrate prediction
# TODO - calculate ideal k value
def clustering(documents):
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=0.2, use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    X = vectorizer.fit_transform(doclist)

    true_k = 5
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i),
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind]),
        print

    print("\n")
    print("Prediction")

    Y = vectorizer.transform(["this is a document about islamic state "
                              "and terrorists and bombs IS jihad terrorism isil"])
    prediction = model.predict(Y)
    print("A document with 'bad' terms would be in:")
    print(prediction)

    Y = vectorizer.transform(["completely innocent text just about kittens and puppies"])
    prediction = model.predict(Y)
    print("A document with 'good' terms would be in:")
    print(prediction)


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


def nmflda(documentlist):
    no_features = 1000

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(documentlist)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    # LDA can only use raw term counts for LDA because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(documentlist)
    tf_feature_names = tf_vectorizer.get_feature_names()

    no_topics = 5

    # Run NMF
    nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,random_state=0).fit(tf)

    no_top_words = 10
    print("NMF Topics: ")
    display_topics(nmf, tfidf_feature_names, no_top_words)
    print('\n')
    print("LDA Topics: ")
    display_topics(lda, tf_feature_names, no_top_words)

In [15]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
print('Starting processing - the following files have been processed:')
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    fname = os.path.splitext(filename)[0]
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Language detection algorithm is non - deterministic, which means that if you try to run it on a text which is
    # either too short or too ambiguous, you might get different results every time you run it
    if filterlanguage(parsed):
        continue

    # Ignore any documents with <100 words
    if len(parsed) < 100:
        continue

    # Create doclist for use in topic modelling
    doclist.append(parsed)
    # Sentence fragments
    sentences = sent_tokenize(parsed)

    # Build up dataframe
    temp = pd.Series([filename, sentences])
    d = d.append(temp, ignore_index=True)

print('\n')
d.reset_index(drop=True, inplace=True)
d.columns = ['document', 'sentences']


# Word tokenize the sentences, cleanup, parts of speech tagging
wordtokens(d)
d['score'] = 0

# Now we score in a calculated manner:
# Score 1 for matching word (case sensitive and POS)
scoringpos(d, word_matches)
# Score 0.75 for matching word (case insensitive,  stop words removed)
scoring(d, word_matches)
# Score 0.5 for matching stem of word (case insensitive, stop words removed)
scoringstem(d, word_matches)
# Print out the results of keyword matching
printkeywordmatches(word_matches)
# Find words in context with POS
contextkeywords(d)

# Sort by scoring
d = d.sort_values('score', ascending=False)

# Print sorted documents
print('\n')
print('Here are the scores based on cleansed data:')
print(d[['document', 'score']])

Starting processing - the following files have been processed:
01,-,Good,bank,statement.pdf
031918comments2.authcheckdam.pdf
881961_CHECKLIST-2014_rev62714.pdf
bank-reconciliation-example.pdf
Bishop_Book_4_eBook.pdf
britain_mag_media_pack.pdf
c07Chemicalreactions_WEB.pdf
cassandra_thedefinitiveguide.pdf
children result( Individula and together ) v1 7-3-16.docx
Correct bank statement.pdf
D3S_EN.pdf
datascienceatthecommandline.pdf
dis5790_parrainage_mmf_a5_4.pdf
DomesticWireFunds.pdf
DTM_AprMay_2018.pdf
dubai 1 2.pdf
Early social interaction project for childen with autism   begining in the second year of life (1) 2.pdf
eng[1].htm
eula.1036.txt
Factors-Affecting-Rate-of-Reaction.pdf
Fireworks!-ConcertInPark08.pdf
HERO5Black_UM_ENG_REVC_Web.pdf
iphone  en.pdf
Kaplan, Andreas - Users of the world, unite.pdf
Kuwait job.docx
learningspark.pdf
log.txt
manual_charge_2_en_US.pdf
Memes-and-the-evolution-of-religion-We-need-memetics-too.pdf
Mohamed Salem  Religion, Spirituality and Psychiatry.pdf

Here are the exact keyword matches in context: 
Religion-Security-Global-Uncertainties.pdf - This report offers a synopsis of the research findings from Phase 1 of this project , which examined the relationship between religion and security , including terrorism and so- called ‘ religious violence ’ .
Religion-Security-Global-Uncertainties.pdf - This paper offers a synopsis of the findings from phase 1 of this project.1 This phase examines the relationship between religion and security , including terrorism and so-called ‘ religious violence ’ .
Religion-Security-Global-Uncertainties.pdf - Writing about modern suicide terrorism , Robert Pape ( 2005 ) has emphasised the communicative power of violence – those who are willing to die for a cause often do so in hope that their death will have a shock value that allows their message to reach a wider public .
Religion-Security-Global-Uncertainties.pdf - In particular there is the question of the appropriate balance between ‘ hard ’ and ‘ sof

STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf - Using data provided to the Department of State , these groups were attributed responsibility for approximately 5,000 fatalities : the Taliban ( more than 2,000 fatalities ) , Boko Haram ( more than 1,100 fatalities ) , al‐Qa ’ ida in Iraq ( more than 830 fatalities ) , Tehrik‐e Taliban Pakistan ( more than 500 fatalities ) , al‐Qa ’ ida in the Arabian Peninsula ( more than 280 fatalities ) , and al‐Shabaab ( more than 280 fatalities ) .5 Based on preliminary terrorism incident data for January through June of 2013 , and again using the Department of State ’ s inclusion standards , the eight most lethal organizations in that time‐period include the Taliban , al‐Qa ’ ida in Iraq , Tehrik‐i‐Taliban Pakistan , Boko Haram , Lashkar‐e‐Jhangvi , al‐Nusrah Front , al‐Shabaab , and al‐Mua ’ qi ’ oon Biddam Brigade .
STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf - To help interpret these data on terro

START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - mailto : fajmonova.veronika @ gmail.com mailto : cmccaule @ brynmawr.edu mailto : infostart @ start.umd.edu http : //www.start.umd.edu/ National Consortium for the Study of Terrorism and Responses to Terrorism A Department of Homeland Security Science and Technology Center of Excellence Origin-group differences in the 2007 and 2011 Pew Polls of U.S. Muslims : Reactions to the War on Terrorism Contents Executive Summary ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... .. 1 Introduction ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... . 1 Comparing origin groups in Pew polls 

START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Yes 3 2 2 1 1 2 2 3 4 6 2 1 Do you think that government´s anti-terrorism policies single out Muslims in the U.S. for increased surveillance and monitoring , or don´t you think so ?
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Opinions relating to the war on terrorism Table 5 shows that in both 2007 and 2011 about half of U.S. Muslims ( 2007 49-81 % ; 2011 39-50 % ) did not believe that the war on terrorism ( WOT ) is a sincere effort to reduce international terrorism .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Two groups showed a substantial decline in doubts about the war on terrorism ( African Americans 81 % in 2007 vs. 50 % in 2011 , Iranians 66 % in 2007 vs. 40 % in 2011 ) ; indeed every origin group showed a numeric decline in doubts about war on terrorism .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - The three terrorism-related items have non-negligible missing rates , raising the possibility that responses may

START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Decreasing discrimination experienced by U.S. Muslims is a worthy goal in its own right , but , on our interpretation , only change in perception of U.S. military intervention in predominantly Muslim countries will affect opinions of the war on terrorism .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - If they come to favor U.S. intervention against Islamic State , for instance , then U.S. Muslims may become more favorable toward the war on terrorism .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - African-American Muslims were also more likely to take part in non-prayer activities at mosque or Islamic Center , more likely to see a conflict between being a devout Muslim and living in a modern society , more dissatisfied with how things are going in the U.S. , and more likely to think that government ’ s anti-terrorism policies single out Muslims for increased surveillance .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Notably

START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - This deliverable is part of the National Consortium for the Study of Terrorism and Responses to Terrorism ( START ) project , “ Tracking Attitudes within American Subcultures. ” This research was supported by the Department of Homeland Security Science and Technology Directorate ’ s Office of University Programs through Award Number 2012-ST-061-CS0001 , Center for the Study of Terrorism and Behavior ( CSTAB ) 2.12 made to START to investigate the understanding and countering of terrorism within the U.S .
START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - START uses state‐of‐the‐art theories , methods and data from the social and behavioral sciences to improve understanding of the origins , dynamics and social and psychological impacts of terrorism .
START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - Opinions about the War on Terrorism and Suicide Bombing The survey also included severa

START_DHS_SyriaBarometerSurvey_30June2016.pdf - Participants who held pro‐Assad government position on the Syrian War—that the United States , ISIS , and Turkey were to blame—were more likely to agree that the war on terrorism is a war on Islam ; that U.S. foreign policy is controlled by Jewish interests ; and to reject the notion of a United Nation of Islam .
START_DHS_SyriaBarometerSurvey_30June2016.pdf - Thus , disagreeing that the war on terrorism is a war on Islam , that U.S. foreign policy is controlled by the Jewish interests , and agreeing that there should be a united Nation of Islam was related to blaming Assad and Russia for the war in Syria , and exculpating the United States and Turkey .
START_DHS_SyriaBarometerSurvey_30June2016.pdf - Do you feel the U.S. is fighting a war on terrorism or a war against Islam ?
START_DHS_SyriaBarometerSurvey_30June2016.pdf - War on terrorism 2 .
START_DHS_SyriaBarometerSurvey_30June2016.pdf - Do you feel the U.S. is fighting a war on terror

START_Smith_GeospatialTemporalPatternsofLoneActorTerrorism.pdf - Terrorism Research Center in Fulbright College University of Arkansas September 19 , 2014 National Consortium for the Study of Terrorism and Responses to Terrorism Methodology : Since its inception in 1988 , the ATS has used official federal terrorism-related court cases to construct a dataset for the analysis of terrorism incidents and preventions , precursor activities of these incidents , and subsequent court cases .
START_Smith_GeospatialTemporalPatternsofLoneActorTerrorism.pdf - Sources of data include : • Court case documents from federal indictments resulting from official FBI terrorism investigations .
START_TerrorismEnergyAttacks_ResearchBrief_June2015.pdf - LOCATIONS AND ATTACK TYPES As is the case with terrorism in general , attacks on energy- and mining-related targets are geographically concentrated .
START_TerrorismEnergyAttacks_ResearchBrief_June2015.pdf - START uses state‐of‐the‐art theories , methods and 

START_UnderstandingLoneActorTerrorism_ResearchHighlight_Oct2013.pdf -  Unlike group-based terrorism and violent hate crimes , lone-actor terrorism is not more likely to occur in counties with higher percentages of residents living in urban environments , higher percentages of male residents between 15 and 24 years of age , or higher unemployment rates .
START_UnderstandingLoneActorTerrorism_ResearchHighlight_Oct2013.pdf -  Overall , locations where lone-actor terrorism occurs tend to share more demographic similarities with the locations of violent hate crime offending than with the locations of group-based terrorism .
START_UnderstandingLoneActorTerrorism_ResearchHighlight_Oct2013.pdf - START Research Highlight © START , October 2013  It may be possible to learn more about where and when lone-actor terrorism occurs by examining patterns in violent hate crime , a type of violence that both academics and practitioners understand more fully .
eula.1036.txt - THE SOFTWARE IS LICENSED “

[87 rows x 2 columns]


In [None]:
dirtyscoring(d)

d = d.sort_values('score2', ascending=False)
print('\n')
print('Here are the scores based on uncleansed data:')
print(d[['document', 'score2']])

# Print results of K Means Cluster and prediction modelling
clustering(doclist)

# Print results of NMF vs LDA topic modelling
nmflda(doclist)

In [29]:
print('People discovered:')
for (a,b) in globalents:
    if b == 'PERSON':
        print(a)

People discovered:
David Kautter
Kautter
Karen L. Hawkins
William M. Paul
Counsel
David Kautter
Tax Policy
Thomas West
Counsel
Scott Dinwiddie
Counsel
Donna Welsh
Branch
Kathryn Zuba
Counsel
Helen Hubbard
Counsel
Karl Walli
Eric Solomon
Charles P. Rettig
Fred F. Murray
Julian Y. Kim
Outreach Bahar A. Schippel
Katherine E. David
Robb A. Longman
Richard M. Lipton Chicago
William H. Caudill
John F. Bergner
Thomas D. Greenaway Boston
Eugene
Carol P. Tello
Sheri A. Dillon
Peter A. Lowy
John A. Thorner
Omri Marian
Kerry Ryan
Adam Chodorow
James Creech
Elizabeth Crouse
Diane Ring
Lisa Zarlenga
Lisa Zarlenga
Omri Marian
Kerry A. Ryan
David Farmer
Cryptocurrencies
Classic Ethereum
Coinbase
Coinbase
Xapo
David Farmer
Coinbase
8 David Farmer
Eisner v. Macomber
disposal.”13 In Macomber
Macomber
Glenshaw Glass
Reg
F. Supp
Haverly v. United States
Reg
Reg
Macomber
Metz
Gamble v. Commissioner
Reg
Reg
Laura Shin
FORBES
Per Submittal
FACTORY
Matte
DISTRIBUTOR
FOB Hayward
Mrs Jones
2,399.92 Cash book ba

Mary
Mary
Mary
Cassandra
Cassandra
Mary
E. Broadway'
Mary
zip_code int
Secondary Indexes
Cassandra
Nguyen'
Nguyen
first_name
Bill | Nguyen
Cassandra’s
Cassandra
Cassandra’s
Cassandra
Cassandra
Peter Chen
Cassandra
Cassandra Of
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Codd’s
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Artem Chebotko
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra’s
Cassandra
Npk
St
St
Cassandra
Cassandra
Format As
Cassandra’s
Database Schema
Identify Partition
Database Schema
DataStax DevCenter
Cassandra’s
Cassandra
Cassandra’s
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Alan Demers
Cassandra
Cassandra
Gossiper
Cassandra
Cassandra
Naohiro Hayashi‐
al
Cassandra
Phi
Cassandra
Cassandra
Cassandra
Cassandra
Amazon EC2
Apache Cloudstack
Cassandra
DynamicEndpointSnitch
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Austin Appleby
Cassandra’s
Cassandra’s
Cassandra
Cassandr

Incremental Backup
Cassandra’s
Cassandra
Cassandra
Snapshot The
Cassandra
Backup
Cassandra
Cassandra
Cassandra’s
Cassandra
Maintenance Several
LeveledCompactionStrategy
Cassandra’s
Cassandra
Cassandra
DataStax OpsCenter
Netflix Priam
Cassandra
Cassandra
Netflix
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra’s StorageProxy
Percentile Read Latency
Percentile
Count
Cassandra
Cassandra
Session.execute
Bloom
Bloom
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra’s
Row Cache
Cache Settings Cassandra
Cassandra
Memtables Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Java NIO
Cassandra
Commit Logs
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra’s
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Handoff Hinted
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Threading Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra
Cassandra

Field Preview
Programmed Auto
Frame
Long
Z Shutter-Speed
Main
Shutter Speed
Aperture The new
Assign AE-L
Flash Bracketing
Frame
Flash Bracketing
Frame
Active D-Lighting
Active D-Lighting
Active D-Lighting
Extra High
Active D-Lighting
Active D-Lighting
Frame
Active D-Lighting
White Balance
Green
Blue
Amber
K. Mired
Flash
Method Description
Viewfinder SHOOT CUSTOM
Viewfinder Top
White Balance
Highlight Preset
Highlight Copy
White Balance
Photograph
Highlight Preset
Highlight Preset
Highlight Preset
Active D-Lighting
Cyanotype
Blue Green
Blue
Purple Blue
Active D-Lighting
Active D-Lighting
Monochrome
Green Softens
Highlight Save
Rename
ViewNX
Delete
Monochrome
YAuto
Active D-Lighting
YAuto
Active D-Lighting
Active D-Lighting
Exif
Exif
Wireless Speedlight
SB-400 SB-R200
Auto FP
SB-600 SB-R200
GN Distance-priority
— Flash
Y Red-eye
Flash Units
SB-15 SB-23
Flash Units
Flash
Flash Mode
Aperture See
Assign Fn
Assign
Assign AE-L
Assign AE-L
Off Shutter-speed
Highlight Multiple
Highlight Multipl

Almeida
Matos
Hall
Frank
Holmes
Pfahringer
Reutemann
Witten
I. H. (
• Pearson
• Van der Maaten
G. E. (
Recap This
Mason
Chapters
Russell
M.
2nd Ed.
Warden
Peek
Powers
O’Reilly
Goyvaerts
Cooper
M.
Advanced Bash-Scripting Guide
Robbins
Beebe
N. H. F. (
Python
Wickham
Springer
McKinney
Python
Rossant
Patil
D. J.
Define
Mawk
Mike Brennan
Regions
Endpoint
RegionName
GNU Bourne-Again SHell
Brian Fox
Chet Ramey
Philip A. Nelson
Jeroen H.M. Janssens
Torbjorn Granlund
Richard M. Stallman
David MacKenzie
Jim Meyering
Jeroen H.M. Janssens
Tony Monroe
Torbjorn Granlund
David MacKenzie
Jim Meyering
Christopher Groskopf
Filter
Christopher Groskopf
Merge
Christopher Groskopf
Christopher Groskopf
Christopher Groskopf
Christopher Groskopf
Stack
Christopher Groskopf
Christopher Groskopf
Daniel Stenberg
Decklin Foster
David M. Ihnat
David MacKenzie
Jim Meyering
Brian Fox
Chet Ramey
Richard Mlynarik
David MacKenzie
Generate
Dima Kogan
Jeremy Hinds
Jason Gessner
Jim Renwick
Norman Gocke
Rodofo Granata
Tobi

Arndt
Sparks et al
Mundy
Burnette
Allied Disciplines
Text
Charlop
Walsh
Cohen
Erlbaum
Courchesne
Karns
Davis
Ziccardi
Carper
Davidovitch
Glick
Holtzman
M. P. (
Developmental Disorders
Dawson
M. Guralnick
Brookes
Dunlap
Fox
L.
Young Children
Dunst
Hamby
Raab
M. B.
Fenske
Zalenski
Krantz
Girolametto
Goldberg
W. A., Osann
P. A., Laulhere
Jarvis
Modahl
Developmental Disorders
Developmental Disorders
Hancock
Kashinath
H. (
Koegel
R. L.
Bimbela
Lainhart
Piven
Adolescent Psychiatry
Laski
Lord
Risi
Erlbaum
Lord
Rutter
DiLavore
Lord
Shulman
Allied Disciplines
Lovaas
Clinical Psychology
Mahoney
Mandell
Novak
McGee
Daly
Mervis
C. B.,
B. P. (
Developmental Disorders
S. K. (
Mullen
Mullen
Circle Pines
Mundy
Burnette
F. Volkmar
R. Paul
A. Klin
Mundy
Sigman
Developmental Disorders
Piven
Arndt
Adolescent Psychiatry
F. Volkmar
R. Paul
A. Klin
Wetherby
Brookes
Sandall
M. L., Smith
B. J.,
K. M., Bailey
D. B.
R. J.
Schwartz
Sandall
McBride
W. R.
Cook
T. D.
D. T. (
Shinnar
Rapin
Arnold
Tuchman
R. F.
Shulma

Patrick Wendell
Databricks
Spark’s
Matei Zaharia
Holden Karau
Andy Konwinski
Scala
Holden Karau
Packt Publishing
Andy Konwinski
Patrick Wendell
Databricks
Spark’s
Matei Zaharia
Holden Karau
Andy Konwinski
Holden Karau
Andy Konwinski
Patrick Wendell
Holden Karau
Andy Konwinski
Patrick Wendell
Databricks
Ann Spencer
Marie Beaugureau
Rachel Monaghan Proofreader
Charles Roumeliotis Indexer
Ellen Troutman
David Futato
Ellie Volckhausen Illustrator
Rebecca Demarest February
Third Release
Preface
Apache Spark
Stack
Releases 7
Downloading Spark
’s Python
Python
Scala
Driver
Maven
Debugging Spark
Jobs
Driver
Batch
Hadoop MapReduce
Matei Zaharia
Patrick Wendell
Andy Konwinski
Holden Karau
Stoica
Apache Spark
Spark
Spark
Spark’s
Python
Python
Python
Spark
Books If
Python
Learning Python
Python
Dive
Python
Python
Spark’s
Python
Holden Karau
Andy Konwinski
Patrick Wendell
Matei Zaharia
Sams
Morgan Kaufmann
Facebook:
Maven
Acknowledgments The
Joseph Bradley
Dave Bridgeland
Chaz Chandler
Mick Davies


Spark Streaming
Scala ipAddressRequestCount.saveAsTextFiles("outputDir"
Long>
Long>
Text(e._1
LongWritable(e._2
LongWritable
foreachRDD
foreachRDD
foreachRDD
Scala ipAddressRequestCount.foreachRDD
Hadoop
Spark Stream‐
Hadoop Input
y.get
Spark Streaming
Integer>
topicLines.print(
createStream
topicsSet.put("logs"
KafkaUtils.createDirectStream(jssc
Flume
Avro
Spark Streaming
Spark Streaming
Cluster Sizing
Amazon S3.3
Spark Streaming
Amazon S3
Spark Streaming
StreamingContext.getOrCreate
JavaStreamingContextFactory(
Durations.seconds(1
JavaStreamingContextFactory(
getOrCreate
Spark Streaming
Spark Streaming
StreamingContext.hadoop Files
<driver>:4040
Batch
Spark Streaming
Weka
MLlib
Python
Python
Python
Spam Classification
Python
LabeledPoint>
tf.transform(Arrays.asList(email.split
LabeledPoint>
tf.transform(Arrays.asList(email.split
LogisticRegressionWithSGD().run(trainData.rdd
Arrays.asList("Hi Dad
Scala
Python
Python
Python
Python
Python
HashingTF
Python
Regression Classification
MLlib

Watkins
McFarlane
Eweida
Chaplin
Chaplin
McFarlane
Eweida
Chaplin
Leyla Şahin
Jakóbski
Kovaļkovs
Mark Hill
Russell Sandberg
Sandberg
Sandberg
Sandberg
McFarlane
Eweida
Chaplin
Paul Ricoeur
Tzvetan Todorov
Paul Tillich
Karl Rahner
Kant
Schelling
Carolyn Evans
Evans
Khan
Jakóbski
Kovaļkovs
Donald et al
Daniel
Daniel J. Hill
Hill
Peter
Richard O’Dair
Andrew Lewis
Donald
Alice
Evans
Carolyn
Chaplin
McFarlane
Mark
Russell Sandberg
Aut
Mark
Russell Sandberg
Norman Doe
Law
Javier
Richard O’Dair
Andrew Lewis
Karl
Kevin Smyth
Ricoeur
Paul
Don Ihde
Paul
John Clayton
De Gruyter
Catherine Porter
Catherine Porter
Christian Concern
Adel Ahmed
Zarin Avari
Christopher Baker
Andrew Brower Latz
Jenny Bunker
Harry Bunting
Clare Carlisle
Rebecca Catto
Shirley Chaplin
Stephen Clark
Charles Clarke
Frank Cramner
Andrew Crompton
Sarah Egan
Richard Gaskin
Matthew Gibson
Elaine Graham
Simon Hailwood
James Harding
Paul Helm
Chris Hewson
Mark Hill
Gillian Howie
Patrick Kelly
Christina Kennedy
Peter Kennedy
Ram Kr

KeyboardInterrupt: 

In [28]:
print('Organisations discovered:')
for (a,b) in globalents:
    if b == 'ORG':
        print(a)
        

Organisations discovered:
UKBA
UKBA
NW Washington
DC 20036
Internal Revenue Service
NW Washington
DC 20024
Tax Treatment
the American Bar Association Section of
the House of Delegates
the Board of Governors
the American Bar Association
Section of Taxation Enclosure
Internal Revenue Service
Department of the
Tax Legislative Counsel
Department of the Treasury
Associate Tax Legislative Counsel
Department of the Treasury Drita Tonuzi
Operations
Internal Revenue Service
Internal Revenue Service
Senior Technician Reviewer
Internal Revenue Service
Internal Revenue Service
Internal Revenue Service
Financial Products
Department of the Treasury OFFICERS
OR Chair-
DC Vice Chairs
CA Committee Operations
FL Government
AZ Publications Julie A.
CA
MD COUNCIL Section Delegates
the House of
IL Armando Gomez
DC Last Retiring
MA Roberta F.
AL Christopher S.
DC Gregg D.
WA Michael J.
CA Catherine B. Engell
TX LIAISONS Board of Governors Allen C. Goolsby
VA Young Lawyers Division Vlad
NY Law Student Divisi

Cluster 244 Handling Node
Backup and
SSTable Utilities
Maintenance Tools
Netflix Priam
Table of Contents Caching
Row Cache
SSL
TLS
Node-to-Node
JMX Security
Securing JMX
Integrating
Storage 306 Network
Amazon Web Services
Microsoft
Google Cloud Platform
Table of Contents
Apache Lucene
SOLR
Apache Hadoop
Apache Spark
| Table of Contents Foreword
Microsoft
the Apache Incubator
Facebook’s
Cassandra
CTO
DataStax xiv
The Definitive Guide
Apache Cassan‐
Apache Cassandra
Apache Cassandra
Apache Cassandra
Cas‐
Apache Cassandra
JIRA
Apache
Apache Cassandra Project
Facebook, Twitter
Netflix
PHP
Apache
Cassan‐
| Preface
The Cassandra Query Language
Data Modeling
The Cassandra Architecture
PHP
API
Java Management Extensions
Microsoft
Google
Apache Cassandra
the Second Edition
The Definitive Guide
Cassandra
CQL
CQL
Preface |
ISBN
The Definitive Guide
Sec‐ ond Edition
O’Reilly Safari Safari
Safari Books Online
Learning Paths
O’Reilly Media
Harvard Business Review
Prentice Hall Professional
Addison-W

Xerox’s Palo Alto Research Center
Phi Accrual Failure Detection
the Advanced Institute of Science and Technology
Threshold
Accrual Failure Detectors
Cassandra
AFD
Phi Accrual Failure Detection
’s
Hayashibara
The Cassandra Architecture
SimpleSnitch
Cassandra
Data
The Cassandra Architecture Cassandra
num_tokens
DHT
Murmur3Partitioner
RandomPartitioner
Replication Strategies A
Four Strategy
SimpleStrategy
NetworkTopologyStrategy
SimpleStrategy
NetworkTopologyStrategy
Legacy Replication Strategies
OldNetworkTopologyStrategy
the RackAware Strategy
SimpleStrategy
The Cassandra Architecture Consistency Levels
THREE
QUORUM
THREE
QUORUM
The Cassandra Architecture Memtables
SSTables
JVM
Cassandra
Google’s Bigtable
Sorted String Table
Cassandra
The Cassandra Architecture On
JVM
Post
ANY
Amazon’s Dynamo
the Java Message Service
JMS
JMS
JMS
The Cassandra Architecture Cassandra’s LWT
Two-Phase Commit
Paxos
Paxos
the Storage Service
SQL
Garbage Collection Grace Seconds
Bloom Filters Bloom
The Cassand

RuntimeException(e
OperationType
COMPACTION
VALIDATION
CLEANUP
SCRUB
INDEX_BUILD
JMX
Storage Service MBean
OperationMode
UnreachableNodes
DEBUG
JConsole
org.apache.cassandra.gms.Gossiper
INFO
INFO
INFO
INFO
DEBUG
Storage Proxy
ColumnFamilies
JMX
CompactionMan
HintsService
Cassan‐
Gossiper
JMX
CQL
Cache
Thread
JMX
MigrationStage
the ReadRepairStage MBean
JVM
Monitoring Monitoring
JMX
JMX
NodeProbe
JMX
NodeProbe
JMX
NodeCmd
Cassandra
Cassandra
Tokens Owns Host ID
UN
UN
Thrift
Native Transport
MB Generation No
Uptime
2.94 Data Center
MB
MB
Staged Event-Driven Architecture
MutationStage
Keyspace
SSTable Compression Ratio
Memtable
Index
JMX
Cassandra
DataStax
Health Check
ERROR
WARN
GB
Network Time Protocol
Cassandra
Basic Maintenance
DEBUG
RMI TCP Connection(297)-127.0.0.1
CF reservations_by_hotel_date
Basic Maintenance
hotels_by_poi
Data.db Basic Maintenance
org.apache.cassandra.dht.Murmur3Partitioner Bloom Filter FP
the OpsCenter Repair Service
Basic Maintenance
Best Practices for Repair

SchemaChangeListener
SchemaStatement
Cassandra
JMX
NoSQL
Scalable Internet Services
SeedProvider
SELECT
QueryBuilder
TTL
Cassandra
Cassandra
CassandraDaemon
StorageProxy
DataStax Java
Index SHOW
Simple Network Monitoring Protocol
SimpleSnitch
SimpleStrategy
SNAPPY
SimpleSnitch
RDBMS
Spark
SQL (
Structured Query Language
CQL
SSTable Attached Secondary Index
SASI
BoundStatement
PreparedStatement
SimpleStatement
Stonebreaker
Storage Area Networks
StorageProxy
Tablesnap
SQL Server
SEDA
Cassandra
API
TLS (Transport Layer Security
TokenAwarePolicy
TTL
8 Transport Layer Security
CQL
U UDAs
UnreachableNodes
UPDATE
INSERT
USING TIMESTAMP
DROP USER
USING TIMESTAMP
UPDATE
INSERT
UPDATE
WARN
SELECT
QueryBuilder
CREATE MATERIALIZED VIEW
SELECT
Windows
DataStax Community Edition
Choice Hotels International
Choice Hotels International
The Definitive Guide
Cassell’s Natural History
URW Typewriter
Guardian
Adobe
Adobe Myriad Condensed
Cover Copyright Table of
Relational Databases
Relational Databases
Q

Fine-Tuning White Balance
Preset Manual
Image Enhancement
Picture Controls
Picture Control
Modifying Existing Picture Controls
Custom Picture Controls
Custom Picture Controls
Color Space
The Nikon Creative Lighting System
TTL Flash Control
Flash Modes
Interval Timer Photography
More About
Photo Information
The Playback Menu
Recording Voice Memos
Playing Voice Memos
Direct USB Connection
Wireless and Ethernet Networks
Time
Printing Multiple Pictures
DPOF Print Order
Standard Definition Devices
Menu Guide
D The Playback Menu
Playback Folder
Shooting Menu Bank
Extended Menu Banks
Vignette Control
Fine-Tuning Camera Settings
Reset Custom Settings
AF-S Priority Selection
Dynamic AF Area
Focus Point Wrap-Around
AF Point Selection
ISO
Easy Exposure Compensation
Center-Weighted Area
Self-Timer
c4
Monitor
File Number Sequence
Control Panel/
Information Display
Auto Bracketing
Multi Selector Center Button
Assign Preview Button
Customize Command Dials
Dial
No Memory Card
The Setup Menu:
Format Me

the Manage Picture Control
Manage Picture Control
the Manage Picture Control
Picture Controls
The Original Picture Control Icon
Picture Control
Picture Control
Original Picture Control
Picture Control Utility
Picture Controls
Picture Controls
Picture Controls
Picture Controls
Picture Controls C-1
Picture Controls
Picture Control
J.
Picture Controls
Picture Controls
Saving Custom Picture Controls
Picture Controls
Picture Controls
Picture Controls
Standard
D-Lighting
D-Lighting
D-Lighting
J.
D-Lighting Matrix
ISO
ISO
Custom Setting
J Color Space
Select Color
Highlight Color
J.
Option Description W
Adobe RGB
Adobe
ExifPrint
Adobe RGB
Adobe
DCF
DCF
ICC
TIFF
Adobe
Nikon Software ViewNX
the Nikon Creative Lighting System
Flash
The Nikon Creative Lighting System
TTL Flash Control
Flash Modes
the Nikon Creative Lighting System
CLS
SB-900
SB-400
The Sync Terminal A
The Nikon Creative Lighting System
Creative Lighting System
SU-800
Flash
ISO
ISO
SU-800
Speedlight
SB-900
SB-900
ISO
SB-800
ISO
SB-

D AE
AE
Flash
Fn
Dynamic AF
ADL
CL
Matrix
Matrix
Fn
K Playback
Fn
Fn
Fn
Fn button
Fn
Custom Setting
Fn
Camera
Control panel
Select Focus
CPU
Press
the Non-CPU
Press
Fn
Dynamic AF
Fn
Main
Fn
Preview +
Fn
Fn
Focus point
Preview +
AE
AE-L
AE-L
AE
Fn
AE
AE
AE
AE
Fn
AE
Focus-point
Assign Preview Button
➜ ACustom
➜ ACustom Settings
Assign BKT Button
➜ A Custom
Option Description t
Press
Press
Customize Command Dials
➜ A Custom
Option Description Reverse
Sub-command
Change
CPU
CPU
CPU
ISO
QUAL
Custom Setting c2
Enable
Select (W
Dial
➜ A Custom
➜ A Custom
➜ A Custom
U BThe Setup Menu
Battery
Time
Firmware
Format Memory Card G
U Acquire
the Image Dust Off
CPU
J.
G. • Start
D Image Sensor Cleaning Dust
U 2 Frame
D Image
HDMI
HDMI
HDMI
Option Description
AC
Flicker Reduction G
D Flicker
ISO
Time Zone
Option Description Time
Input
Input
Korean Image Comment
Camera
Auto Image Rotation
Camera
Camera
Rotate
U View
Battery Info
Item Description Battery
Battery
U Choose
Nikon
Image Authentication
D Cam

the Camera Storage Cleaning
Clean at Startup/
Manual Cleaning Replacing
 Data Science
the Command Line DATA /DATA SCIENCE Data Science
the Command Line ISBN
the Department of Applied Physics
Columbia University
The New York Times
the Data Science Toolbox
CSV
HTML
JSON ■ Explore
Maastricht University
Machine Learning
Tilburg University
Jeroen Janssens Data Science
Command Line
Line Janssens
the Department of Applied Physics
Columbia University
The New York Times
the Data Science Toolbox
CSV
HTML
JSON ■ Explore
Maastricht University
Machine Learning
Tilburg University
Jeroen Janssens Data Science
Command Line
Line Janssens
Janssens Data Science
the Command Line
] Data Science
Command
Jeroen Janssens Copyright
O’Reilly Media
1005 Gravenstein Highway North
CA 95472
corporate@oreilly.com
Jasmine Kwityn Indexer
Wendy Catalano Interior Designer
First Edition Revision History
O’Reilly Media
the Command Line
O’Reilly Media
Table of Contents Preface
2 Data Science
Command
7 The Command Line
7 Th

CSV
CSV
Scrubbing Data Performing
SQL
CSV
SQL
SQL
CSV
CSV
SQL
CSV
CSV
CSV
SQL
SQL
HTML
JSON
CSV
JSON
HTML
JSON
HTML
JSON
JSON
CSV
HTML
Janssens
Parmentier
Czebotar
HTML
HTML
HTML
HTML
UA-Compatible
Scrubbing Data
HTML
>Vatican City</td
HTML
JSON
CSS
HTML
<
XML
HTML
JSON
JSON
JSON
JSON
JSON
CSV
JSON
Wikipedia
CSV
JSON
HTML
XML
CSV
Common Scrub Operations
Iris
Scrubbing Data |
SQL
Iris
|---------------+--------------+-------------+--------------| Filtering Lines
CSV
CSV
Common Scrub Operations
| Sun | Dinner
| Sun |
| Sun |
USD
Dinner |
| Sun |
Sat |
| Sun | Dinner
| Sun |
| Sun |
Sun | Dinner
WHERE
SQL
SQL
CSV
CSV
BEGIN{OFS="
CSV
Rio (
Janssens
Vertical
CSV
CRUSH
CSV
Iris-setosa.csv
Iris-versicolor.csv
Iris-virginica.csv
Iris-setosa.csv
Iris-setosa
Iris-setosa
Iris-versicolor
Iris-versicolor
Iris-versicolor
Iris-setosa
Iris-setosa
Iris-versicolor
Iris-versicolor
Iris-versicolor
Iris-*.csv -n
|--------+--------------+-------------+--------------+--------------| |
Scrubbing Data Concatena

Artificial Intelligence
Maastricht University
Machine Learning
Tilburg University
Jeroen
the Brooklyn Bridge
http://jeroenjanssens.com
Data Science
the Command Line
URW Typewriter
Guardian
Adobe
Adobe Myriad Condensed
Table of Contents Preface
Overview Data
Why Data Science
The Command Line
The Command Line
The Command Line
The Command Line
The Command Line
Your Data Science Toolbox Step
Mac OS X
Start Anew Essential Concepts
Tools The Environment
Command-Line Tool
Command-Line Tools Combining Command-Line Tools
Input
the Data Science Toolbox Local Version of Data Science Toolbox Remote Version of Data Science Toolbox Decompressing Files
Microsoft Excel Spreadsheets
Shell Scripts Step
Define Shebang Step 4
Command-Line Tools
Shell Script Processing Streaming Data
Standard Input Further Reading
Scrubbing Data Overview Common Scrub Operations
SQL Queries
Project Gutenberg Every Workflow
That Depends Rebuilding Specific Targets Discussion Further
Exploring Data Overview Inspecting Data
It

the White Balance for Night Photo
White Balance for Burst
Profile GoPro Color
Flat
Native
ISO
ISO
ISO
ISO
ISO
ISO
ISO
ISO
ISO
ISO
Brighter
ISO
ISO
ISO
ISO
ISO
Time Lapse
ISO
ISO Minimum
ISO
Brighter
FPS
Auto
Exposure Control
ISO
ISO
the Exposure Value Compensation
ISO
Medium
Video
RAW
the Manual Audio Control
Manual Audio Control
AAC
GoPro’s
Option Description Standard
Standard+ Camera
GoPro
Front
Time Lapse Video
Video + Photo
Time Lapse Photo
Night Lapse Photo
QuikCapture
AUTO OFF Powers
Voice Control
Auto Image Rotation)
Camera
NTSC
PAL TV
the File Repair
MICROSD CARD MESSAGES
microSD
microSDXC
FULL
Time
GoPro Plus
Reset
FRAME
The Frame
The HERO5 Black
the Camera in
Auto-Rotation
The Curved + Flat Adhesive Mounts
The Frame
HDMI
• Capture
USB
USB
the GoPro Supercharger
the GoPro Portable Power Pack
BATTERY INFORMATION BATTERY STORAGE AND
the Rechargeable Battery Recycling Corporation’s
Battery Recycling Program
GoPro
USB
CHOPPY Choppy
HD
MSC
the Important Product + Safety
GoPro
HERO


Command– Down Arrow
Control–
Arrow,
Arrow,
Apple Wireless Keyboard
VoiceOver
Command
Command
Command
VoiceOver
Settings > General
VoiceOver > Braille
Adjust Braille
VoiceOver > Braille
Unified English
Voice Control
Settings > International > Language
> Voice Control
VoiceOver
iPhone
VoiceOver > Braille
VoiceOver
Siri
Safari
the Accessibility Shortcut
Apple Wireless Keyboard
Apple Wireless Keyboard
Invert Colors
Invert Colors
Bold Text
Increase Contrast
iPhone
the Accessibility Shortcut
Hearing Aid Control
Lock Screen
Control Live Listen
iPhone
Hearing Aids
Siri, Music
FCC
ANSI
FCC
M4
Apple
Subtitles & Captioning > Closed Captions
SDH
Subtitles & Captioning > Style
Background
Appendix A
LED Flash for
LED
LED Flash
LED Flash
Mono Audio
Mono Audio
Mono Audio
Facebook Post
the iTunes Store
iTunes Store
iPhone
Appendix A
Guided Access
Guided Access
Enable the Sleep
Guided Access
Switch Control Switch Control
Switch Control
Bluetooth
The iPhone FaceTime
Accessibility > Switch Control
Accessi

textFile
RDD
RDD
RDD
HDFS
Spark
Spark
Spark
Spark’s
RDD
RDD
RDD
Spark
README
Interactive Python Shell
Spark
RDD
SparkContext’s
RDD Operations
RDD
RDD
JavaRDD<String
Function<String
Boolean call(String
RDD
RDD Operations
inputRDD
Spark
RDD
RDD
RDD
Input
Scala
RDD
HDFS
RDD
Spark
Haskell
LINQ
RDD Operations
RDD
RDD
MapReduce
Spark Most of Spark
Spark
Spark
Java’s
Scala
Usage Function
T2
RDD<
Function<String
Boolean call(String
ContainsError
Function<String
Boolean call(String
RDD<
Function<String
String
Boolean call(String
RDD<
Oracle’s documen‐
Spark
RDD
RDD
RDD
RDD String
RDD
RDD
RDD
sc.parallelize(List(1
RDD
FlatMapFunction<String
Iterable<String
RDD Pseudo
RDD
RDD
RDD
RDD
Cartesian
RDD
RDD
RDD
RDD
RDD
RDD
RDD
Integer>
RDD
RDD
RDD
RDD
Function2<AvgCount,
AvgCount>
Function2<AvgCount,
AvgCount>
AvgCount
AvgCount>
AvgCount
AvgCount
AvgCount(0
RDD
RDD
RDD
RDD
JSON
RDD
RDD
RDD
RDD
Purpose Example Result take(num
RDD
RDD
9 fold(zero)(func
9 aggregate(zeroValue
RDD
RDD
SparkContext
SparkCont

ZooKeeper
RDD
Receiver Fault Tolerance
Flume
Spark
Spark
| Chapter
Twit‐
Spark
HDFS
HDFS
Spark Streaming’s
RDD
Spark Streaming’s saveAs
UI
Spark UI
UI
UI
Spark, Spark Streaming
| Chapter
UI
Receivers
Garbage Collection
Java’s
Sweep
Enable the
Performance Considerations
GC
Spark Streaming
Spark
LRU
Spark
Machine Learning
MLlib
Spark’s
MLlib
MLlib’s
K- means||
MLlib’s
MLlib
API
API
API
NumPy
Anaconda
http://bit.ly/1yCoMIC Machine Learning Basics
’s
Machine Learning Basics
HashingTF
normal.txt
’s Git
LabeledPoint
LabeledPoint
Cache
Logistic Regression
SGD
LabeledPoint
Cache
Logistic Regression
SGD
LogisticRegressionWithSGD().run(trainingData
Spark
Machine Learning Basics
JavaRDD<
JavaRDD<
LabeledPoint
LabeledPoint
Function<String
LabeledPoint
LabeledPoint(1
LabeledPoint
Function<String
LabeledPoint
LabeledPoint(0
JavaRDD<
Cache
Logistic Regression
SGD
Vector
Vector
Spark
Vector
LabeledPoint
LabeledPoints
Spark SQL
MLlib
Vector
Vectors Data Types
MLlib denseVec2
Vectors
Vectors.dense
Vecto

the Software License Confirmation
The Machine ID
MSAB License Management 9
Activate
Open
the License Activation
Open
Activate
MSAB
MSAB License Management
Update a Alphanumeric License Key
Update
MSAB
Click Update
Update
Activate
MSAB
MSAB License Management
Update a Alphanumeric License Key
MSAB
MSAB
the License Request
Click Update
USB
Update
MSAB License Management
MSAB
MSAB
Upload License Request
the License Request
Open
MSAB
the Software License Confirmation
MSAB
Manual
the Software License Confirmation
The Machine ID
MSAB License Management
Update
Open
the License Activation
Open
Activate
MSAB
MSAB License Management
Deactivate Alphanumeric License Key
Deactivate Alphanumeric License Key
MSAB
Confirm
MSAB
MSAB License Management
Deactivate Alphanumeric License Key
The License Deactivation
MSAB
USB
Deactivate
Create
MSAB License Management
MSAB
MSAB
Upload License Deactivation
the License Deactivation
Open
MSAB
MSAB
MSAB License Management
The CodeMeter CmStick
USB
CmStick
XRY
Cod

Coursework II Problem
Conventionally
j]>0.5
c_1
AM
HTML
ODT
Microsoft Office
DOCX
PPTX
Times New Roman
Moodle
Turnitin
the Kaggle Social Network Challenge
Order Confirmation Order
Amazon Marketplace
BOOKMART & GAMEXCHANGE
1060 Austria Order Details Order
Judicial Review Condition
BOOKMART & GAMEXCHANGE
Delivery & Handling
Amazon EU Sarl
Value Added
VAT
XHTML
Synchronize Ctrl+Alt+Y Quick Switch Scheme
Last Tool Window
All Tool Windows Shift+Esc
a Tool Window
Caret Ctrl+F3
Symbol Ctrl+Alt+Shift+N NAVIGATE
SYMBOLS Declaration
JavaScript
Ctrl+Shift+B Super Method Ctrl+U Implementation(s
CONTEXT Select In
Favorites Alt+Shift+F
Mnemonic Ctrl+F11
IDE
Cursor Alt+F9 Force Run
Show Execution
Ctrl+F2 Stop Background Processes
Ctrl+Shift+F2 Resume Program F9
Toggle Temporary Line
Shift+F1 Generate Code
Ctrl+Shift+Down Delete Line
Caret Ctrl+Y Join
Shift+F2 Run Inspection
VCS Operations Popup
Update Project Ctrl+T Recent Changes Alt+Shift+C Revert
⌃⌘F Quick Switch Scheme
Last Tool Window
All Tool W

KeyboardInterrupt: 