In [64]:
from pyspark.sql.functions import concat_ws, column, udf, explode
import pyspark.sql.types as types

In [7]:
# Version tag of the iDigBio export to load; it is interpolated into the
# parquet file path when the data is read.
# NOTE(review): the value looks like a YYYYMMDD snapshot date — confirm.
idb_df_version = "20161119"

In [8]:
# Read the "100k" iDigBio parquet file for the selected version, then show
# how many rows were loaded.
idb_parquet_path = "/guoda/data/idigbio-{0}-100k.parquet".format(idb_df_version)
idb_df = sqlContext.read.parquet(idb_parquet_path)
idb_df.count()

100000

In [15]:
# Print the DataFrame's column tree to see which fields are available.
idb_df.printSchema()

root
 |-- barcodevalue: string (nullable = true)
 |-- basisofrecord: string (nullable = true)
 |-- bed: string (nullable = true)
 |-- canonicalname: string (nullable = true)
 |-- catalognumber: string (nullable = true)
 |-- class: string (nullable = true)
 |-- collectioncode: string (nullable = true)
 |-- collectionid: string (nullable = true)
 |-- collectionname: string (nullable = true)
 |-- collector: string (nullable = true)
 |-- commonname: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- coordinateuncertainty: float (nullable = true)
 |-- country: string (nullable = true)
 |-- countrycode: string (nullable = true)
 |-- county: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- coreid: string (nullable = true)
 |    |-- dc:rights: string (nullable = true)
 |    |-- dcterms:accessRights: string (nullable = true)
 |    |-- dcterms:bibliographicCitation: string (nullable = true)
 |    |-- dcterms:language: string (nullable = true)
 |    |-- d

In [65]:
# Build one free-text "note" per record by joining the three remark-style
# fields with a single space, keep the record's uuid next to it, and drop
# records whose combined note is the empty string.
note_sources = [
    idb_df["data.dwc:occurrenceRemarks"],
    idb_df["data.dwc:eventRemarks"],
    idb_df["data.dwc:fieldNotes"],
]
note_text = concat_ws(" ", *note_sources).alias("note")

notes = idb_df.select(note_text, idb_df["uuid"]).where(column("note") != "")

In [56]:
# Peek at the first row and count how many records have a non-empty note.
print(notes.head())
print(notes.count())

Row(note='On sandy soil over rocks.', uuid='b73cb633-3eb9-43a1-a0c6-08b07805699a')
27667


In [81]:
import nltk
from nltk.corpus import stopwords

# English stop words held in a set for the per-token membership test done
# in tokenize().
stopwords_set = set(stopwords.words('english'))
# Single Treebank tokenizer instance, created once and reused by tokenize().
t = nltk.tokenize.treebank.TreebankWordTokenizer() 

def tokenize(s):
    '''
    Split a string into word tokens with NLTK's Treebank tokenizer and
    drop the tokens that appear in the English stop word set.

    Parameters:
        s: the text to tokenize. May be None or empty; Spark hands None
           to a Python UDF when the column value is null.

    Returns:
        A flat list of token strings in their original order and casing.
        An empty list is returned for None or empty input.

    Notes:
        Stop words are matched case-sensitively, so capitalized forms
        such as 'Some' or 'On' are kept.
        NOTE(review): confirm whether that is intended; comparing
        word.lower() against the set would drop them as well.
    '''

    # Guard first: the tokenizer cannot process None, and an empty string
    # has no tokens anyway.
    if not s:
        return []

    # nltk.tokenize.word_tokenize would first split the text into sentences
    # with PunktSentenceTokenizer and then word-tokenize each sentence; only
    # the Treebank word tokenizer is applied here, to the string as a whole.
    return [word for word in t.tokenize(s) if word not in stopwords_set]

# Expose tokenize() to Spark as a UDF whose return type is array<string>.
udf_tokenize = udf(tokenize, types.ArrayType(types.StringType()))

# Try the plain Python function on a sample string containing dashes, a
# contraction, quotes, punctuation and alphanumeric identifiers.
print(tokenize('a-uuid-with-dashes Some isn\'t "99", nor is it good; only bad. 3kas9203h: CAT32423432'))

['a-uuid-with-dashes', 'Some', "n't", '``', '99', "''", ',', 'good', ';', 'bad.', '3kas9203h', ':', 'CAT32423432']


In [91]:
# Attach each note's token list as an array column, then explode it so the
# result holds one (uuid, token) row per token.
notes_tokenized = notes.withColumn("tokens", udf_tokenize(notes["note"]))
tokens = notes_tokenized.select(
    notes["uuid"],
    explode(column("tokens")).alias("token"),
)

In [92]:
# Show the first five (uuid, token) rows and the total number of token rows.
print(tokens.head(5))
print(tokens.count())

[Row(uuid='b73cb633-3eb9-43a1-a0c6-08b07805699a', token='On'), Row(uuid='b73cb633-3eb9-43a1-a0c6-08b07805699a', token='sandy'), Row(uuid='b73cb633-3eb9-43a1-a0c6-08b07805699a', token='soil'), Row(uuid='b73cb633-3eb9-43a1-a0c6-08b07805699a', token='rocks'), Row(uuid='b73cb633-3eb9-43a1-a0c6-08b07805699a', token='.')]
263228


In [93]:
# Term-frequency index: how many times each token occurs within each record,
# i.e. one row per distinct (uuid, token) pair together with its count.
idb_tf_index = tokens.groupBy("uuid", "token").count()

In [94]:
# Show ten index rows and the number of distinct (uuid, token) pairs.
print(idb_tf_index.head(10))
print(idb_tf_index.count())

[Row(uuid='b73d735c-43b4-4a45-85af-4a684ceb4e8c', token='marshy', count=1), Row(uuid='b73f5953-6c64-40fb-a056-f407f88605c5', token='Moist', count=1), Row(uuid='b74692e2-73f8-4b13-93e8-a5a5d3b7e67f', token='menziesii', count=1), Row(uuid='b748a569-e93e-403b-a59c-1538adee6c22', token='2', count=1), Row(uuid='b748a569-e93e-403b-a59c-1538adee6c22', token='5.5', count=1), Row(uuid='b74a0670-ff75-4dfc-aa04-97e7774936fa', token='anthers', count=1), Row(uuid='b74a8cee-d5a6-4c57-9634-928be0c74ee5', token='Tech', count=1), Row(uuid='b74b595d-efb3-4c92-9f3f-5ac6b8b47fd0', token=';', count=1), Row(uuid='b74dfbc8-af96-47b0-9c62-1d2fabbc1a96', token='girgensonhnii', count=1), Row(uuid='b74f4dec-ac3c-4bc5-bbe4-aabd3d34e440', token='along', count=1)]
232306
