In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it,
clean it, come up with a data model, insert it into MongoDB and then run some
queries against your database. The set contains data about Arachnid class
animals.

Your task in this exercise is to parse the file, process only the fields that
are listed in the FIELDS dictionary as keys, and return a list of dictionaries
of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim the redundant description in parentheses, such as "(spider)", from the
  'rdf-schema#label' field
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the
  same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the
  cleanup is up to you, e.g. removing "*" prefixes etc. If there is a singular
  synonym, the value should still be formatted in a list.
- strip leading and trailing whitespace from all fields, if there is any
- the output structure should be as follows:

[ { 'label': 'Argiope',
    'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
    'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
    'name': 'Argiope',
    'synonym': ["One", "Two"],
    'classification': {
                      'family': 'Orb-weaver spider',
                      'class': 'Arachnid',
                      'phylum': 'Arthropod',
                      'order': 'Spider',
                      'kingdom': 'Animal',
                      'genus': None
                      }
  },
  { 'label': ... , }, ...
]

  * Note that the value associated with the classification key is a dictionary
    with taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re

# CSV file read by the cells below.
DATAFILE = 'arachnid.csv'

# Maps each source column name (key) to the field name used in the cleaned
# records (value). Only the columns listed here are processed.
FIELDS = {
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    # Taxonomic columns; process_file nests these under 'classification'.
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}


In [8]:
def change_keys(d, fields=None):
    """Rename the keys of ``d`` in place according to a field mapping.

    Args:
        d: dictionary to modify. Keys that are not mentioned in the mapping
            are left untouched.
        fields: optional mapping of ``old_key -> new_key``. When omitted, the
            module-level ``FIELDS`` mapping is used, which matches the
            original one-argument behaviour.

    Returns:
        None. ``d`` is modified in place.

    Raises:
        KeyError: if any source key of the mapping is missing from ``d``.
            The check happens before anything is renamed, so ``d`` is never
            left half-converted.
    """
    if fields is None:
        fields = FIELDS

    missing = [key for key in fields if key not in d]
    if missing:
        raise KeyError("missing source field(s): %s" % ", ".join(sorted(missing)))

    # Pop every source value first and only then write the new keys, so that
    # a mapping whose target name is also a source name (e.g. a -> b, b -> c)
    # cannot overwrite a value that has not been moved yet.
    moved = []
    for old_key, new_key in fields.items():
        moved.append((new_key, d.pop(old_key)))
    for new_key, value in moved:
        d[new_key] = value

In [19]:
# Scratch check: the capture group keeps the text in front of the "(...)" part.
re.findall(r"(.*)\(.*\)", "Argiope (spider)")[0]

['Argiope ']

In [28]:
# Scratch check: a label without parentheses produces no match at all.
not re.findall(r"(.*)\(.*\)", "Tick")

True

In [76]:
# Explore the raw CSV (label column: rdf-schema#label): print the
# 'classification' value of every data row that has such a column.
with open(DATAFILE, "r") as f:
    reader = csv.DictReader(f)
    # Skip the three rows that follow the header row.
    for i in range(3):
        next(reader)

    for line in reader:
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if 'classification' in line:
            print(line['classification'])

In [71]:
# Scratch check of the synonym clean-up: drop a leading "*" marker and the
# whitespace that follows it.
snames = ['* Hydracarina', '* Hydrachnellae', '* Hydrachnidia']
new_snames = []
for sname in snames:
    # startswith() is safe on an empty string, unlike sname[0].
    if sname.startswith('*'):
        sname = sname[1:]
    # Entries without a "*" prefix are kept too instead of being dropped.
    new_snames.append(sname.lstrip())
snames = new_snames
snames

['Hydracarina', 'Hydrachnellae', 'Hydrachnidia']

In [72]:
# Draft of the cleaning pipeline: fills the module-level `data` list with flat
# records (the taxonomic fields are NOT nested under 'classification' here).
process_fields = FIELDS.keys()
data = []
with open(DATAFILE, "r") as f:
    reader = csv.DictReader(f)
    # Skip the three rows that follow the header row.
    for i in range(3):
        next(reader)

    for line in reader:
        # 1. rename the raw column names to the cleaned field names
        change_keys(line)

        # 2. trim out the redundant description in parentheses
        pattern = re.findall(r"(.*)\(.*\)", line['label'])
        if pattern:
            line['label'] = pattern[0].strip()

        # 3. fall back to the label when the name is "NULL" or contains
        #    anything other than word characters and hyphens
        if line['name'] == "NULL" or re.match(r'^[\w-]+$', line['name']) is None:
            line['name'] = line['label']

        # 4. convert "NULL" to None
        for key in FIELDS.values():
            if line[key] == "NULL":
                line[key] = None

        # 5. a synonym that is not None becomes a list
        if line['synonym'] is not None:
            snames_str = re.findall(r'\{(.*)\}', line['synonym'])
            if not snames_str:
                # No "{...}" wrapper: a single synonym, still stored as a list.
                line['synonym'] = [line['synonym']]
            else:
                snames = snames_str[0].split('|')
                new_snames = []
                for sname in snames:
                    # startswith() cannot raise on an empty entry ("a||b").
                    if sname.startswith('*'):
                        sname = sname[1:]
                    sname = sname.lstrip()
                    if sname:
                        new_snames.append(sname)
                line['synonym'] = new_snames

        data.append(line)

In [74]:
# The draft cell above stores flat records without a 'classification' key, so
# this lookup raises the KeyError recorded in the output below.
for line in data:
    print(line['classification'])

KeyError: 'classification'

In [61]:
# Scratch check: remove the leading "*" marker, then the whitespace after it.
'* Heterometrus (Chersonesometrus) Couzijn 1978'.lstrip('*').lstrip()

'Heterometrus (Chersonesometrus) Couzijn 1978'

In [86]:
def _clean_field(value):
    """Strip surrounding whitespace from a raw CSV value; "NULL"/missing -> None."""
    if value is None:
        # csv.DictReader fills short rows with None.
        return None
    value = value.strip()
    if value == "NULL":
        return None
    return value


def _parse_synonyms(raw):
    """Convert a cleaned 'synonym' string into a list of synonym names."""
    braced = re.search(r'\{(.*)\}', raw)
    # "{a|b}" holds several names; anything else is a single synonym.
    candidates = braced.group(1).split('|') if braced else [raw]
    synonyms = []
    for candidate in candidates:
        candidate = candidate.strip()
        if candidate.startswith('*'):
            candidate = candidate[1:].strip()
        # Empty entries (e.g. from "a||b" or "{}") carry no information.
        if candidate:
            synonyms.append(candidate)
    return synonyms


def process_file(filename, fields):
    """Parse the infobox CSV and return a list of cleaned record dictionaries.

    Args:
        filename: path of the CSV file to read.
        fields: mapping of source column name -> output field name. Only
            these columns are processed.

    Returns:
        A list with one dictionary per data row. For every row:
          * all values are whitespace-stripped and "NULL" becomes None;
          * a trailing parenthesised description is removed from 'label';
          * 'name' falls back to 'label' when it is missing, "NULL", or
            contains anything other than word characters and hyphens;
          * a 'synonym' value becomes a list (split on "|" inside "{}",
            "*" prefixes removed, empty entries dropped);
          * the mapped taxonomic fields (kingdom, family, order, phylum,
            genus, class) are nested under the 'classification' key.

    Raises:
        KeyError: if a column named in ``fields`` is absent from the file.
    """
    classification_keys = ["kingdom", "family", "order", "phylum", "genus", "class"]
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the three rows that follow the header row, as the original
        # did (presumably metadata rows -- verify against the data file).
        # next(reader, None) keeps a shorter file from raising StopIteration.
        for _ in range(3):
            next(reader, None)

        for line in reader:
            # 1. keep only the requested columns, renamed and cleaned
            record = {}
            for old_key, new_key in fields.items():
                record[new_key] = _clean_field(line[old_key])

            # 2. trim out the redundant description in parentheses
            label = record.get('label')
            if label is not None:
                trimmed = re.search(r"(.*)\(.*\)", label)
                if trimmed:
                    record['label'] = trimmed.group(1).strip()

            # 3. fall back to the label when the name is unusable
            if 'name' in record:
                name = record['name']
                if name is None or re.match(r'^[\w-]+$', name) is None:
                    record['name'] = record.get('label')

            # 4. a synonym that is present is always stored as a list
            if record.get('synonym') is not None:
                record['synonym'] = _parse_synonyms(record['synonym'])

            # 5. nest the taxonomic fields under 'classification'
            entry = {}
            classification = {}
            for key, value in record.items():
                if key in classification_keys:
                    classification[key] = value
                else:
                    entry[key] = value
            if classification:
                entry['classification'] = classification

            data.append(entry)

    return data

In [87]:
def parse_array(v):
    """Convert a "{a|b|c}"-style string into a list of stripped items.

    Args:
        v: the raw string value.

    Returns:
        When ``v`` starts with "{" and ends with "}", the list of
        "|"-separated items with the braces removed and each item stripped
        of surrounding whitespace. Otherwise a one-element list holding
        ``v`` unchanged (this includes the empty string).
    """
    # startswith/endswith are safe on an empty string, unlike v[0] / v[-1].
    if v.startswith("{") and v.endswith("}"):
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return [v]

In [92]:

# Run the cleaning pipeline on the data file and show the first record.
data = process_file(DATAFILE, FIELDS)
print("Your first entry:")
pprint.pprint(data[0])

# Expected content of the first cleaned record.
first_entry = {
    "label": "Argiope",
    "uri": "http://dbpedia.org/resource/Argiope_(spider)",
    "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.",
    "name": "Argiope",
    "synonym": None,
    "classification": {
        "class": "Arachnid",
        "family": "Orb-weaver spider",
        "genus": None,
        "kingdom": "Animal",
        "order": "Spider",
        "phylum": "Arthropod",
    },
}

# Spot checks against known records of the data set.
assert len(data) == 76
assert data[0] == first_entry
assert data[17]["name"] == "Ogdenia"
assert data[48]["label"] == "Hydrachnidiae"
assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]



Your first entry:
{'classification': {'class': 'Arachnid',
                    'family': 'Orb-weaver spider',
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': 'Spider',
                    'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}
