# Data Wrangling

Handling or converting missing or ill-formatted data into a format that lends itself more easily to analysis.

## Imports

In [78]:
import numpy as np
import pandas as pd
import os
import re
from translate import Translator

## Definitions

In [79]:
def input_dir():
    """Return the raw-data directory under the current working directory.

    The path is built from ``os.getcwd()`` at call time and always ends
    with a trailing slash, so file names can be appended directly.
    """
    cwd = os.getcwd()
    return f"{cwd}/data/input/"

def output_dir():
    """Return the output directory under the current working directory.

    The path is built from ``os.getcwd()`` at call time and always ends
    with a trailing slash, so file names can be appended directly.
    """
    cwd = os.getcwd()
    return f"{cwd}/data/"

In [109]:
def transl(lang, word):
    """Translate *word* from Portuguese ('pt') into the language *lang*.

    A new ``Translator`` is built on every call; the result of its
    ``translate`` method is returned unchanged.
    """
    return Translator(from_lang='pt', to_lang=lang).translate(word)

In [110]:
# Dataset files are named <base><data_id><filetype>.
base = 'sp-'
# Input and output currently share the same extension.
filetype_in = filetype_out = '.csv'
# Column names written as the header row of each wrangled file.
table_attributes_def = [
    'region',
    'site',
    'longitude',
    'latitude',
    'ref',
]

In [117]:
def wrangler(data_id):
    """Clean one raw dataset and write it out as a comma-separated file.

    Reads ``input_dir() + base + data_id + filetype_in`` (fields separated
    by ';'), skips its first line, and writes the cleaned rows to
    ``output_dir() + 'wrangled_' + base + data_id + filetype_out`` below a
    header row built from ``table_attributes_def``.

    For every data row:

    * the last nine fields are dropped,
    * the final remaining field (the reference) is cut at its first comma,
    * the region (field 0) is translated to English via ``transl``,
    * the triple non-breaking-space run and leading whitespace are removed
      from the site (field 1),
    * the decimal comma in fields 2 and 3 is replaced by a dot.

    Each cleaned row is also printed.

    Args:
        data_id: Dataset identifier inserted between ``base`` and the file
            extension, e.g. ``'cardini'``.

    Returns:
        None. The result is the file written to the output directory.

    Raises:
        OSError: If the input file cannot be read or the output file
            cannot be created.
        IndexError: If a row has too few ';'-separated fields.
    """
    in_path = input_dir() + base + data_id + filetype_in
    out_path = output_dir() + 'wrangled_' + base + data_id + filetype_out

    # Cache of region translations: the translate call is REALLY slow, so
    # each distinct region is translated only once.
    region_dict = {}

    # Both files are managed by the same ``with`` so they are flushed and
    # closed even when a malformed row raises midway.
    with open(in_path) as dros_data, open(out_path, 'w') as out_data:
        # Header row with the new table attribute definitions.
        out_data.write(','.join(table_attributes_def))
        out_data.write('\n')

        # Ignore the first input line; the default keeps an empty input
        # file from raising StopIteration.
        next(dros_data, None)

        for raw_line in dros_data:
            # Line to list, without the 9 trailing entries (garbage).
            fields = raw_line.split(';')[:-9]

            # Remove reference (article) comments after the first comma.
            fields[-1] = fields[-1].split(',')[0]

            # Translate region (pt to en), reusing earlier translations.
            region = fields[0]
            if region not in region_dict:
                region_dict[region] = transl('en', region)
            fields[0] = region_dict[region]

            # Remove \xa0 padding and leading whitespace from the site.
            fields[1] = fields[1].replace(u'\xa0\xa0\xa0', '').lstrip()

            # Set float notation (dot instead of comma).
            # NOTE(review): coordinates may keep surrounding spaces
            # (e.g. ' -74.10 '); strip here if downstream parsing needs it.
            fields[2] = fields[2].replace(u',', '.')
            fields[3] = fields[3].replace(u',', '.')

            print(fields)
            out_data.write(','.join(fields))
            out_data.write('\n')

## Data Wrangling

In [None]:
# Wrangle the 'cardini' dataset: input and output file names are derived
# from this id by wrangler(); each cleaned row is printed below.
wrangler('cardini')

['South America', 'tavares', '-48.32', '27.39 ', 'Schmitz H.J. et al. (2007)']
['South America', 'santa marta/colombia', ' -74.10 ', '11.18', 'Hoenigsberg H.F. (1995)']
['South America', 'caracas', '-66.56', '10.35', 'Heed W.B. and Russell J.S. (1971)']
['South America', 'duaca', '-69.08', '10.22', 'Oelshlegel F.J. and Brewer G.J. (1975)']
['South America', 'barquisimeto', '-69.18', '10.03', 'Hunter A.S. (1970)']
['South America', 'carpentaro', ' -70.00', '10.00 ', 'Yoon J.S. (1984)']
['South America', 'carpentaro', ' -70.00', '10.00 ', 'Yoon Z. (1996)']
['South America', 'carpentaro', ' -70.00', '10.00 ', 'Yoon J.S. (1984)']
['South America', 'georgetown/guyana', '-58.10', '6.46', 'Heed W.B. and Russell J.S. (1971)']
['South America', 'bogota', '-74.05', '4.48', 'Heed W.B. and Russell J.S. (1971)']
['South America', 'bogota', '-74.05', '4.48', 'Hunter A.S. (1966)']
['South America', 'fusagasuga', '-74.21', '4.22', 'Hunter A.S. and Navarro A. (1969)']
['South America', 'orellana', '-76

['North America', 'honolulu', '-157.5', '21.19', ' Herforth R.S. et al. (1984)']
['North America', 'hawaii', '-155.4', '19.55', ' Anonymous XXX. (1986)']
['North America', 'hawaii', '-155.4', '19.55', ' Kaneshiro K. (1986)']
['North America', 'hawaii', '-155.4', '19.55', ' Leblanc L. et al. (2109)']
['North America', 'monterey', '-120.3', '34.3', ' Stalker H.D. (1953)']
['North America', 'ensenada/bcn', '-116.37', '32.52', ' Heed W.B. and Russell J.S. (1971)']
['North America', 'arizona', '-112', '35', ' Patterson J.T. (1943)']
['North America', 'chiricahua mountains/az', '-109.23', '31.56', ' Patterson J.T. and Wagner R.P. (1943)']
['North America', 'chinipas', '-108.32', '27.22', ' Heed W.B. and Russell J.S. (1971)']
['North America', 'culiacan/sin', '-107.27', '24.27', ' Gallo A.J. and Salceda V.M. (1974)']
['North America', 'san rafael oribo', '-106', '30', ' Heed W.B. and Russell J.S. (1971)']
['North America', 'durango/dur', '-104.4', '24.01', ' Stalker H.D. (1953)']
['North Amer

# Intro to Computer Science 2017/1 Data

## Open Wrangled .csv Data with Pandas

## Print DataFrame

## DataFrame Info

## Fill NaN

## DataFrame to .csv

# Intro to Computer Science 2017/2 Data

## Open Wrangled .csv Data with Pandas

## Print DataFrame

## DataFrame Info

## Fill NaN

## DataFrame to .csv