# Data Wrangling

Handling missing or ill-formatted data and converting it into a format that lends itself more easily to analysis.

## Imports

In [2]:
import numpy as np
import pandas as pd
import os
import re
from translate import Translator

## Definitions

In [3]:
def input_dir():
    """Return the raw-input directory: '<current working dir>/data/input/'."""
    return ''.join([os.getcwd(), '/data/input/'])

def output_dir():
    """Return the output directory: '<current working dir>/data/'."""
    return ''.join([os.getcwd(), '/data/'])

In [4]:
def transl(lang, word):
    """Translate `word` from Portuguese ('pt') into the language code `lang`."""
    return Translator(from_lang='pt', to_lang=lang).translate(word)

In [20]:
# Input files are named base + <species> + filetype_in
base = "sp-"
filetype_in = ".csv"
# Extension of the merged output file
filetype_out = ".csv"
# Header row (column names) of the wrangled output table
table_attributes_def = [
    "species",
    "region",
    "site",
    "longitude",
    "latitude",
    "ref",
]

In [21]:
def wrangler(species):
    """Merge the per-species raw files into one cleaned, comma-separated file.

    For every name in `species`, reads the semicolon-separated file
    `input_dir() + base + name + filetype_in` (its first line is skipped as a
    header) and appends the cleaned rows to
    `output_dir() + 'wrangled_all_species' + filetype_out`. The output file is
    overwritten on every call and starts with `table_attributes_def` as header.

    Cleaning applied to each row:
      * the last 9 fields are dropped (garbage);
      * the last remaining field (reference) keeps only the text before its
        first comma, removing article comments;
      * field 0 (region) is translated from Portuguese to English;
      * field 1 loses its '\\xa0\\xa0\\xa0' padding and leading whitespace;
      * fields 2 and 3 use a dot instead of a comma as decimal separator;
      * fields equal to '-' become empty;
      * the species name is prepended as the first column.

    Args:
        species: iterable of species names used to build the input file names.

    Returns:
        None. The result is written to disk.
    """
    # Function-scope import so this cell stays runnable on its own.
    import csv

    # One cache shared by all species: transl() is REALLY slow and the same
    # regions show up in several input files.
    region_dict = {}

    out_path = output_dir() + 'wrangled_all_species' + filetype_out
    # Context managers guarantee both files are flushed and closed, even if a
    # malformed row raises halfway through.
    with open(out_path, 'w') as out_data:
        # csv.writer quotes any field that itself contains a comma, so such a
        # value cannot shift the columns; plain fields are written unchanged.
        writer = csv.writer(out_data, lineterminator='\n')
        writer.writerow(table_attributes_def)

        for sp in species:
            with open(input_dir() + base + sp + filetype_in) as dros_data:
                # Skip the header line; the default avoids StopIteration on
                # an empty file.
                next(dros_data, None)
                for raw_line in dros_data:
                    # Blank lines carry no record.
                    if not raw_line.strip():
                        continue
                    # Line to list, dropping the 9 trailing garbage entries.
                    line = raw_line.split(';')[:-9]
                    # Remove reference (article) comments.
                    line[-1] = line[-1].split(',')[0]
                    # Translate region (pt to en), hitting the cache first.
                    region = line[0]
                    if region not in region_dict:
                        region_dict[region] = transl('en', region)
                    line[0] = region_dict[region]
                    # Remove \xa0 padding and leading whitespace.
                    line[1] = line[1].replace(u'\xa0\xa0\xa0', '').lstrip()
                    # Set float notation (dot instead of comma).
                    line[2] = line[2].replace(',', '.')
                    line[3] = line[3].replace(',', '.')
                    # '-' marks a missing value; emit an empty field instead.
                    line = [x if x != '-' else '' for x in line]
                    writer.writerow([sp] + line)


## Data Wrangling

In [None]:
# Species names; each one selects an input file named base + <name> + filetype_in
species = [
    "cardini",
    "cardinoides",
    "neocardini",
    "neomorpha",
    "polymorpha",
    "simulans",
    "willistoni",
]
wrangler(species)

# Intro to Computer Science 2017/1 Data

## Open Wrangled .csv Data with Pandas

## Print DataFrame

## DataFrame Info

## Fill NaN

## DataFrame to .csv

# Intro to Computer Science 2017/2 Data

## Open Wrangled .csv Data with Pandas

## Print DataFrame

## DataFrame Info

## Fill NaN

## DataFrame to .csv