# Correcting Validity Example
This example shows how to validate the "productionStartYear" field of the DBpedia autos data file.
The following things should be done:
- Check if the field "productionStartYear" contains a year.
- Check if the year is in range 1886-2014.
- Convert the value of the field to be just a year (not full datetime).
- The rest of the fields and values stay the same.
- If the value of the field is a valid year in the range as described above, write that line to the output_good file.
- If the value of the field is not a valid year as described above, write that line to the output_bad file.
- Discard rows (neither write to good nor bad) if the URI is not from dbpedia.org.
- Use the provided way of reading and writing data (DictReader and DictWriter). They will take care of dealing with the header.

In [1]:
import csv
import pprint
import re
from zipfile import ZipFile
import io

# Zip archive that contains the input CSV.
fname = 'autos.zip'
# Output path for rows whose "productionStartYear" passed validation.
OUTPUT_GOOD = 'autos-valid.csv'
# Output path for rows whose "productionStartYear" failed validation.
OUTPUT_BAD = 'FIXME-autos.csv'

# Open the 'autos.csv' member of the archive and wrap its binary stream in a
# text-mode reader so that csv.DictReader receives str lines.
# NOTE(review): INPUT_FILE outlives this `with` block, i.e. it is read after
# the ZipFile has been closed -- confirm that still works on the target Python
# version. No encoding is passed to TextIOWrapper, so the platform default is
# used; confirm that matches the data file.
with ZipFile(fname, 'r') as zfile:
    INPUT_FILE = io.TextIOWrapper(zfile.open('autos.csv'))

def validate_uri(row):
    """Check whether a URI value points at the dbpedia.org host.

    Args:
        row: The value of a row's "URI" field (a string). Despite the
            parameter name this is a single field value, not a whole row.

    Returns:
        A ``re.Match`` object (truthy) when the value starts with
        ``http://dbpedia.org`` followed by the end of the host part,
        otherwise ``None``. Non-string values (e.g. ``None`` from a short
        CSV row) yield ``None`` instead of raising.
    """
    if not isinstance(row, str):
        return None
    # The dot is escaped so it only matches a literal '.', and the lookahead
    # requires the host to end here, so look-alike hosts such as
    # 'http://dbpedia.org.example.com' or 'http://dbpediaXorg' are rejected.
    return re.match(r'http://dbpedia\.org(?=[/:?#]|$)', row)

def validate_date(row, min_year=1886, max_year=2014):
    """Extract a production year from a field value and range-check it.

    Args:
        row: The value of a row's "productionStartYear" field (a string).
            Despite the parameter name this is a single field value.
        min_year: Smallest accepted year (inclusive).
        max_year: Largest accepted year (inclusive).

    Returns:
        The year as an ``int`` when the value starts with exactly four
        digits and that year lies within ``[min_year, max_year]``;
        otherwise ``None``. Non-string values also yield ``None``.
    """
    if not isinstance(row, str):
        return None
    # The negative lookahead rejects longer digit runs such as '19999',
    # which would otherwise be silently truncated to the year 1999.
    match = re.match(r'\d{4}(?!\d)', row)
    if match is None:
        return None
    year = int(match.group())
    return year if min_year <= year <= max_year else None

def _write_rows(path, fieldnames, rows):
    # Write a header line followed by `rows` (dicts) to a CSV file at `path`.
    # newline='' hands line-ending control to the csv module; without it the
    # writer's '\r\n' gets translated again and produces blank lines on
    # Windows.
    with open(path, "w", newline="") as out:
        writer = csv.DictWriter(out, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_file(input_file, output_good, output_bad):
    """Split the rows of a CSV file by validity of "productionStartYear".

    Rows whose "URI" is rejected by ``validate_uri`` are discarded. For the
    remaining rows, "productionStartYear" is passed to ``validate_date``:
    when it returns a year, the field is replaced by that year and the row
    goes to ``output_good``; otherwise the unchanged row goes to
    ``output_bad``. Both output files get the input's header.

    Args:
        input_file: An open text-mode file (or any iterable of CSV lines)
            with a header row containing "URI" and "productionStartYear".
        output_good: Path of the CSV file to write valid rows to.
        output_bad: Path of the CSV file to write invalid rows to.

    Raises:
        KeyError: If the input lacks a "URI" or "productionStartYear" column.
    """
    # Rows are buffered so that both output files are only written once the
    # whole input has been read.
    data_good = []
    data_bad = []
    reader = csv.DictReader(input_file)
    header = reader.fieldnames
    for row in reader:
        # Rows that do not come from dbpedia.org go to neither output.
        if not validate_uri(row['URI']):
            continue
        new_date = validate_date(row['productionStartYear'])
        if new_date:
            # Keep only the year instead of the full datetime string.
            row['productionStartYear'] = new_date
            data_good.append(row)
        else:
            data_bad.append(row)

    _write_rows(output_good, header, data_good)
    _write_rows(output_bad, header, data_bad)

# Driver call for the example. The `if False` guard keeps the pipeline from
# running when this cell/module is executed; change the condition to run it.
if False:
    process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)