In [1]:
"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- check if the field "productionStartYear" contains a year
- check if the year is in range 1886-2014
- convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in the range as described above,
  write that line to the output_good file
- if the value of the field is not a valid year as described above, 
  write that line to the output_bad file
- discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.

You can write helper functions for checking the data and writing the files, but we will call only the 
'process_file' with 3 arguments (inputfile, output_good, output_bad).
"""
import csv
import pprint

# Default file names; the driver cell passes all three to process_file.
# CSV that is read as input.
INPUT_FILE = 'autos.csv'
# Destination for the rows process_file classifies as good.
OUTPUT_GOOD = 'autos-valid.csv'
# Destination for the rows process_file classifies as bad.
OUTPUT_BAD = 'FIXME-autos.csv'



In [2]:
# Read the whole input CSV into memory: one dict per row, keyed by column name.
# `header` keeps the column names in file order.
data = []
with open(INPUT_FILE, "r") as f:
    reader = csv.DictReader(f)
    header = reader.fieldnames
    data.extend(reader)

In [3]:
# Peek at the first ten parsed rows to see the column names and value formats.
data[:10]

[{'22-rdf-syntax-ns#type': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  '22-rdf-syntax-ns#type_label': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  'URI': 'URI',
  'assembly': 'http://dbpedia.org/ontology/assembly',
  'assembly_label': 'http://dbpedia.org/ontology/assembly',
  'automobilePlatform': 'http://dbpedia.org/ontology/automobilePlatform',
  'automobilePlatform_label': 'http://dbpedia.org/ontology/automobilePlatform',
  'bodyStyle': 'http://dbpedia.org/ontology/bodyStyle',
  'bodyStyle_label': 'http://dbpedia.org/ontology/bodyStyle',
  'class': 'http://dbpedia.org/ontology/class',
  'class_label': 'http://dbpedia.org/ontology/class',
  'depiction': 'http://xmlns.com/foaf/0.1/depiction',
  'depiction_label': 'http://xmlns.com/foaf/0.1/depiction',
  'designCompany': 'http://dbpedia.org/ontology/designCompany',
  'designCompany_label': 'http://dbpedia.org/ontology/designCompany',
  'designer': 'http://dbpedia.org/ontology/designer',
  'designer_label': 'http://dbpe

In [9]:
import re

In [15]:
# Pattern for "starts with at least one digit"; used to spot year-like values.
year_type_re = re.compile(r'^[0-9]+')

# Collect the raw productionStartYear string of every row, in file order.
year_lines = []
for item in data:
    year_lines.append(item['productionStartYear'])

In [16]:
# Print the first four characters of every value that begins with a digit.
# The pattern is anchored with '^', so match() is equivalent to search() here.
for line in year_lines:
    if year_type_re.match(line):
        print(line[:4])

1964
1989
1977
1976
1982
1992
1965
1948
1996
1997
1957
1946
1969
1969
1970
1947
1957
1938
1959
1977
1959
1957
2108
1936
0050
1948
1964
1970
1953
1971
1991
2002
2002
0014
1908
1985
1939
1970
1957
1969
1958
1974
1996
1939
1958
1974
2001
1990
1975
1983
1953
1962
1964
1970
0001
1964
1975
1973
1981
1966
1979
1961
1975
1953
1956
1978
1978
1908
2003
1969
1955
1991
2002
1996
1995
1955
1994
1974
1955
1998
1927
1951
1998
1979
1960
1996
1955
2003
1983
2002
1978
1985
1968
1970
1968
1979
1975
1971
1984
1994
1959
1956
1994
1968
1957
1977
1983
1972
2001
1984
1968
1999
1978
1959
1997
1960
1956
1949
1943
1956
1938
1973
1947
1975
1966
1963
1997
1958
1959
1986
1932
1976
1970
1969
1984
1983
1980
1992
1960
1975
1984
1991
1988
1975
1985
1995
1975
1985
1987
1967
1983
1950
1955
1955
1968
2004
1967
1953
1958
1966
1985
2000
1961
1986
1972
1953
1982
1937
1966
1965
1949
1993
1954
1968
2001
1996
2004
1998
2001
1998
1952
1992
2002
1986
1978
1970
1997
2004
2004
2005
1933
1961
1935
1996
1953
1978
1978
2002
1987
2010


In [20]:
# Exploratory check: for each row, print the result of searching its rdf type
# column for "dbpedia" (a match object, or None when the text is absent).
for line in data:
    rdf_type = line['22-rdf-syntax-ns#type']
    print(re.search("dbpedia", rdf_type))

None
None
None
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_Match object at 0x109a39d30>
<_sre.SRE_

In [21]:
def from_dbpedia(line):
    """Tell whether a row describes a dbpedia.org resource.

    The decision is made on the row's 'URI' column, because the task asks to
    discard rows whose URI is not from dbpedia.org. The rdf type column is not
    consulted: a row can carry a dbpedia type while its URI points elsewhere.

    Args:
        line: one row as a dict (for example as yielded by csv.DictReader).

    Returns:
        A regex match object (truthy) when the URI contains "dbpedia.org",
        otherwise None.
    """
    # A missing or None URI is treated as an empty string, i.e. "not dbpedia".
    uri = line.get('URI') or ''
    # The dot is escaped so that it only matches a literal '.'.
    return re.search(r"dbpedia\.org", uri)

In [25]:
def has_a_year(string):
    """Check whether a field value starts with a four-digit year.

    A value qualifies when its first four characters are digits and the fifth
    character (if any) is not a digit, e.g. '1964' or
    '1989-01-01T00:00:00+02:00'. Values such as 'NULL', '7' or '19ab' do not
    qualify, so slicing the first four characters of an accepted value always
    yields something int() can parse.

    Args:
        string: the raw field value; None and '' are accepted and mean
            "no year".

    Returns:
        A regex match object (truthy) when a leading year is present,
        otherwise None. Whether the year lies in a plausible range is not
        checked here.
    """
    if not string:
        return None
    # (?![0-9]) rejects longer digit runs such as '20101'.
    return re.search(r'^[0-9]{4}(?![0-9])', string)

None


In [22]:
def year_btw_1886_2014(year):
    """Check that a year lies in the inclusive range 1886-2014.

    Args:
        year: the year as an int or as a string that int() can parse.

    Returns:
        True when the value converts to an integer between 1886 and 2014
        (both ends included). False when it is out of range or cannot be
        converted at all (e.g. 'NULL', '19ab', None), so a single malformed
        value is reported as invalid instead of raising.
    """
    try:
        value = int(year)
    except (TypeError, ValueError):
        return False
    return 1886 <= value <= 2014

In [39]:
def _write_rows(path, header, rows):
    """Write the row dicts in `rows` to `path` as CSV, with `header` as the column order."""
    with open(path, "w") as out:
        writer = csv.DictWriter(out, delimiter=",", fieldnames=header)
        writer.writeheader()
        writer.writerows(rows)


def process_file(input_file, output_good, output_bad):
    """Split the rows of a CSV by the validity of 'productionStartYear'.

    Each row is handled as follows:
      * rows rejected by from_dbpedia are dropped (written to neither file);
      * rows whose 'productionStartYear' passes has_a_year and whose first
        four characters pass year_btw_1886_2014 go to `output_good`, with the
        field replaced by that four-character year;
      * every other row goes to `output_bad` with the field left untouched.

    All remaining fields are written back unchanged, and both output files get
    the same header as the input.

    Args:
        input_file: path of the CSV to read.
        output_good: path of the CSV that receives the valid rows.
        output_bad: path of the CSV that receives the invalid rows.
    """
    good_data = []
    bad_data = []

    # Rows are classified while they are read, so no extra copy of the whole
    # input is kept around.
    with open(input_file, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for item in reader:
            if not from_dbpedia(item):
                continue

            # A None value is handled like an empty one and ends up in the bad file.
            raw_year = item['productionStartYear'] or ''
            year = raw_year[:4]
            try:
                is_valid = bool(has_a_year(raw_year)) and year_btw_1886_2014(year)
            except ValueError:
                # A prefix that looks numeric but is not an integer (e.g. '19ab')
                # is a bad value, not a reason to abort the whole run.
                is_valid = False

            if is_valid:
                item['productionStartYear'] = year
                good_data.append(item)
            else:
                bad_data.append(item)

    _write_rows(output_good, header, good_data)
    _write_rows(output_bad, header, bad_data)


In [40]:
# Run the cleaning pass using the three file-name constants as input and outputs.
process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)

In [None]:
# Another version (kept for reference only).
# Everything below sits inside a bare string literal, so it is never executed
# and does not define or replace any function.
# What the text describes:
#   - rows are kept only when row['URI'] contains "dbpedia.org";
#   - the first four characters of productionStartYear are converted with
#     int() and stored back into the row, for out-of-range years as well;
#   - when int() fails, the row goes to the bad list only if those four
#     characters are 'NULL'; any other non-numeric value lands in neither list.

'''
def process_file(input_file, output_good, output_bad):
    # store data into lists for output
    data_good = []
    data_bad = []
    with open(input_file, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for row in reader:
            # validate URI value
            if row['URI'].find("dbpedia.org") < 0:
                continue

            ps_year = row['productionStartYear'][:4]
            try: # use try/except to filter valid items
                ps_year = int(ps_year)
                row['productionStartYear'] = ps_year
                if (ps_year >= 1886) and (ps_year <= 2014):
                    data_good.append(row)
                else:
                    data_bad.append(row)
            except ValueError: # non-numeric strings caught by exception
                if ps_year == 'NULL':
                    data_bad.append(row)

    # Write processed data to output files
    with open(output_good, "w") as good:
        writer = csv.DictWriter(good, delimiter=",", fieldnames= header)
        writer.writeheader()
        for row in data_good:
            writer.writerow(row)

    with open(output_bad, "w") as bad:
        writer = csv.DictWriter(bad, delimiter=",", fieldnames= header)
        writer.writeheader()
        for row in data_bad:
            writer.writerow(row)


'''