# Data Processing (Vanilla Python)

| street | city | ... |
|--------|------|-----|
| HIGH ST | SACRAMENTO | ... |
| LOW ST | SACRAMENTO | ... |
| ... | ... | ... |

In [1]:
{"street": ["HIGH ST", "LOW ST"],
 "city": ["SACRAMENTO", "SACRAMENTO"]}

{'street': ['HIGH ST', 'LOW ST'], 'city': ['SACRAMENTO', 'SACRAMENTO']}

In [2]:
{1: ["HIGH ST", "SACRAMENTO"],
 2: ["LOW ST", "SACRAMENTO"]}

{1: ['HIGH ST', 'SACRAMENTO'], 2: ['LOW ST', 'SACRAMENTO']}

In [3]:
{0: {"street": "HIGH ST",
       "city": "SACRAMENTO"},
 1: {"street": "LOW ST",
       "city": "SACRAMENTO"}}

{0: {'street': 'HIGH ST', 'city': 'SACRAMENTO'},
 1: {'street': 'LOW ST', 'city': 'SACRAMENTO'}}

In [4]:
header = ["street", "city"]
data = [["HIGH ST", "SACRAMENTO"],
        ["LOW ST", "SACRAMENTO"]]

All of these are possible.

We will use a `list` of `dict`s (one `dict` per row).

In [5]:
data = [{"street": "HIGH ST",
         "city": "SACRAMENTO"},
        {"street": "LOW ST",
         "city": "SACRAMENTO"}]

In [6]:
data

[{'street': 'HIGH ST', 'city': 'SACRAMENTO'},
 {'street': 'LOW ST', 'city': 'SACRAMENTO'}]

# 1. Read the data

In [7]:
import csv
# The csv module documentation asks for files to be opened with newline=""
# so that the reader itself takes care of line endings.
with open("data/sacramento_data.csv", "r", newline="") as csvfile:
    # No fieldnames are passed, so DictReader takes them from the first line
    reader = csv.DictReader(csvfile)

    # Start counting at 1 so the printed row number needs no arithmetic
    for row_number, row in enumerate(reader, start=1):
        print(f"This is row number {row_number}")
        print("========================")
        print(row)

This is row number 1
{'street': '3526 HIGH ST', 'city': 'SACRAMENTO', 'zip': '95838', 'state': 'CA', 'beds': '2', 'baths': '1', 'sq__ft': '836', 'type': 'Residential', 'sale_date': 'Wed May 21 00:00:00 EDT 2008', 'price': '59222', 'latitude': '38.631913', 'longitude': '-121.434879'}
This is row number 2
{'street': '51 OMAHA CT', 'city': 'SACRAMENTO', 'zip': '95823', 'state': 'CA', 'beds': '3', 'baths': '1', 'sq__ft': '1167', 'type': 'Residential', 'sale_date': 'Wed May 21 00:00:00 EDT 2008', 'price': '68212', 'latitude': '38.478902', 'longitude': '-121.431028'}
This is row number 3
{'street': '2796 BRANCH ST', 'city': 'SACRAMENTO', 'zip': '95815', 'state': 'CA', 'beds': '2', 'baths': '1', 'sq__ft': '796', 'type': 'Residential', 'sale_date': 'Wed May 21 00:00:00 EDT 2008', 'price': '68880', 'latitude': '38.618305', 'longitude': '-121.443839'}
This is row number 4
{'street': '2805 JANETTE WAY', 'city': 'SACRAMENTO', 'zip': '95815', 'state': 'CA', 'beds': '2', 'baths': '1', 'sq__ft': '852

This is row number 315
{'street': '1044 GALSTON DR', 'city': 'FOLSOM', 'zip': '95630', 'state': 'CA', 'beds': '4', 'baths': '2', 'sq__ft': '2581', 'type': 'Residential', 'sale_date': 'Tue May 20 00:00:00 EDT 2008', 'price': '450000', 'latitude': '38.676306', 'longitude': '-121.09954'}
This is row number 316
{'street': '4440 SYCAMORE AVE', 'city': 'SACRAMENTO', 'zip': '95841', 'state': 'CA', 'beds': '3', 'baths': '1', 'sq__ft': '2068', 'type': 'Residential', 'sale_date': 'Tue May 20 00:00:00 EDT 2008', 'price': '460000', 'latitude': '38.646374', 'longitude': '-121.353658'}
This is row number 317
{'street': '1032 SOUZA DR', 'city': 'EL DORADO HILLS', 'zip': '95762', 'state': 'CA', 'beds': '3', 'baths': '2', 'sq__ft': '0', 'type': 'Residential', 'sale_date': 'Tue May 20 00:00:00 EDT 2008', 'price': '460000', 'latitude': '38.668239', 'longitude': '-121.064437'}
This is row number 318
{'street': '9760 LAZULITE CT', 'city': 'ELK GROVE', 'zip': '95624', 'state': 'CA', 'beds': '4', 'baths': '3

This is row number 686
{'street': '4290 BLACKFORD WAY', 'city': 'SACRAMENTO', 'zip': '95823', 'state': 'CA', 'beds': '3', 'baths': '2', 'sq__ft': '1513', 'type': 'Residential', 'sale_date': 'Fri May 16 00:00:00 EDT 2008', 'price': '193500', 'latitude': '38.470494', 'longitude': '-121.454162'}
This is row number 687
{'street': '5890 TT TRAK', 'city': 'FORESTHILL', 'zip': '95631', 'state': 'CA', 'beds': '0', 'baths': '0', 'sq__ft': '0', 'type': 'Residential', 'sale_date': 'Fri May 16 00:00:00 EDT 2008', 'price': '194818', 'latitude': '39.020808', 'longitude': '-120.821518'}
This is row number 688
{'street': '7015 WOODSIDE DR', 'city': 'SACRAMENTO', 'zip': '95842', 'state': 'CA', 'beds': '4', 'baths': '2', 'sq__ft': '1578', 'type': 'Residential', 'sale_date': 'Fri May 16 00:00:00 EDT 2008', 'price': '195000', 'latitude': '38.693071', 'longitude': '-121.332365'}
This is row number 689
{'street': '6019 CHESHIRE WAY', 'city': 'CITRUS HEIGHTS', 'zip': '95610', 'state': 'CA', 'beds': '4', 'bat

{'street': '6709 ROSE BRIDGE DR', 'city': 'ROSEVILLE', 'zip': '95678', 'state': 'CA', 'beds': '3', 'baths': '2', 'sq__ft': '2172', 'type': 'Residential', 'sale_date': 'Fri May 16 00:00:00 EDT 2008', 'price': '350000', 'latitude': '38.792461', 'longitude': '-121.275711'}
This is row number 815
{'street': '281 SPYGLASS HL', 'city': 'ROSEVILLE', 'zip': '95678', 'state': 'CA', 'beds': '3', 'baths': '2', 'sq__ft': '2100', 'type': 'Condo', 'sale_date': 'Fri May 16 00:00:00 EDT 2008', 'price': '350000', 'latitude': '38.762153', 'longitude': '-121.283451'}
This is row number 816
{'street': '7709 RIVER VILLAGE DR', 'city': 'SACRAMENTO', 'zip': '95831', 'state': 'CA', 'beds': '3', 'baths': '2', 'sq__ft': '1795', 'type': 'Residential', 'sale_date': 'Fri May 16 00:00:00 EDT 2008', 'price': '351000', 'latitude': '38.483212', 'longitude': '-121.54019'}
This is row number 817
{'street': '4165 BRISBANE CIR', 'city': 'EL DORADO HILLS', 'zip': '95762', 'state': 'CA', 'beds': '3', 'baths': '2', 'sq__ft':

---

**Tuple unpacking**

In [8]:
a = ["a", "b", "c"]
list(enumerate(a))

[(0, 'a'), (1, 'b'), (2, 'c')]

In [9]:
l = ("Hello", "World")

In [10]:
# Without unpacking: fetch each element of the tuple `l` by its index
h = l[0]
w = l[1]
print(h)
print(w)

Hello
World


In [11]:
# Tuple unpacking: both names are assigned from `l` in a single statement
h, w = l
print(h)
print(w)

Hello
World


In [12]:
h, w = ("Hello", "World")

---

In [13]:
# Load every row of the file into `data` as a list with one dict per row.
# newline="" is what the csv module documentation asks for when opening
# a file that is handed to a csv reader.
with open("data/sacramento_data.csv", "r", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    # The reader is an iterator of rows, so list() collects them all at once
    data = list(reader)

In [14]:
data[0]

{'street': '3526 HIGH ST',
 'city': 'SACRAMENTO',
 'zip': '95838',
 'state': 'CA',
 'beds': '2',
 'baths': '1',
 'sq__ft': '836',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '59222',
 'latitude': '38.631913',
 'longitude': '-121.434879'}

In [15]:
data[0]["beds"] + data[1]["beds"]

'23'

### Tasks

- Capitalize `city`
- Convert `beds`, `baths`, `sq__ft`, `price` to `int`
- Convert `latitude`, `longitude` to `float`
- Convert `sq__ft` to `sq_m`
- Turn `sale_date` into date format 'YYYY-MM-DD'

After processing, export the result to a clean CSV file.

We start the exercise with one row only

In [16]:
sample = data[23]

In [17]:
sample

{'street': '9 PASTURE CT',
 'city': 'SACRAMENTO',
 'zip': '95834',
 'state': 'CA',
 'beds': '3',
 'baths': '2',
 'sq__ft': '1601',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '124100',
 'latitude': '38.628631',
 'longitude': '-121.488097'}

## 1. Capitalize `city`

In [18]:
sample["city"].capitalize()

'Sacramento'

## 2. Convert `beds` to `int`

In [19]:
int(sample["beds"])

3

In [20]:
def to_int(row, column):
    """Return the entry of ``row`` stored under ``column``, converted with ``int``.

    Args:
        row: Container indexed by column name (here: one row dict).
        column: Key of the entry to convert.

    Errors raised by the lookup or by ``int`` are not caught.
    """
    raw_value = row[column]
    return int(raw_value)

In [21]:
to_int(sample, "beds")

3

In [22]:
to_int(sample, "price")

124100

Same approach for all other columns we want to turn into `int`

## 3. Convert `latitude` to `float`

In [23]:
float(sample["latitude"])

38.628631

In [24]:
def to_float(row, column):
    """Return the entry of ``row`` stored under ``column``, converted with ``float``.

    Args:
        row: Container indexed by column name (here: one row dict).
        column: Key of the entry to convert.

    Errors raised by the lookup or by ``float`` are not caught.
    """
    raw_value = row[column]
    return float(raw_value)

In [25]:
to_float(sample, "latitude")

38.628631

## 4. Convert the value for `sq__ft` to a value with unit sqm

In [26]:
round(int(sample["sq__ft"]) / 10.764, 1)

148.7

In [27]:
def to_sqm(row, column):
    """Convert the square-feet entry ``row[column]`` to square metres.

    The entry is first converted with ``int``; the result is rounded to
    one decimal place.
    """
    sq_ft_per_sq_m = 10.764  # divisor used for the ft² -> m² conversion
    square_feet = int(row[column])
    return round(square_feet / sq_ft_per_sq_m, 1)

In [28]:
to_sqm(sample, "sq__ft")

148.7

## 5. Turn `sale_date` into 'YYYY-MM-DD'

In [29]:
sample["sale_date"]

'Wed May 21 00:00:00 EDT 2008'

In [30]:
import re

### 5.1. Extract year

In [31]:
sample["sale_date"]

'Wed May 21 00:00:00 EDT 2008'

In [32]:
year_pattern = r"\d{4}$"
year, = re.findall(year_pattern, sample["sale_date"])

In [33]:
year

'2008'

In [34]:
def extract_year(row, column):
    """Return the four digits that end the string ``row[column]``.

    Raises:
        ValueError: If the string does not yield exactly one match
            (the single-element unpacking below fails).
    """
    matches = re.findall(r"\d{4}$", row[column])
    # One-element unpacking: only succeeds when there is exactly one match
    (found_year,) = matches
    return found_year

In [35]:
extract_year(sample, "sale_date")

'2008'

### 5.2. Extract day

In [36]:
day_pattern = r"\s(\d{2})\s"
day, = re.findall(day_pattern, sample["sale_date"])

In [37]:
day

'21'

In [38]:
def extract_day(row, column):
    """Return the day of the month in ``row[column]`` as a two-digit string.

    The day is the one- or two-digit number that has whitespace on both
    sides, e.g. '21' in 'Wed May 21 00:00:00 EDT 2008'. Single-digit days
    ('May 1', 'May  1') are accepted and zero-padded so the result always
    fits a 'YYYY-MM-DD' date.

    Raises:
        ValueError: If the string does not contain exactly one such number.
    """
    # {1,2} instead of {2}: also accept days that are not zero-padded
    day_pattern = r"\s(\d{1,2})\s"
    day, = re.findall(day_pattern, row[column])
    return day.zfill(2)

In [39]:
extract_day(sample, "sale_date")

'21'

---

In [40]:
# Unpacking also works with a list on the right-hand side
a, b = [1,2]
print(a)
print(b)

1
2


In [41]:
# The trailing comma makes `c,` a one-element target, so `c` receives the
# single item of the list rather than the list itself
c, = [10]
print(c)

10


----

### 5.3. Extract month

In [42]:
sample["sale_date"]

'Wed May 21 00:00:00 EDT 2008'

In [43]:
month_pattern = r"^[A-Z][a-z]{2}\s([A-Z][a-z]+)\s"
month_str, = re.findall(month_pattern, sample["sale_date"])

In [44]:
test_month = 'Sun September 21 00:00:00 EDT 2008'
re.findall(month_pattern, test_month)

['September']

In [45]:
month_str

'May'

In [46]:
month_to_digit = {"January": "01",
                  "February": "02",
                  "March": "03",
                  "April": "04",
                  "May": "05",
                  "June": "06",
                  "July": "07",
                  "August": "08",
                  "September": "09",
                  "October": "10",
                  "November": "11",
                  "December": "12"}

In [47]:
month = month_to_digit[month_str]

In [48]:
def extract_month(row, column):
    """Return the month in ``row[column]`` as a two-digit string ('01'-'12').

    The month is the second word of the string, e.g. 'May' in
    'Wed May 21 00:00:00 EDT 2008'. Both full English month names and
    their three-letter abbreviations are understood.

    Raises:
        ValueError: If the string does not match the expected layout.
        KeyError: If the second word is not a known month name.
    """
    month_to_digit = {"January": "01",
                      "February": "02",
                      "March": "03",
                      "April": "04",
                      "May": "05",
                      "June": "06",
                      "July": "07",
                      "August": "08",
                      "September": "09",
                      "October": "10",
                      "November": "11",
                      "December": "12"}
    # Timestamps like 'Tue Jun 03 ...' abbreviate the month to three letters.
    # With full names only, just 'May' would be found (its abbreviation equals
    # its full name), so register every abbreviation as an extra key.
    month_to_digit.update({name[:3]: digit
                           for name, digit in month_to_digit.items()})
    month_pattern = r"^[A-Z][a-z]{2}\s([A-Z][a-z]+)\s"

    month_str, = re.findall(month_pattern, row[column])
    month = month_to_digit[month_str]
    return month

In [49]:
extract_month(sample, "sale_date")

'05'

### 5.4. Combine date parts

In [50]:
year + "-" + month + "-" + day

'2008-05-21'

In [51]:
"-".join([year, month, day])

'2008-05-21'

In [52]:
def extract_date(row, column):
    """Return ``row[column]`` reformatted as 'YYYY-MM-DD'.

    The three parts come from ``extract_year``, ``extract_month`` and
    ``extract_day`` (called in that order) and are joined with '-'.
    """
    year_part = extract_year(row, column)
    month_part = extract_month(row, column)
    day_part = extract_day(row, column)
    return "-".join((year_part, month_part, day_part))

In [53]:
extract_date(sample, "sale_date")

'2008-05-21'

## Wrap all individual functions into one object

In [54]:
# First draft of the column -> converter mapping: these three columns
# are all converted with int
conversions = dict.fromkeys(("zip", "beds", "baths"), int)

In [55]:
sample

{'street': '9 PASTURE CT',
 'city': 'SACRAMENTO',
 'zip': '95834',
 'state': 'CA',
 'beds': '3',
 'baths': '2',
 'sq__ft': '1601',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '124100',
 'latitude': '38.628631',
 'longitude': '-121.488097'}

---

In [56]:
s = {"a": 1, "b": 2}
# Iterating over a dict directly yields its keys
for i in s:
    print(i)
    # The value has to be looked up with the key
    print(s[i])

a
1
b
2


In [57]:
s = {"a": 1, "b": 2}
# .items() yields (key, value) pairs, which are unpacked into two names
for key, value in s.items():
    print(key)
    print(value)

a
1
b
2


In [58]:
list(s.items())

[('a', 1), ('b', 2)]

---

In [59]:
sample

{'street': '9 PASTURE CT',
 'city': 'SACRAMENTO',
 'zip': '95834',
 'state': 'CA',
 'beds': '3',
 'baths': '2',
 'sq__ft': '1601',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '124100',
 'latitude': '38.628631',
 'longitude': '-121.488097'}

Refactor functions to take values instead of dict and column

In [60]:
def to_sqm(value):
    """Convert a square-feet value to square metres, rounded to one decimal.

    ``value`` is first converted with ``int``, so a digit string such as
    '1601' is accepted.
    """
    sq_ft_per_sq_m = 10.764  # divisor used for the ft² -> m² conversion
    square_feet = int(value)
    return round(square_feet / sq_ft_per_sq_m, 1)

def extract_year(value):
    """Return the four digits that end the string ``value``.

    Raises:
        ValueError: If the string does not yield exactly one match
            (the single-element unpacking below fails).
    """
    matches = re.findall(r"\d{4}$", value)
    # One-element unpacking: only succeeds when there is exactly one match
    (found_year,) = matches
    return found_year

def extract_month(value):
    """Return the month in ``value`` as a two-digit string ('01'-'12').

    The month is the second word of the string, e.g. 'May' in
    'Wed May 21 00:00:00 EDT 2008'. Both full English month names and
    their three-letter abbreviations are understood.

    Raises:
        ValueError: If the string does not match the expected layout.
        KeyError: If the second word is not a known month name.
    """
    month_to_digit = {"January": "01",
                      "February": "02",
                      "March": "03",
                      "April": "04",
                      "May": "05",
                      "June": "06",
                      "July": "07",
                      "August": "08",
                      "September": "09",
                      "October": "10",
                      "November": "11",
                      "December": "12"}
    # Timestamps like 'Tue Jun 03 ...' abbreviate the month to three letters.
    # With full names only, just 'May' would be found (its abbreviation equals
    # its full name), so register every abbreviation as an extra key.
    month_to_digit.update({name[:3]: digit
                           for name, digit in month_to_digit.items()})
    month_pattern = r"^[A-Z][a-z]{2}\s([A-Z][a-z]+)\s"

    month_str, = re.findall(month_pattern, value)
    month = month_to_digit[month_str]
    return month

def extract_day(value):
    """Return the day of the month in ``value`` as a two-digit string.

    The day is the one- or two-digit number that has whitespace on both
    sides, e.g. '21' in 'Wed May 21 00:00:00 EDT 2008'. Single-digit days
    ('May 1', 'May  1') are accepted and zero-padded so the result always
    fits a 'YYYY-MM-DD' date.

    Raises:
        ValueError: If the string does not contain exactly one such number.
    """
    # {1,2} instead of {2}: also accept days that are not zero-padded
    day_pattern = r"\s(\d{1,2})\s"
    day, = re.findall(day_pattern, value)
    return day.zfill(2)

def extract_date(value):
    """Return the timestamp string ``value`` reformatted as 'YYYY-MM-DD'.

    The three parts come from ``extract_year``, ``extract_month`` and
    ``extract_day`` (called in that order) and are joined with '-'.
    """
    year_part = extract_year(value)
    month_part = extract_month(value)
    day_part = extract_day(value)
    return "-".join((year_part, month_part, day_part))

def to_capitalize(value):
    """Return ``value`` with the first letter of every word in upper case.

    ``str.title`` is used instead of ``str.capitalize`` because
    ``capitalize`` only upper-cases the very first character, which turns a
    multi-word city such as 'EL DORADO HILLS' into 'El dorado hills'.
    Single-word names give the same result with both methods.
    """
    return value.title()

In [61]:
"SACRAMENTO".capitalize()

'Sacramento'

In [62]:
to_capitalize("Sacramento")

'Sacramento'

In [63]:
# Maps a column name to the function that converts its raw string value.
conversions = {"beds": int,
               "baths": int,
               "price": int,
               "sq__ft": to_sqm,
               "sale_date": extract_date,
               "latitude": float,
               "longitude": float,
               # title() rather than capitalize(): capitalize() would turn
               # 'EL DORADO HILLS' into 'El dorado hills'
               "city": lambda x: x.title()}

In [64]:
sample

{'street': '9 PASTURE CT',
 'city': 'SACRAMENTO',
 'zip': '95834',
 'state': 'CA',
 'beds': '3',
 'baths': '2',
 'sq__ft': '1601',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '124100',
 'latitude': '38.628631',
 'longitude': '-121.488097'}

In [65]:
# Build a converted copy of `sample`: columns that have a converter are
# passed through it, all other columns are copied over unchanged.
new_sample = {}
for column, value in sample.items():
    if column not in conversions:
        new_sample[column] = value
        continue
    convert = conversions[column]
    new_sample[column] = convert(value)

In [66]:
new_sample

{'street': '9 PASTURE CT',
 'city': 'Sacramento',
 'zip': '95834',
 'state': 'CA',
 'beds': 3,
 'baths': 2,
 'sq__ft': 148.7,
 'type': 'Residential',
 'sale_date': '2008-05-21',
 'price': 124100,
 'latitude': 38.628631,
 'longitude': -121.488097}

In [67]:
def process_row(row):
    """Return a new dict holding the converted entries of ``row``.

    Columns that have a converter are passed through it; every other
    column is copied over unchanged. The converted 'sq__ft' entry is
    stored under the key 'sq_m'. ``row`` itself is not modified.

    Args:
        row: Dict mapping column names to raw values.

    Returns:
        A new dict with the same column order as ``row``.
    """
    converters = {
        "beds": int,
        "baths": int,
        "price": int,
        "sq__ft": to_sqm,
        "sale_date": extract_date,
        "latitude": float,
        "longitude": float,
        "city": lambda x: x.title(),
    }
    # Output key to use for columns whose name changes with the conversion
    renamed_columns = {"sq__ft": "sq_m"}

    new_row = {}
    for column, value in row.items():
        if column not in converters:
            new_row[column] = value
            continue
        target_column = renamed_columns.get(column, column)
        new_row[target_column] = converters[column](value)

    return new_row

In [68]:
column = "sq__ft"
"sq_m" if column == "sq__ft" else column # ternary operator

'sq_m'

In [69]:
process_row(data[101])

{'street': '4236 NATOMAS CENTRAL DR',
 'city': 'Sacramento',
 'zip': '95834',
 'state': 'CA',
 'beds': 3,
 'baths': 2,
 'sq_m': 155.3,
 'type': 'Condo',
 'sale_date': '2008-05-21',
 'price': 265000,
 'latitude': 38.648879,
 'longitude': -121.544023}

In [70]:
type(data)

list

In [71]:
# Keep the processed rows in a variable (they are needed for the export)
# and display only the first few instead of dumping the whole list.
processed_data = [process_row(row) for row in data]
processed_data[:4]

[{'street': '3526 HIGH ST',
  'city': 'Sacramento',
  'zip': '95838',
  'state': 'CA',
  'beds': 2,
  'baths': 1,
  'sq_m': 77.7,
  'type': 'Residential',
  'sale_date': '2008-05-21',
  'price': 59222,
  'latitude': 38.631913,
  'longitude': -121.434879},
 {'street': '51 OMAHA CT',
  'city': 'Sacramento',
  'zip': '95823',
  'state': 'CA',
  'beds': 3,
  'baths': 1,
  'sq_m': 108.4,
  'type': 'Residential',
  'sale_date': '2008-05-21',
  'price': 68212,
  'latitude': 38.478902,
  'longitude': -121.431028},
 {'street': '2796 BRANCH ST',
  'city': 'Sacramento',
  'zip': '95815',
  'state': 'CA',
  'beds': 2,
  'baths': 1,
  'sq_m': 74.0,
  'type': 'Residential',
  'sale_date': '2008-05-21',
  'price': 68880,
  'latitude': 38.618305,
  'longitude': -121.443839},
 {'street': '2805 JANETTE WAY',
  'city': 'Sacramento',
  'zip': '95815',
  'state': 'CA',
  'beds': 2,
  'baths': 1,
  'sq_m': 79.2,
  'type': 'Residential',
  'sale_date': '2008-05-21',
  'price': 69307,
  'latitude': 38.616835,

Homework: Export the processed rows as a CSV file.