In [1]:
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Load the manually labelled sample exported from Google Drive;
# the first CSV column becomes the row index.
sample = pd.read_csv(
    '../data/training/to_label_3000 - to_label_3000.csv',
    index_col=0,
)

In [66]:
# Keep only the rows where both the aerial and the street-view label are filled in.
sample = sample.dropna(subset=['AERIAL_Driveway', 'GSV_Driveway'])

In [67]:
# Preview the first five labelled rows.
sample.head(n=5)

Unnamed: 0,MBL,sv_addr,aerial_addr,AERIAL_Driveway,GSV_Driveway
1648,72-C-11,"1 ALDERSEY ST, Somerville, MA",1 ALDERSEY ST,1.0,2.0
2491,91-D-18,"1 ARLINGTON ST, Somerville, MA",1 ARLINGTON ST,0.0,0.0
2021,76-E-9,"1 BEACON ST, Somerville, MA",1 BEACON ST,1.0,1.0
309,2-A-33,"1 CAPEN CT, Somerville, MA",1 CAPEN CT,1.0,1.0
1080,53-D-16,"1 ELIOT ST, Somerville, MA",1 ELIOT ST,0.0,0.0


### labels
- 0 = no driveway 
- 1 = driveway 
- 2 = unsure

In [68]:
def overall_labelling(y):
    """Combine the aerial and street-view driveway labels into one soft label.

    Each input label is expected to be 0 (no driveway), 1 (driveway) or
    2 (unsure).

    Parameters
    ----------
    y : sequence or pandas.Series
        Two labels in order: the aerial label first, the street-view label
        second. Only the first two elements are read.

    Returns
    -------
    float or None
        * 1.0 -- both sources show a driveway
        * 0.0 -- both sources show no driveway
        * 0.9 -- the two sources disagree (either direction), or one source
          is unsure and the other shows a driveway
        * 0.1 -- one source is unsure and the other shows no driveway
        * 0.5 -- both sources are unsure
        * None -- any other combination (for example a missing label)
    """
    # Read the labels by iteration order rather than with y[0] / y[1]:
    # integer keys on a label-indexed Series (what DataFrame.apply passes with
    # axis=1) are deprecated as positional access in recent pandas releases.
    values = list(y)
    y1, y2 = values[0], values[1]

    scores = {
        # both sources certain
        (1.0, 1.0): 1.0,
        (0.0, 0.0): 0.0,
        (1.0, 0.0): 0.9,  # aerial: driveway, street view: none
        (0.0, 1.0): 0.9,  # aerial: none, street view: driveway
        # both sources unsure
        (2.0, 2.0): 0.5,
        # exactly one source unsure: lean towards the certain one
        (2.0, 1.0): 0.9,
        (2.0, 0.0): 0.1,
        (1.0, 2.0): 0.9,
        (0.0, 2.0): 0.1,
    }
    # Unlisted combinations fall through to None, as before.
    return scores.get((y1, y2))

In [69]:
def overall_labelling_3classes(y):
    """Combine the aerial and street-view driveway labels into a 3-class label.

    Each input label is expected to be 0 (no driveway), 1 (driveway) or
    2 (unsure).

    Parameters
    ----------
    y : sequence or pandas.Series
        Two labels in order: the aerial label first, the street-view label
        second. Only the first two elements are read.

    Returns
    -------
    int or None
        * 1 -- both sources show a driveway, or one is unsure and the other
          shows a driveway
        * 0 -- both sources show no driveway, or one is unsure and the other
          shows no driveway
        * 2 -- the two sources disagree, or both are unsure
        * None -- any other combination (for example a missing label)
    """
    # Read the labels by iteration order rather than with y[0] / y[1]:
    # integer keys on a label-indexed Series (what DataFrame.apply passes with
    # axis=1) are deprecated as positional access in recent pandas releases.
    values = list(y)
    y1, y2 = values[0], values[1]

    classes = {
        # both sources certain and in agreement
        (1.0, 1.0): 1,
        (0.0, 0.0): 0,
        # disagreement between the two sources
        (1.0, 0.0): 2,
        (0.0, 1.0): 2,
        # both sources unsure
        (2.0, 2.0): 2,
        # exactly one source unsure: take the certain one
        (2.0, 1.0): 1,
        (2.0, 0.0): 0,
        (1.0, 2.0): 1,
        (0.0, 2.0): 0,
    }
    # Unlisted combinations fall through to None, as before.
    return classes.get((y1, y2))

In [70]:
# Soft label per row, computed from the (aerial, street-view) label pair.
sample['final_label'] = (
    sample[['AERIAL_Driveway', 'GSV_Driveway']]
    .apply(overall_labelling, axis='columns')
)

In [71]:
# Three-class label per row, computed from the (aerial, street-view) label pair.
sample['three_class_label'] = (
    sample[['AERIAL_Driveway', 'GSV_Driveway']]
    .apply(overall_labelling_3classes, axis='columns')
)

In [72]:
# Aerial image name: lower-cased address, spaces replaced by underscores.
aerial_stem = sample['aerial_addr'].str.lower().replace(' ', '_', regex=True)
sample['aerial_filename'] = aerial_stem + '_aerial.png'

# Street-view image name: same scheme after removing the city/state suffix.
gsv_stem = sample['sv_addr'].str.lower().replace(', somerville, ma', '', regex=True)
sample['gsv_filename'] = gsv_stem.replace(' ', '_', regex=True) + '.jpg'

In [97]:
# One-off overrides for file names that differ from the address-derived ones.
brook_st_rows = sample['gsv_filename'] == '2-4_brook_st.jpg'
sample.loc[brook_st_rows, 'gsv_filename'] = '4_brook_st.jpg'

# 1142 is an index label (a .loc lookup), not a row position.
sample.loc[1142, 'aerial_filename'] = '3-9_hanson_st_aerial.png'

In [91]:
# (cell intentionally empty: it previously held stray keystrokes that were not valid Python)

In [74]:
# manual fixes (the assignment below is commented out, so it does not run)
# sample.loc[sample.MBL == '75-C-24', 'aerial_filename'] = '1-3_clark_st_aerial.png'

In [82]:
# Rebuild the aerial file names for the rows at positions -76 .. -69 from their
# street-view file names, swapping the '.jpg' suffix for '_aerial.png'.
#
# The columns are looked up by name instead of by position (-2 / -1) so the
# statement keeps working if columns are added or reordered. The pattern is
# escaped and anchored so that only a literal trailing '.jpg' is replaced; an
# unescaped '.' would match any character.
# NOTE(review): the row window is positional and tied to this particular
# labelled sample, and this cell was originally executed before the manual
# fixes shown earlier in the notebook -- confirm the intended order.
aerial_col = sample.columns.get_loc('aerial_filename')
gsv_col = sample.columns.get_loc('gsv_filename')
sample.iloc[-76:-68, aerial_col] = (
    sample.iloc[-76:-68, gsv_col].str.replace(r'\.jpg$', '_aerial.png', regex=True)
)

In [98]:
# Attempt to read every street-view image listed in the sample;
# counter ends up holding the number of files attempted.
counter = 0
for counter, image_name in enumerate(sample.gsv_filename, start=1):
    plt.imread('../data/training/sv_images/' + str(image_name))

In [101]:
# Number of images attempted vs. number of labelled rows -- these should match.
(counter, len(sample))

(162, 162)

In [102]:
# Write the labelled sample (without the index column) for training.
sample.to_csv(
    '../labels/additional_training_labels_120319.csv',
    index=False,
)

----- 
### Redfin Data

In [2]:
# Load the cleaned Redfin table; the 'Unnamed: 0' column becomes the row index.
redfin = pd.read_csv(
    '../data/training/redfin_clean.csv',
    index_col='Unnamed: 0',
)

In [3]:
# Both image names share the same stem: the lower-cased site address without
# the city/state suffix and with spaces replaced by underscores.
address_stem = (
    redfin['SITE_ADDR']
    .str.lower()
    .replace(', somerville, ma', '', regex=True)
    .replace(' ', '_', regex=True)
)
redfin['aerial_filename'] = address_stem + '_aerial.png'
redfin['gsv_filename'] = address_stem + '.jpg'

In [5]:
# Expected image file names, as plain Python lists.
check_aer = redfin['aerial_filename'].tolist()
check_gsv = redfin['gsv_filename'].tolist()

In [6]:
from os import walk
# File names sitting directly inside each image folder. Only the first entry
# produced by walk() is used, so sub-folders are not descended into; when
# walk() yields nothing the list stays empty.
aer_fn = list(next(walk('../data/aerial_all/'), (None, [], []))[2])
sv_fn = list(next(walk('../data/street_view_all/'), (None, [], []))[2])

In [7]:
from os import listdir
from PIL import Image

# Find the street-view JPEGs that Pillow cannot verify, then build the list of
# street-view file names without them.
bad_images = []

for filename in listdir('../data/street_view_all/'):
    if filename.endswith('.jpg'):
        try:
            # The context manager closes the underlying file even when
            # verify() raises, so scanning a large folder does not leak
            # file handles.
            with Image.open('../data/street_view_all/' + filename) as img:
                img.verify()  # check that the file is, in fact, an image
        except (IOError, SyntaxError):
            bad_images.append(filename)

# Set membership keeps this filter linear in the number of files.
bad_image_names = set(bad_images)
good_sv_fn = [x for x in sv_fn if x not in bad_image_names]

In [9]:
# Keep the expected file names that actually exist on disk. Order and
# duplicates follow check_aer / check_gsv; the sets only replace the repeated
# linear scans of the directory listings with constant-time lookups.
aer_fn_lookup = set(aer_fn)
good_sv_lookup = set(good_sv_fn)
available_aer = [x for x in check_aer if x in aer_fn_lookup]
available_sv = [x for x in check_gsv if x in good_sv_lookup]

In [11]:
# Redfin rows whose aerial image is available, saved as training labels.
has_aerial_image = redfin['aerial_filename'].isin(available_aer)
final_aer = redfin.loc[has_aerial_image]
final_aer.to_csv('../labels/redfin_training_labels_aerial.csv')

In [10]:
# Redfin rows whose street-view image is available, saved as training labels.
has_sv_image = redfin['gsv_filename'].isin(available_sv)
final_sv = redfin.loc[has_sv_image]
final_sv.to_csv('../labels/redfin_training_labels_sv.csv')

In [14]:
# Class balance of has_parking among rows with an aerial image.
final_aer['has_parking'].value_counts()

1    4514
0     369
Name: has_parking, dtype: int64

In [13]:
# Class balance of has_parking among rows with a street-view image.
final_sv['has_parking'].value_counts()

1    5059
0     445
Name: has_parking, dtype: int64

 ---