In [40]:
import numpy as np
import pandas as pd
import re

In [41]:
# Number of leading characters stripped from every description. The original
# note only calls this prefix "fluff"; presumably it is fixed-length boilerplate
# added by the scraper -- TODO confirm against the raw data.
DESCRIPTION_PREFIX_LEN = 28

df = pd.read_json('../collection/raw_data/craigslist.json')

# Remove entries with no listed price. `.copy()` makes the filtered frame an
# independent object, so the column assignments below cannot hit
# SettingWithCopyWarning or write to a temporary view of the raw frame.
df = df[df['price'].notna()].copy()

# Convert price from string to float: drop the first character (the currency
# symbol) and parse the remainder.
df['price'] = df['price'].str[1:].astype(float)

# Remove the fluff from the beginning of each description. The `.str` accessor
# slices every string like the previous per-row lambda did, but leaves a
# missing description as NaN instead of raising a TypeError.
df['description'] = df['description'].str[DESCRIPTION_PREFIX_LEN:]

In [42]:
# Sorted array of every distinct tag string across all listings.
# The per-listing tag lists are flattened in a single pass; summing the lists
# (`df.tags.sum()`) re-copies the accumulated list for every row (quadratic)
# and yields the scalar 0 rather than an empty list when there are no rows.
np.unique([tag for tags in df.tags for tag in tags])

array(['0BR / 1Ba', '0BR / 2Ba', '1000ft2', '1005ft2', '1006ft2',
       '1011ft2', '1012ft2', '1013ft2', '1018ft2', '1023ft2', '1025ft2',
       '1026ft2', '1028ft2', '1029ft2', '1032ft2', '1034ft2', '1035ft2',
       '1042ft2', '1049ft2', '1050ft2', '1055ft2', '1056ft2', '1057ft2',
       '1061ft2', '1072ft2', '1075ft2', '1086ft2', '1087ft2', '1088ft2',
       '1090ft2', '1094ft2', '1099ft2', '1100ft2', '1104ft2', '1108ft2',
       '1125ft2', '1130ft2', '1140ft2', '1146ft2', '1150ft2', '1152ft2',
       '1153ft2', '1159ft2', '1175ft2', '1180ft2', '1183ft2', '1187ft2',
       '1194ft2', '1200ft2', '1212ft2', '1218ft2', '1226ft2', '1236ft2',
       '1243ft2', '1247ft2', '1250ft2', '1266ft2', '1275ft2', '1276ft2',
       '1280ft2', '1286ft2', '1288ft2', '1300ft2', '1306ft2', '1309ft2',
       '1333ft2', '1334ft2', '1347ft2', '1350ft2', '1360ft2', '1374ft2',
       '1379ft2', '1380ft2', '1386ft2', '1390ft2', '1400ft2', '1402ft2',
       '1408ft2', '1416ft2', '1420ft2', '1430ft2', '1450ft

In [43]:
# Patterns for the attribute tags attached to each listing. Raw strings keep
# the regex escapes (\d, \s, \/, \.) from being read as (invalid) Python string
# escapes, and compiling once avoids re-parsing the patterns for every tag.
_BED_BATH_RE = re.compile(r'(\d*\.?\d)BR \/ (\d*\.?\d)Ba')
_PETS_RE = re.compile(r'(cats)|(dogs)')
_LAUNDRY_RE = re.compile(r'(?<!no\s)laundry|(w\/d)')
_PARKING_RE = re.compile(r'(?<!no )parking|garage|carport')
_AREA_RE = re.compile(r'(\d*)ft2')
_PROPERTY_RE = re.compile(r'(?<! )(house|apartment|duplex|townhouse|condo|cottage\/cabin|flat)')
_COMPANY_RE = re.compile(r'listed by:(.*)')


def _first_match(pattern, tags):
    """Return the match for the first tag in `tags` that `pattern` finds, else None.

    Parameters
    ----------
    pattern : compiled regular expression, applied with `.search` to each tag.
    tags : iterable of str, scanned in order.

    Returns
    -------
    re.Match or None
        Only the earliest matching tag is used; later matches are ignored.
    """
    for tag in tags:
        match = pattern.search(tag)
        if match is not None:
            return match
    return None


# One output list per derived column; each list receives exactly one value per
# listing, in the same order as the rows of `df`.
bed, bath, pets, laundry, parking, property_type, area, company, neighborhood, utilities, year_built = ([] for _ in range(11))

for item in df.tags.values:
    # bed, bath -- both are read from the same "<beds>BR / <baths>Ba" tag.
    # NOTE: bed is stored as the matched string while bath is converted to
    # float; kept as-is so the exported values do not change type.
    rooms = _first_match(_BED_BATH_RE, item)
    if rooms is not None:
        bed.append(rooms.group(1))
        bath.append(float(rooms.group(2)))
    else:
        bed.append(np.nan)
        bath.append(np.nan)

    # pets -- 1 if any tag mentions cats or dogs, else 0
    pets.append(1 if _first_match(_PETS_RE, item) is not None else 0)

    # laundry -- 1 if any tag mentions laundry (not preceded by "no ") or w/d.
    # Fix: the flag is now driven by the regex match. Previously the condition
    # tested the `laundry` output list, which is never None, so every listing
    # with at least one tag was recorded as having laundry.
    laundry.append(1 if _first_match(_LAUNDRY_RE, item) is not None else 0)

    # parking -- 1 if any tag mentions parking (not preceded by "no "), a
    # garage or a carport, else 0
    parking.append(1 if _first_match(_PARKING_RE, item) is not None else 0)

    # area -- number in front of "ft2"; a listing without such a tag gets 0
    area_match = _first_match(_AREA_RE, item)
    area.append(float(area_match.group(1)) if area_match is not None else 0)

    # property_type -- matched keyword, or None when no tag names a type
    property_match = _first_match(_PROPERTY_RE, item)
    property_type.append(property_match.group(1) if property_match is not None else None)

    # company -- text following "listed by:", or None when absent
    company_match = _first_match(_COMPANY_RE, item)
    company.append(company_match.group(1) if company_match is not None else None)

    # These columns are not derived from the tags; fill with empty placeholders.
    neighborhood.append(None)
    year_built.append(np.nan)
    utilities.append([])

In [44]:
# Attach the per-listing lists to the frame, one new column per list.
# Dict insertion order determines the order in which the columns are added.
parsed_columns = {
    'bed': bed,
    'bath': bath,
    'pets': pets,
    'laundry': laundry,
    'parking': parking,
    'area': area,
    'property_type': property_type,
    'company': company,
    'neighborhood': neighborhood,
    'utilities': utilities,
    'year_built': year_built,
}
for column_name, column_values in parsed_columns.items():
    df[column_name] = column_values

# Preview the frame without the raw tag lists. The result is only displayed;
# `df` itself still contains the 'tags' column after this cell.
df.drop(columns='tags')

Unnamed: 0,price,description,images,posted_datetime,address,bed,bath,pets,laundry,parking,area,property_type,company,neighborhood,utilities,year_built
0,1095.0,This bedroom apartments includes:\n\n→ Contemp...,[https://images.craigslist.org/00S0S_8ZkasrSZ6...,2020-01-21T10:50:37-0500,3050 Birch Hollow Dr,2,1.0,1,1,1,876.0,apartment,,,[],
1,2000.0,"Stunning Ann Arbor home, perfect family house ...",[https://images.craigslist.org/00Y0Y_4tey2xf24...,2020-01-07T19:39:04-0500,912 Rose Ave,4,2.5,1,1,1,2000.0,house,,,[],
2,695.0,"AWESOME Affordable. This stunning 3 bedroom, 1...",[],2020-01-17T14:57:45-0500,,3,2.0,0,1,1,0.0,apartment,,,[],
3,1385.0,Have you heard about our Stars & Stripes progr...,[https://images.craigslist.org/00p0p_1PnkFaSsW...,2020-01-08T09:49:12-0500,1505 Natalie Lane near Kipling Drive,1,1.0,1,1,1,800.0,apartment,,,[],
4,1185.0,Have you heard about our Stars & Stripes progr...,[https://images.craigslist.org/00k0k_7n7PgfXF5...,2020-01-15T09:42:20-0500,1505 Natalie Lane near Kipling Drive,1,1.0,1,1,1,800.0,apartment,,,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1644.0,Orion NorthStar show contact info\n Unit D...,[https://images.craigslist.org/00Q0Q_jicpO71NG...,2019-12-23T09:49:54-0500,2820 Windwood,2,2.0,1,1,1,943.0,apartment,,,[],
2996,1145.0,"Centrally located, spacious floorplan, huge ki...",[https://images.craigslist.org/00U0U_buQtes1a1...,2019-12-16T14:48:38-0500,553 S 4th Ave,1,1.0,1,1,0,0.0,apartment,,,[],
2997,1450.0,Great Amenities:\n*No Smoking\n*Sparkling Swim...,[https://images.craigslist.org/00I0I_9YIVPzxaZ...,2019-12-23T09:43:36-0500,600 Hidden Valley Club Drive near State Street,2,2.0,1,1,0,916.0,apartment,,,[],
2998,989.0,Lynden ParkeWe're located at: 2224 Golfside Dr...,[https://images.craigslist.org/00W0W_bKfxfWQrA...,2019-12-23T09:41:43-0500,,1,1.0,1,1,0,658.0,apartment,,,[],


In [49]:
# Columns to keep, in their final order. Anything not listed here is dropped
# from `df`.
col = [
    'address',
    'price',
    'bed',
    'bath',
    'area',
    'company',
    'neighborhood',
    'laundry',
    'pets',
    'parking',
    'utilities',
    'property_type',
    'year_built',
    'description',
    'images',
]
df = df.loc[:, col]

In [50]:
# Show the frame as this cell's output for a visual check of its columns and values.
df

Unnamed: 0,address,price,bed,bath,area,company,neighborhood,laundry,pets,parking,utilities,property_type,year_built,description,images
0,3050 Birch Hollow Dr,1095.0,2,1.0,876.0,,,1,1,1,[],apartment,,This bedroom apartments includes:\n\n→ Contemp...,[https://images.craigslist.org/00S0S_8ZkasrSZ6...
1,912 Rose Ave,2000.0,4,2.5,2000.0,,,1,1,1,[],house,,"Stunning Ann Arbor home, perfect family house ...",[https://images.craigslist.org/00Y0Y_4tey2xf24...
2,,695.0,3,2.0,0.0,,,1,0,1,[],apartment,,"AWESOME Affordable. This stunning 3 bedroom, 1...",[]
3,1505 Natalie Lane near Kipling Drive,1385.0,1,1.0,800.0,,,1,1,1,[],apartment,,Have you heard about our Stars & Stripes progr...,[https://images.craigslist.org/00p0p_1PnkFaSsW...
4,1505 Natalie Lane near Kipling Drive,1185.0,1,1.0,800.0,,,1,1,1,[],apartment,,Have you heard about our Stars & Stripes progr...,[https://images.craigslist.org/00k0k_7n7PgfXF5...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2820 Windwood,1644.0,2,2.0,943.0,,,1,1,1,[],apartment,,Orion NorthStar show contact info\n Unit D...,[https://images.craigslist.org/00Q0Q_jicpO71NG...
2996,553 S 4th Ave,1145.0,1,1.0,0.0,,,1,1,0,[],apartment,,"Centrally located, spacious floorplan, huge ki...",[https://images.craigslist.org/00U0U_buQtes1a1...
2997,600 Hidden Valley Club Drive near State Street,1450.0,2,2.0,916.0,,,1,1,0,[],apartment,,Great Amenities:\n*No Smoking\n*Sparkling Swim...,[https://images.craigslist.org/00I0I_9YIVPzxaZ...
2998,,989.0,1,1.0,658.0,,,1,1,0,[],apartment,,Lynden ParkeWe're located at: 2224 Golfside Dr...,[https://images.craigslist.org/00W0W_bKfxfWQrA...


In [52]:
# Write the cleaned listings to the cleaned-data folder as JSON
# (pandas' default layout for DataFrame.to_json).
df.to_json(path_or_buf=r'../data/cleaned_data/craigslist.json')