In [18]:
import re

import pandas as pd

In [7]:
# Read scraper/apts.json into a DataFrame; every later cell derives from this frame.
raw_data = pd.read_json("scraper/apts.json")
# Bare name as the last expression so the notebook renders the frame.
raw_data

Unnamed: 0,property_name,address,state,zip_code,price,bedrooms,bathrooms,square_feet,neighborhood,walk_score,...,bike_score,description,rating,build_year,application_fee,admin_fee,parking,model,unit,available
0,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1634.0,0,1.0,400,Cathedral Heights,71,...,42,The Berkshire embodies the timeless art of liv...,3.9,1950,85.0,500.0,"[{'title': 'Surface Lot', 'cost': None, 'descr...",EFFMR,Unit 1025,Now
1,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1696.0,0,1.0,400,Cathedral Heights,71,...,42,The Berkshire embodies the timeless art of liv...,3.9,1950,85.0,500.0,"[{'title': 'Surface Lot', 'cost': None, 'descr...",EFFMR,Unit 4091,Now
2,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1717.0,0,1.0,400,Cathedral Heights,71,...,42,The Berkshire embodies the timeless art of liv...,3.9,1950,85.0,500.0,"[{'title': 'Surface Lot', 'cost': None, 'descr...",EFFMR,Unit 6044,Now
3,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1634.0,0,1.0,400,Cathedral Heights,71,...,42,The Berkshire embodies the timeless art of liv...,3.9,1950,85.0,500.0,"[{'title': 'Surface Lot', 'cost': None, 'descr...",EFFMR,Unit 3086,Mar. 13
4,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1649.0,0,1.0,400,Cathedral Heights,71,...,42,The Berkshire embodies the timeless art of liv...,3.9,1950,85.0,500.0,"[{'title': 'Surface Lot', 'cost': None, 'descr...",EFFMR,Unit A324,Mar. 13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18347,The Hecht Warehouse,1401 New York Ave NE,DC,20002,2436.0,2,2.0,811,Ivy City,80,...,64,Introducing a community that’s truly one of a ...,4.7,1937,75.0,200.0,"[{'title': 'Garage', 'cost': '$125 - $150', 'd...",2M,Unit 626,Apr. 1
18348,The Hecht Warehouse,1401 New York Ave NE,DC,20002,2451.0,2,2.0,983,Ivy City,80,...,64,Introducing a community that’s truly one of a ...,4.7,1937,75.0,200.0,"[{'title': 'Garage', 'cost': '$125 - $150', 'd...",2A,Unit 325,Apr. 8
18349,The Hecht Warehouse,1401 New York Ave NE,DC,20002,2501.0,2,2.0,983,Ivy City,80,...,64,Introducing a community that’s truly one of a ...,4.7,1937,75.0,200.0,"[{'title': 'Garage', 'cost': '$125 - $150', 'd...",2A,Unit 549,Apr. 8
18350,The Hecht Warehouse,1401 New York Ave NE,DC,20002,2741.0,2,2.0,1131,Ivy City,80,...,64,Introducing a community that’s truly one of a ...,4.7,1937,75.0,200.0,"[{'title': 'Garage', 'cost': '$125 - $150', 'd...",2G,Unit 609,May 6


In [17]:
# Two rows are treated as the same listing when they agree on every column
# named in `subset`; only the first such row is kept and the index is rebuilt
# from zero.
data = (
    raw_data
    .drop_duplicates(
        subset=['property_name', 'address', 'state', 'model', 'unit', 'price', 'square_feet']
    )
    .reset_index(drop=True)
)
# Report row counts before and after de-duplication.
print(f"Total: {len(raw_data)}")
print(f"Unique: {len(data)}")

Total: 18352
Unique: 9164


In [47]:
def extract_parking_stats(parking_options: list) -> dict:
    """Summarise one listing's parking options as three flat features.

    Args:
        parking_options: Iterable of dicts; only the ``"title"`` and ``"cost"``
            keys of each dict are read. ``None`` or a float (e.g. the NaN of a
            missing cell) is treated as "no options".

    Returns:
        A dict with the keys

        * ``"has_parking"``: True when at least one title, after title-casing,
          is something other than "Other" or "Street".
        * ``"has_garage_parking"``: True when some title-cased title is
          exactly "Garage".
        * ``"cost"``: the smallest parsed cost over all options, or None when
          no option carries a parseable cost. For a range such as
          ``"$125 - $150"`` the upper bound (150) is used as that option's
          cost.
    """
    # A missing cell arrives as None / NaN rather than as a list.
    if parking_options is None or isinstance(parking_options, float):
        parking_options = []

    parking_titles = set()
    parking_costs = []
    for option in parking_options:
        title = option.get("title")
        if title is not None:
            parking_titles.add(str(title).title())

        cost = option.get("cost")
        if not cost:
            # None, "" and 0 all mean "no cost information".
            continue

        if isinstance(cost, (int, float)):
            # Already numeric: no string parsing needed (NaN is skipped).
            if cost == cost:
                parking_costs.append(int(cost))
            continue

        # Keep the part after the last "-" (upper bound of a range), then
        # strip everything that is not a digit ("$", spaces, thousands commas).
        upper_bound = str(cost).split("-")[-1]
        digits = re.sub(r"\D", "", upper_bound)
        if digits:
            # Free-text costs without any digit are ignored instead of
            # crashing on int("").
            parking_costs.append(int(digits))

    return {
        "has_parking": len(parking_titles - {"Other", "Street"}) > 0,
        "has_garage_parking": "Garage" in parking_titles,
        "cost": min(parking_costs) if parking_costs else None,
    }

In [57]:
# Replace the nested "parking" column with the flat features returned by
# extract_parking_stats (has_parking, has_garage_parking, cost).
# The guard makes the cell safe to re-run: once "parking" has been dropped,
# a second execution is a no-op instead of a KeyError.
if "parking" in data.columns:
    parking_stats = pd.DataFrame(
        data["parking"].apply(extract_parking_stats).tolist(),
        # Reuse data's own index so the join lines rows up explicitly rather
        # than relying on data happening to have a 0..n-1 index.
        index=data.index,
    )
    data = data.join(parking_stats).drop(columns=["parking"])
data.head()

Unnamed: 0,property_name,address,state,zip_code,price,bedrooms,bathrooms,square_feet,neighborhood,walk_score,...,rating,build_year,application_fee,admin_fee,model,unit,available,has_parking,has_garage_parking,cost
0,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1634.0,0,1.0,400,Cathedral Heights,71,...,3.9,1950,85.0,500.0,EFFMR,Unit 1025,Now,True,True,185.0
1,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1696.0,0,1.0,400,Cathedral Heights,71,...,3.9,1950,85.0,500.0,EFFMR,Unit 4091,Now,True,True,185.0
2,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1717.0,0,1.0,400,Cathedral Heights,71,...,3.9,1950,85.0,500.0,EFFMR,Unit 6044,Now,True,True,185.0
3,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1634.0,0,1.0,400,Cathedral Heights,71,...,3.9,1950,85.0,500.0,EFFMR,Unit 3086,Mar. 13,True,True,185.0
4,The Berkshire,4201 Massachusetts Ave NW,DC,20016,1649.0,0,1.0,400,Cathedral Heights,71,...,3.9,1950,85.0,500.0,EFFMR,Unit A324,Mar. 13,True,True,185.0


In [74]:
# Keep only rows that have a price and renumber them from zero.
data = data.loc[data["price"].notna()].reset_index(drop=True)
# Empty strings become missing values, then the column is cast to the
# nullable integer dtype so those gaps survive the cast.
data["square_feet"] = (
    data["square_feet"]
    .replace({"": None})
    .astype("Int64")
)

## Analysis

In [65]:
# Number of rows of `data` per neighborhood, smallest groups first.
data.groupby("neighborhood").size().sort_values()

neighborhood
Lake Barcroft                 4
Waverly Hills                 5
Georgetown/Wisconsin Ave      5
Arlington Heights             6
Downtown DC                   7
                           ... 
National Landing            412
Capitol Riverfront          428
Union Market                451
NoMa                        795
Alexandria West             928
Length: 88, dtype: int64

In [63]:
# Mean price per neighborhood, cheapest first.
data.groupby("neighborhood")["price"].mean().sort_values()

neighborhood
Arlandria                   1499.065217
Oxon Hill                   1540.692308
Georgetown/Wisconsin Ave    1705.000000
Beverly Hills               1761.916667
Huntington                  1819.311828
                               ...     
The Wharf                   3689.857143
Cathedral Heights           3915.160377
Woodley Park                4256.642857
Navy Yard                   4624.750000
Eastern Market              5787.692308
Name: price, Length: 88, dtype: float64

In [75]:
# Price divided by square footage; rows with a missing square_feet stay missing.
data["price_per_foot"] = data["price"] / data["square_feet"]
# Mean price per square foot by neighborhood, cheapest first.
data.groupby("neighborhood")["price_per_foot"].mean().sort_values()

neighborhood
Oxon Hill                     1.705912
Brookville/Seminary Valley    1.746344
Lincolnia                     1.926686
Seminary Hill                 2.121428
Cheverly                      2.125179
                                ...   
Eastern Market                4.471556
U Street                      4.567474
The Wharf                     4.596896
West End                      4.608898
Dupont Circle                 10.64597
Name: price_per_foot, Length: 88, dtype: Float64

In [78]:
# Rows of the raw frame whose address contains "1425 21st St".
raw_data.loc[raw_data["address"].str.contains("1425 21st St")]

Unnamed: 0,property_name,address,state,zip_code,price,bedrooms,bathrooms,square_feet,neighborhood,walk_score,...,bike_score,description,rating,build_year,application_fee,admin_fee,parking,model,unit,available


In [76]:
# Cleaned rows whose neighborhood is exactly "Dupont Circle".
data.loc[data["neighborhood"].eq("Dupont Circle")]

Unnamed: 0,property_name,address,state,zip_code,price,bedrooms,bathrooms,square_feet,neighborhood,walk_score,...,build_year,application_fee,admin_fee,model,unit,available,has_parking,has_garage_parking,cost,price_per_foot
1406,Westpark Apartments,2130 P St NW,DC,20037,2140.0,0,1.0,450,Dupont Circle,98,...,1975,50.0,600.0,Studio 2,Unit 00704,Now,True,True,275.0,4.755556
1429,Westpark Apartments,2130 P St NW,DC,20037,2287.0,0,1.0,470,Dupont Circle,98,...,1975,50.0,600.0,Studio 4,Unit 00426,Now,True,True,275.0,4.865957
1430,Westpark Apartments,2130 P St NW,DC,20037,2443.0,0,1.0,470,Dupont Circle,98,...,1975,50.0,600.0,Studio 4,Unit 01026,Apr. 15,True,True,275.0,5.197872
1431,Westpark Apartments,2130 P St NW,DC,20037,2357.0,0,1.0,450,Dupont Circle,98,...,1975,50.0,600.0,Studio,Unit 00217,Now,True,True,275.0,5.237778
1432,Westpark Apartments,2130 P St NW,DC,20037,2351.0,0,1.0,450,Dupont Circle,98,...,1975,50.0,600.0,Studio,Unit 00821,Mar. 15,True,True,275.0,5.224444
1433,Westpark Apartments,2130 P St NW,DC,20037,2438.0,0,1.0,450,Dupont Circle,98,...,1975,50.0,600.0,Studio,Unit 01021,Mar. 17,True,True,275.0,5.417778
1434,Westpark Apartments,2130 P St NW,DC,20037,2828.0,1,1.0,670,Dupont Circle,98,...,1975,50.0,600.0,1 Bed 1 Bath _670A,Unit 00520,Now,True,True,275.0,4.220896
1435,Westpark Apartments,2130 P St NW,DC,20037,2889.0,1,1.0,670,Dupont Circle,98,...,1975,50.0,600.0,1 Bed 1 Bath _670A,Unit 00820,Now,True,True,275.0,4.31194
1436,Westpark Apartments,2130 P St NW,DC,20037,2897.0,1,1.0,670,Dupont Circle,98,...,1975,50.0,600.0,1 Bed 1 Bath _670A,Unit 00920,Now,True,True,275.0,4.323881
1437,Westpark Apartments,2130 P St NW,DC,20037,2859.0,1,1.0,670,Dupont Circle,98,...,1975,50.0,600.0,1 Bed 1 Bath _670A,Unit 01012,Mar. 16,True,True,275.0,4.267164


In [87]:
# Load scraper/apts_dc.json and drop rows that repeat the same listing
# (same values in every `subset` column). The index is left as loaded.
dc_apts = (
    pd.read_json("scraper/apts_dc.json")
    .drop_duplicates(
        subset=['property_name', 'address', 'state', 'model', 'unit', 'price', 'square_feet']
    )
)
len(dc_apts)

13835

In [89]:
# Distinct property names among dc_apts rows in Dupont Circle.
x = dc_apts.loc[dc_apts["neighborhood"] == "Dupont Circle", "property_name"].unique()
x

array(['Westpark Apartments', 'The Sedgewick', 'Rocksboro Apartments',
       'Gables Dupont Circle', '1816 New Hampshire Ave NW Unit 707',
       '1601 18th St NW Unit 818', '1727 Massachusetts Ave NW Unit 709',
       '1260 21st St NW Unit Apt 707', 'The Croydon', 'President Madison',
       'Bristol House', 'Common Parkway'], dtype=object)

In [90]:
# Load scraper/apts_dupont.json and list its distinct Dupont Circle property
# names, for comparison with the names found in dc_apts.
y = pd.read_json("scraper/apts_dupont.json")
y.loc[y["neighborhood"] == "Dupont Circle", "property_name"].unique()

array(['Westpark Apartments', 'The Croydon', 'President Madison',
       'Bristol House', '1601 18th St NW Unit 818',
       '1816 New Hampshire Ave NW Unit 707', 'Gables Dupont Circle',
       'Rocksboro Apartments', '1727 Massachusetts Ave NW Unit 709',
       '1415 17th St NW Unit FL2-ID25',
       'Full bedroom in 4 bed/2 bath Home',
       'Full bedroom in 5 bed/2 bath Home',
       '1415 17th St NW Unit FL0-ID62', '2107 O St NW Unit FL1-ID340',
       '1830 17th St NW Unit 201', '1330 New Hampshire Ave NW Unit 607',
       '1545 18th St NW Unit 205', '1330 New Hampshire Ave NW Unit 902',
       '1724 21st St NW', '2130 N St NW Unit 310', '1507 R St NW',
       '1828 Riggs Pl NW Unit 1', '1301 20th St NW Unit 916',
       '2025 N St NW', '1325 18th St NW Unit 902',
       '1425 21st St NW Unit 101', '2029 O St NW',
       '1750 Corcoran St NW Unit B', '1260 21st St NW Unit 910',
       '2 br, 2 bath House - 2122 N St NW #3',
       '1 br, 1 bath House - 2113 O Street Unit 2113B',


In [88]:
# dc_apts rows whose neighborhood is exactly "Dupont Circle".
dc_apts.loc[dc_apts["neighborhood"].eq("Dupont Circle")]

Unnamed: 0,property_name,address,state,zip_code,price,bedrooms,bathrooms,square_feet,neighborhood,walk_score,...,bike_score,description,rating,build_year,application_fee,admin_fee,parking,model,unit,available
3939,Westpark Apartments,2130 P St NW,DC,20037.0,2096.0,0,1.0,450,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",Studio 2,Unit 00704,Now
3968,Westpark Apartments,2130 P St NW,DC,20037.0,2241.0,0,1.0,470,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",Studio 4,Unit 00426,Now
3969,Westpark Apartments,2130 P St NW,DC,20037.0,2443.0,0,1.0,470,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",Studio 4,Unit 01026,Apr. 15
3970,Westpark Apartments,2130 P St NW,DC,20037.0,2310.0,0,1.0,450,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",Studio,Unit 00217,Now
3971,Westpark Apartments,2130 P St NW,DC,20037.0,2304.0,0,1.0,450,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",Studio,Unit 00821,Mar. 15
3972,Westpark Apartments,2130 P St NW,DC,20037.0,2413.0,0,1.0,450,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",Studio,Unit 01021,Mar. 17
3973,Westpark Apartments,2130 P St NW,DC,20037.0,2803.0,1,1.0,670,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",1 Bed 1 Bath _670A,Unit 00820,Now
3974,Westpark Apartments,2130 P St NW,DC,20037.0,2856.0,1,1.0,670,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",1 Bed 1 Bath _670A,Unit 00520,Now
3975,Westpark Apartments,2130 P St NW,DC,20037.0,2926.0,1,1.0,670,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",1 Bed 1 Bath _670A,Unit 00920,Now
3976,Westpark Apartments,2130 P St NW,DC,20037.0,2859.0,1,1.0,670,Dupont Circle,98.0,...,95.0,"Come home to Westpark Apartments, where timele...",4.0,1975.0,50.0,600.0,"[{'title': 'Surface Lot', 'cost': '$275', 'des...",1 Bed 1 Bath _670A,Unit 01012,Mar. 16


In [46]:
# Sanity check of the range parsing used in extract_parking_stats: keep the
# part after the last "-" and strip every non-digit character.
cost = "$125 - $150".split("-")[-1]
# Raw string: "\D" in a plain string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
int(re.sub(r'\D', '', str(cost)))

150

In [56]:
# Check for implausibly high parsed parking costs (above 500).
# NOTE(review): this reads data["parking"], but the In [57] cell earlier in
# the notebook replaces `data` with a frame that has drop(columns=["parking"])
# applied. Run top to bottom, this line raises KeyError; it only worked
# because it was executed (In [56]) before that cell. Move it above the
# join cell or delete it.
p = data["parking"].apply(extract_parking_stats).pipe(pd.json_normalize)
p[p["cost"] > 500]

Unnamed: 0,has_parking,has_garage_parking,cost


In [55]:
# Inspect the raw parking entry stored at index label 6330.
# NOTE(review): depends on the "parking" column, which the In [57] cell
# earlier in the notebook drops from `data`; run top to bottom this raises
# KeyError. The label 6330 also refers to the index as it was when the cell
# was executed (In [55]) — confirm it still points at the intended row.
data["parking"].loc[6330]

[{'title': 'Garage',
  'cost': '$400 - $500',
  'description': '1 space, Assigned Parking'}]

In [15]:
# Fully identical rows are removed first (with the "parking" column left out,
# presumably because its list values cannot be hashed for de-duplication —
# TODO confirm).
x = (
    raw_data
    .drop(columns='parking')
    .drop_duplicates()
)
# What remains: listings that share property/address/state/model/unit but
# still differ in some other column; keep=False shows every member of a group.
x.loc[x.duplicated(subset=['property_name', 'address', 'state', 'model', 'unit'], keep=False)]

Unnamed: 0,property_name,address,state,zip_code,price,bedrooms,bathrooms,square_feet,neighborhood,walk_score,transit_score,bike_score,description,rating,build_year,application_fee,admin_fee,model,unit,available
2236,South Port,6112 Edsall Rd,VA,22304,1802.0,1,1.0,717,Landmark/Van Dorn,70,58,28,South Port is nestled on 22 acres near the cor...,3.9,1966,50.0,150.0,717sq ft,Unit 104,Now
2237,South Port,6112 Edsall Rd,VA,22304,1462.0,1,1.0,717,Landmark/Van Dorn,70,58,28,South Port is nestled on 22 acres near the cor...,3.9,1966,50.0,150.0,717sq ft,Unit 104,Mar. 25
2241,South Port,6112 Edsall Rd,VA,22304,1557.0,1,1.0,889,Landmark/Van Dorn,70,58,28,South Port is nestled on 22 acres near the cor...,3.9,1966,50.0,150.0,889sq ft,Unit 303,Apr. 1
2242,South Port,6112 Edsall Rd,VA,22304,1557.0,1,1.0,889,Landmark/Van Dorn,70,58,28,South Port is nestled on 22 acres near the cor...,3.9,1966,50.0,150.0,889sq ft,Unit 303,Apr. 11
2249,South Port,6112 Edsall Rd,VA,22304,2299.0,3,2.0,1157,Landmark/Van Dorn,70,58,28,South Port is nestled on 22 acres near the cor...,3.9,1966,50.0,150.0,1157sq ft,Unit 102,Now
2250,South Port,6112 Edsall Rd,VA,22304,2599.0,3,2.0,1157,Landmark/Van Dorn,70,58,28,South Port is nestled on 22 acres near the cor...,3.9,1966,50.0,150.0,1157sq ft,Unit 102,Now
5631,Braddock Lee Apartments,2423 Menokin Dr,VA,22302,1475.0,1,1.0,660,Seminary Hill,52,58,61,"Located in Alexandria, Virginia, Braddock Lee ...",3.8,1955,50.0,250.0,Arbor,Unit 201,Now
5632,Braddock Lee Apartments,2423 Menokin Dr,VA,22302,1560.0,1,1.0,660,Seminary Hill,52,58,61,"Located in Alexandria, Virginia, Braddock Lee ...",3.8,1955,50.0,250.0,Arbor,Unit 201,Now
5633,Braddock Lee Apartments,2423 Menokin Dr,VA,22302,1565.0,1,1.0,660,Seminary Hill,52,58,61,"Located in Alexandria, Virginia, Braddock Lee ...",3.8,1955,50.0,250.0,Arbor,Unit 101,Now
5634,Braddock Lee Apartments,2423 Menokin Dr,VA,22302,1525.0,1,1.0,660,Seminary Hill,52,58,61,"Located in Alexandria, Virginia, Braddock Lee ...",3.8,1955,50.0,250.0,Arbor,Unit 201,Mar. 13
