# Clean rent data #

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
pd.set_option('display.max_columns', None)

In [2]:
rent_all_columns = pd.read_csv('rent_data_all.csv')

In [3]:
rent_all_columns.head(2)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,livingArea,longitude,price,priceForHDP,shouldHighlight,state,streetAddress,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,
1,1,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,SINGLE_FAMILY,False,True,False,False,False,False,False,42.095623,1550.0,-72.49687,750.0,750.0,False,MA,42 Catalina Dr,,1128,56205755,acres,0.579982,2400.0,212200.0,304700.0,,,


### Renaming columns ###

In [4]:
rent_all_columns=rent_all_columns.rename(columns={'livingArea':'area', 'price':'rent', 'streetAddress':'street_address'})


### Adding a 0 prefix to all zipcodes ###

In [5]:
rent_all_columns['zipcode']=rent_all_columns['zipcode'].apply(lambda x: '{0:0>5}'.format(x))

In [6]:
rent_all_columns.head(1)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,area,longitude,rent,priceForHDP,shouldHighlight,state,street_address,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,


## List of Springfield, MA neighborhoods with zipcodes ##
### May be able to make this into a dictionary and use 'clean swiss data' function in notebook 2 to assign neighborhood names to zip codes ###

### *combined a few of the zip codes that had mulitple neighborhoods associated with them ###
 
    Atwater Park, 01107
    Downtown, 01103, 01105
    East Forest Park, 01118
    Forest Park, 01108
    Indian Orchard, 01151
    Liberty Heights & East Springfield, 01104
    Maple Hill/Ridgewood, 01105
    McKnight, 01109
    Pine Point/Boston Road, 01109, 01119
    Sixteen Acres, 01118, 01119, 01128, 01129


In [7]:
hood_dict = {'01107':'Atwater Park',
             '01103':'Downtown',
             '01118':'East Forest Park*',
             '01108':'Forest Park',
             '01151':'Indian Orchard',
             '01104':'Liberty Heights & East Springfield;',
             '01105':'Maple Hill/Ridgewood*',
             '01109':'McKnight*',
             '01119':'Pine Point/Boston Road*',
             '01128':'Sixteen Acres',
             '01129':'SixteenAcres',
            }

In [8]:
def create_new_features(df):
    # Add rent categories
    rent_all_columns['sqft_cost'] =  rent_all_columns["rent"] /  rent_all_columns["area"]
    descr_rents =  rent_all_columns['sqft_cost'].describe()

    quantiles =  rent_all_columns['sqft_cost'].quantile(q=[0.15, 0.5, 0.85])
    cheap = quantiles[0.15]
    average = quantiles[0.5]
    expensive = quantiles[0.85]

    rent_all_columns["cost_per_square_feet"] = np.where(
         rent_all_columns['sqft_cost'] < cheap,
        0,
        np.where(
            ( rent_all_columns['sqft_cost'] >= cheap) & ( rent_all_columns['sqft_cost'] < average),
            1,
            np.where(
                ( rent_all_columns['sqft_cost'] >= average) & ( rent_all_columns['sqft_cost'] < expensive),
                2,
                3,
            ),
        ),
    )
    rent_all_columns["hover_strings_scatter"] = [
    f"Address: {street}, {place},<br>Rooms: {rooms}, <br>Rent: USD {rent}"
    for street, place, rooms, rent in zip(
        rent_all_columns["street_address"],
        rent_all_columns["city"],
        rent_all_columns["bedrooms"],
        rent_all_columns["rent"],
        )
    ]
    return  rent_all_columns
rent_all_columns.head(1)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,area,longitude,rent,priceForHDP,shouldHighlight,state,street_address,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,


## Adding 'sqft_cost' column ##

In [9]:
rent_all_columns['sqft_cost'] =  rent_all_columns["rent"] /  rent_all_columns["area"]
descr_rents =  rent_all_columns['sqft_cost'].describe()
rent_all_columns.head(1)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,area,longitude,rent,priceForHDP,shouldHighlight,state,street_address,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction,sqft_cost
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,,1.688817


## Adding 'neighborhood' column ##

In [15]:
rent_all_columns['neighborhood'] = rent_all_columns['zipcode'].apply(lambda x: hood_dict.get(x))
rent_all_columns.head(5)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,area,longitude,rent,priceForHDP,shouldHighlight,state,street_address,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction,sqft_cost,neighborhood
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,,1.688817,Downtown
1,1,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,SINGLE_FAMILY,False,True,False,False,False,False,False,42.095623,1550.0,-72.49687,750.0,750.0,False,MA,42 Catalina Dr,,1128,56205755,acres,0.579982,2400.0,212200.0,304700.0,,,,0.483871,Sixteen Acres
2,2,1.5,3.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,TOWNHOUSE,False,True,False,False,False,False,False,42.143475,1200.0,-72.48452,1650.0,1650.0,False,MA,62 Biddle St #62,# 62,1129,2061445071,,,,,,,,,1.375,SixteenAcres
3,3,2.0,3.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,TOWNHOUSE,False,True,False,False,False,False,False,42.098564,1400.0,-72.57001,2300.0,2300.0,False,MA,13 Beech St,,1105,56202635,sqft,7405.0,2188.0,19000.0,234800.0,,,,1.642857,Maple Hill/Ridgewood*
4,4,1.0,2.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,SINGLE_FAMILY,False,True,False,False,False,False,False,42.140114,864.0,-72.566666,1700.0,1700.0,False,MA,38 Lang St,,1104,56217208,sqft,5662.0,1500.0,152200.0,213900.0,,,,1.967593,Liberty Heights & East Springfield;


## Adding 'hover_strings_scatter' column ##

In [17]:
rent_all_columns["hover_strings_scatter"] = [
    f"Address: {street}, {place},<br>Rooms: {rooms}, <br>Rent: USD {rent}"
    for street, place, rooms, rent in zip(
        rent_all_columns["street_address"],
        rent_all_columns["city"],
        rent_all_columns["bedrooms"],
        rent_all_columns["rent"],
        )
    ]
rent_all_columns.head(1)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,area,longitude,rent,priceForHDP,shouldHighlight,state,street_address,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction,sqft_cost,neighborhood,hover_strings_scatter
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,,1.688817,Downtown,"Address: 193 Worthington St #71DFDFD55, Spring..."


## Adding 'quantile calculations' ##

In [18]:
quantiles =  rent_all_columns['sqft_cost'].quantile(q=[0.15, 0.5, 0.85])
cheap = quantiles[0.15]
average = quantiles[0.5]
expensive = quantiles[0.85]

rent_all_columns["cost_per_square_feet"] = np.where(rent_all_columns['sqft_cost'] < cheap, 0,
        np.where(( rent_all_columns['sqft_cost'] >= cheap) & ( rent_all_columns['sqft_cost'] < average), 1,
        np.where(( rent_all_columns['sqft_cost'] >= average) & ( rent_all_columns['sqft_cost'] < expensive), 2, 3,
            ),
        ),
    )
rent_all_columns.head(1)

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isRentalWithBasePrice,isUnmappable,isZillowOwned,latitude,area,longitude,rent,priceForHDP,shouldHighlight,state,street_address,unit,zipcode,zpid,lotAreaUnit,lotAreaValue,rentZestimate,taxAssessedValue,zestimate,datePriceChanged,priceChange,priceReduction,sqft_cost,neighborhood,hover_strings_scatter,cost_per_square_feet
0,0,1.0,1.0,Springfield,USA,USD,-1,FOR_RENT,FOR_RENT,APARTMENT,True,True,False,False,False,False,False,42.104,617.0,-72.592384,1042.0,1042.0,False,MA,193 Worthington St #71DFDFD55,# 71dfdfd55,1103,2066685043,,,,,,,,,1.688817,Downtown,"Address: 193 Worthington St #71DFDFD55, Spring...",3


## Getting only wanted columns ##

In [29]:
rent_data = rent_all_columns[['bedrooms', 'bathrooms', 'area', 'rent', 'street_address', 'zipcode', 'neighborhood', 'city', 'sqft_cost','hover_strings_scatter','cost_per_square_feet']]
rent_all_columns.fillna('NA')
rent_data.head(10)

Unnamed: 0,bedrooms,bathrooms,area,rent,street_address,zipcode,neighborhood,city,sqft_cost,hover_strings_scatter,cost_per_square_feet
0,1.0,1.0,617.0,1042.0,193 Worthington St #71DFDFD55,1103,Downtown,Springfield,1.688817,"Address: 193 Worthington St #71DFDFD55, Spring...",3
1,1.0,1.0,1550.0,750.0,42 Catalina Dr,1128,Sixteen Acres,Springfield,0.483871,"Address: 42 Catalina Dr, Springfield,<br>Rooms...",0
2,3.0,1.5,1200.0,1650.0,62 Biddle St #62,1129,SixteenAcres,Springfield,1.375,"Address: 62 Biddle St #62, Springfield,<br>Roo...",1
3,3.0,2.0,1400.0,2300.0,13 Beech St,1105,Maple Hill/Ridgewood*,Springfield,1.642857,"Address: 13 Beech St, Springfield,<br>Rooms: 3...",2
4,2.0,1.0,864.0,1700.0,38 Lang St,1104,Liberty Heights & East Springfield;,Springfield,1.967593,"Address: 38 Lang St, Springfield,<br>Rooms: 2....",3
5,3.0,1.5,2300.0,3000.0,(undisclosed Address),1107,Atwater Park,Springfield,1.304348,"Address: (undisclosed Address), Springfield,<b...",1
6,4.0,1.0,,1800.0,12 Wareham St,1108,Forest Park,Springfield,,"Address: 12 Wareham St, Springfield,<br>Rooms:...",3
7,2.0,1.5,1150.0,1750.0,184 Draper St,1108,Forest Park,Springfield,1.521739,"Address: 184 Draper St, Springfield,<br>Rooms:...",2
8,4.0,3.0,1560.0,2900.0,111 Lumae St,1119,Pine Point/Boston Road*,Springfield,1.858974,"Address: 111 Lumae St, Springfield,<br>Rooms: ...",3
9,2.0,1.0,882.0,1450.0,25 Ruskin St #25,1108,Forest Park,Springfield,1.643991,"Address: 25 Ruskin St #25, Springfield,<br>Roo...",2


## *Need to add remaing 'create new features' function from notebook 2* ##

def create_new_features(df):
    # Add rent categories
    rent_all_columns['sqft_cost'] =  rent_all_columns["rent"] /  rent_all_columns["area"]
    descr_rents =  rent_all_columns['sqft_cost'].describe()

    quantiles =  rent_all_columns['sqft_cost'].quantile(q=[0.15, 0.5, 0.85])
    cheap = quantiles[0.15]
    average = quantiles[0.5]
    expensive = quantiles[0.85]

    rent_all_columns["cost_per_square_feet"] = np.where(
         rent_all_columns['sqft_cost'] < cheap,
        0,
        np.where(
            ( rent_all_columns['sqft_cost'] >= cheap) & ( rent_all_columns['sqft_cost'] < average),
            1,
            np.where(
                ( rent_all_columns['sqft_cost'] >= average) & ( rent_all_columns['sqft_cost'] < expensive),
                2,
                3,
            ),
        ),
    )
    rent_all_columns["hover_strings_scatter"] = [
    f"Address: {street}, {place},<br>Rooms: {rooms}, <br>Rent: USD {rent}"
    for street, place, rooms, rent in zip(
        rent_all_columns["street_address"],
        rent_all_columns["city"],
        rent_all_columns["bedrooms"],
        rent_all_columns["rent"],
        )
    ]
    return  rent_all_columns

## ***Need to re-add 'area' to function once NaN values are taken care of*** ##

In [23]:
 # Hover strings
#rent_all_columns["hover_strings_scatter"] = [
#    f"Address: {street}, {place},<br>Rooms: {rooms}, Floor Space: {round(size)}m²,<br>Rent: USD {rent}"
#    for street, place, rooms, size, rent in zip(
#        rent_all_columns["street_address"],
#        rent_all_columns["city"],
#        rent_all_columns["bedrooms"],
#        rent_all_columns["area"],
#        rent_all_columns["rent"],
#    )
#]

rent_all_columns["hover_strings_scatter"] = [
    f"Address: {street}, {place},<br>Rooms: {rooms}, Floor Space: {round(size)}m², <br>Rent: USD {rent}"
    for street, place, rooms, rent in zip(
        rent_all_columns["street_address"],
        rent_all_columns["city"],
        rent_all_columns["bedrooms"],
        rent_all_columns["area"],
        rent_all_columns["rent"],
    )
]

#print(rent_all_columns.fillna('NA'))
rent_data.head(10)

ValueError: too many values to unpack (expected 4)

In [26]:
rent_all_columns["hover_strings_scatter"] = [
    f"Address: {street}, {place},<br>Rooms: {rooms}, Floor Space: {round(size)}m², <br>Rent: USD {rent}"
    for street, place, rooms, rent in zip(
        rent_all_columns["street_address"],
        rent_all_columns["city"],
        rent_all_columns["bedrooms"],
        rent_all_columns["area"],
        rent_all_columns["rent"],
    )
]



ValueError: too many values to unpack (expected 4)

# Rent Data all clean and making csv file and getting other information #

In [None]:
rent_data.head(2)

In [None]:
rent_data.to_csv('rent_data_clean_spfld.csv', encoding='utf-8')

In [None]:
rent_data.info()

In [None]:
rent_data.describe()

In [None]:
rent_data.value_counts()