In [None]:
import numpy as np 
import pandas as pd
from IPython.display import display

In [None]:
# Lift pandas' truncation limits so every row and column is rendered in cell outputs.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Load the raw training listings; the path is relative to the notebook's working directory.
raw_rental_data = pd.read_csv(filepath_or_buffer='housing_train.csv')

In [None]:
# Preview the first five raw rows.
raw_rental_data.head(n=5)

In [None]:
# rental_data_ver1 is the working copy of the raw data without the
# 'url', 'region_url' and 'description' columns.
rental_data_ver1 = raw_rental_data.drop(['url', 'region_url', 'description'], axis='columns')

In [None]:
# For every column, print its value frequencies followed by its number of missing values.
for column in rental_data_ver1.columns:
    column_values = rental_data_ver1[column]
    print('\n************')
    print(f'\nValues by attribute {column}: ')
    print(column_values.value_counts())
    print(f'\nValues in {column} that are NaN:')
    print(column_values.isna().sum())

## Demand

Dropping the rows that have rubbish values in `demand`

In [None]:
# Frequency of each distinct value found in 'demand' before any cleaning.
demand_values = rental_data_ver1['demand'].value_counts()
print("Values by demand.", demand_values, sep="\n")


Remove instances with wrong values.

In [None]:
# Flag the rows whose 'demand' value starts with 'https' (a URL instead of a yes/no answer).
# na=False makes missing values count as "not rubbish", so the mask is strictly boolean;
# without it a NaN in 'demand' would produce a NaN in the mask and break the `~` below.
rubish_val = rental_data_ver1['demand'].str.startswith('https', na=False)
# Keep only the non-flagged rows. The explicit .copy() gives an independent frame, so the
# column assignments done in later cells do not act on a slice of the previous frame.
rental_data_ver1 = rental_data_ver1[~rubish_val].copy()

In [None]:
# Recount 'demand' after the URL rows were removed and show the *new* counts
# (the previous version printed the stale pre-cleaning `demand_values`).
clean_demand_values = rental_data_ver1['demand'].value_counts()
print("Values by demand.")
print(clean_demand_values)

Visualize the typos in demand

In [None]:
# Despite its name, this mask is True for the rows whose 'demand' is a valid 'yes'/'no';
# the typos are therefore the rows selected by ~typos.
typos = rental_data_ver1['demand'].isin(['yes', 'no'])

In [None]:
# Collect the 'demand' entries that are neither 'yes' nor 'no' and print them for inspection.
demand_typos = rental_data_ver1.loc[~typos, 'demand']
print(demand_typos)

We found that every typo is a misspelling of 'no', so we replace them all with 'no'

In [None]:
# Overwrite every misspelled 'demand' value collected in demand_typos with 'no'.
rental_data_ver1['demand'] = rental_data_ver1['demand'].replace(
    to_replace=demand_typos.values,
    value='no',
)

In [None]:
# Check that no typos remain: show the rows whose 'demand' is still neither 'yes' nor 'no'.
rental_data_ver1[~rental_data_ver1['demand'].isin(['yes', 'no'])]

## Region

Displaying values by attribute 'region'

In [None]:

# Number of listings per region.
region_counts = rental_data_ver1['region'].value_counts()
display(region_counts)

We found that there are reasonable values here, so no modification is done

## Rent

Displaying values by attribute 'rent'

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of 'rent'.
rent_summary = rental_data_ver1.loc[:, 'rent'].describe()
display(rent_summary)

From the description, we can see that there are outliers: the maximum values are in the millions, and the minimum is zero

In [None]:
# Count listings per distinct rent, ordered by rent, then show both ends of that table.
rent_frequencies = rental_data_ver1['rent'].value_counts().sort_index()
print("Low values")
display(rent_frequencies.head())
print("HIgh values")
display(rent_frequencies.tail(10))

If we look at the details of the listings with a rent of 10000 or more, we see that two of them are not consistent with the rest of the data: they are too small to have such a price.
Therefore, we will intentionally remove the two highest values, which we know for sure are outliers.

In [None]:
# Inspect the listings whose rent is at least 10000.
rental_data_ver1[rental_data_ver1['rent'].ge(10000)]

In [None]:
# Discard the listings whose rent is 90000 or more.
rental_data_ver1 = rental_data_ver1[~rental_data_ver1['rent'].ge(90000)]

We will use the IQR (interquartile range) rule to find the outliers.

In [None]:
# Preview the first five rows of the working frame.
rental_data_ver1.head(n=5)

In [None]:
# Recompute the 'rent' statistics now that the extreme rents were removed.
rent_summary = rental_data_ver1['rent'].describe()
display(rent_summary)

# Quartiles and interquartile range, read from the describe() output.
rent_q1 = rent_summary.loc['25%']
rent_q3 = rent_summary.loc['75%']
rent_iqr = rent_q3 - rent_q1

# IQR rule: a value is an outlier when it lies more than k * IQR outside the quartiles.
# The same k = 1.5 is applied on both sides (the upper fence previously used 2.5, which
# was inconsistent with the lower fence).
iqr_multiplier = 1.5
lower_rent_limit = rent_q1 - iqr_multiplier * rent_iqr
upper_rent_limit = rent_q3 + iqr_multiplier * rent_iqr
print(f'lower limit {lower_rent_limit} and upper {upper_rent_limit}')

Looking at the result, we see that the upper limit is in a range that should not be considered an outlier in the business context, and the same holds for the lower limit, since rents should not be lower than 0. This could be because there are so many outliers that they are no longer considered outliers.

## Type of apartment

Displaying values by attribute 'type'

In [None]:
# Number of listings per housing type.
type_counts = rental_data_ver1['type'].value_counts()
display(type_counts)

In [None]:
# Rows whose 'type' is 'land' or 'flat', shown for inspection.
type_typos = rental_data_ver1[rental_data_ver1['type'].isin(['land', 'flat'])]
display(type_typos)

In [None]:
# Relabel the 'flat' and 'land' listings as 'apartment'.
rental_data_ver1['type'] = rental_data_ver1['type'].replace(
    {'flat': 'apartment', 'land': 'apartment'}
)

In [None]:
# Show the listings per housing type again, after the relabelling.
type_counts_after = rental_data_ver1['type'].value_counts()
display(type_counts_after)

## Square Feet

In [None]:
# Descriptive statistics of the 'sqfeet' column.
area_summary = rental_data_ver1.loc[:, 'sqfeet'].describe()
display(area_summary)

In [None]:
# Listings per distinct 'sqfeet' value, ordered by size; show the ten largest sizes.
sqfeet_frequencies = rental_data_ver1['sqfeet'].value_counts().sort_index()
sqfeet_frequencies.tail(10)

In [None]:
# Inspect the listings larger than 7000 square feet.
rental_data_ver1[rental_data_ver1['sqfeet'].gt(7000)]

In [None]:
# Discard the listings with 70000 square feet or more.
# NOTE(review): the inspection cell filters on > 7000 while this removal uses >= 70000 —
# confirm the larger threshold is intentional.
rental_data_ver1 = rental_data_ver1[~rental_data_ver1['sqfeet'].ge(70000)]


## bedrooms

In [None]:
# Descriptive statistics of 'bedrooms', followed by the frequency of each bedroom count.
bedrooms_summary = rental_data_ver1['bedrooms'].describe()
display(bedrooms_summary, rental_data_ver1['bedrooms'].value_counts())

Everything seems to check out

## bathrooms

In [None]:
# Descriptive statistics of 'bathrooms', followed by the frequency of each bathroom count.
baths_summary = rental_data_ver1['bathrooms'].describe()
display(baths_summary, rental_data_ver1['bathrooms'].value_counts())

Everything seems to check out

## Binary values

In [None]:
# Flag-style columns; show the value frequencies of each one to spot unexpected values.
binary_columns = [
    'cats_allowed',
    'dogs_allowed',
    "smoking_allowed",
    "wheelchair_access",
    "electric_vehicle_charge",
    "comes_furnished",
]
for cat in binary_columns:
    display(rental_data_ver1[cat].value_counts())

Everything seems to check out

## laundry options

In [None]:
# Number of listings per laundry option.
laundry_counts = rental_data_ver1["laundry_options"].value_counts()
display(laundry_counts)

## parking_options

In [None]:
# Number of listings per parking option.
parking_counts = rental_data_ver1["parking_options"].value_counts()
display(parking_counts)

## latitude / long

In [None]:
# Descriptive statistics of the 'latitude' column.
rental_data_ver1.loc[:, "latitude"].describe()

Looking for places outside the USA

In [None]:
# Show the listings whose latitude is below 24 or above 68.
latitude_values = rental_data_ver1["latitude"]
rental_data_ver1[latitude_values.lt(24) | latitude_values.gt(68)]

In [None]:
# Show the listings whose latitude is below 24 or above 68.
# NOTE(review): this is the same filter as the preceding cell — confirm whether a
# different check was intended here.
out_of_range_latitude = (rental_data_ver1["latitude"] < 24) | (rental_data_ver1["latitude"] > 68)
rental_data_ver1[out_of_range_latitude]

In [44]:
# Descriptive statistics of the 'long' (longitude) column.
rental_data_ver1.loc[:, "long"].describe()

count    35508.000000
mean      -105.447882
std         23.526905
min       -159.496000
25%       -119.017000
50%       -111.578000
75%        -90.696500
max        172.633000
Name: long, dtype: float64

In [47]:
# Show the first listings whose longitude is above -66 or below -172.
long_values = rental_data_ver1["long"]
rental_data_ver1[long_values.gt(-66) | long_values.lt(-172)].head()

Unnamed: 0,id,region,rent,type,sqfeet,bedrooms,bathrooms,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,demand,latitude,long,state
13406,7035126252,fayetteville,720,apartment,544,1,1.0,1,1,1,0,0,0,w/d in unit,off-street parking,yes,36.22,94.1248,ar
13443,7035129783,fayetteville,770,apartment,669,1,1.0,1,1,1,0,0,0,w/d in unit,off-street parking,yes,36.22,94.1248,ar
13457,7035133685,fayetteville,815,apartment,889,2,1.0,1,1,1,0,0,0,w/d in unit,off-street parking,yes,36.22,94.1248,ar
14172,7048689247,fayetteville,855,apartment,960,2,2.0,1,1,1,0,0,0,w/d in unit,,yes,36.22,94.1248,ar
14176,7043984781,fayetteville,800,apartment,960,2,2.0,1,1,1,0,0,0,w/d in unit,,yes,36.22,94.1248,ar
