# Readme
Code to
* remove nulls
* normalize
* Manually calculate the Euclidean distance using SciPy (`scipy.spatial.distance`)

In [92]:
import pandas as pd
from scipy.spatial import distance

# Functions

In [55]:
def get_df_missing_values(df):
    """Summarize the share of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with
        ``column_name`` and ``percent_missing`` (0-100, rounded to two
        decimals), sorted by ascending ``percent_missing``.
    """
    # Fraction of null cells per column, expressed as a percentage.
    percent_missing = (df.isnull().mean() * 100).round(2)

    missing_value_df = pd.DataFrame({
        # Use the columns of the frame that was passed in, not a notebook global.
        'column_name': df.columns,
        'percent_missing': percent_missing,
    })

    return missing_value_df.sort_values(by=['percent_missing'])

# Get the data

In [45]:
# Load the raw listings; `price` is read as text and must be made numeric.
dc_listings = pd.read_csv('../data/dc_airbnb.csv')
# Remove ',' and '$' as literal characters. regex=False is explicit because
# '$' is the end-of-string anchor in regex mode and the default for `regex`
# differs between pandas versions.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

# Remove unnecessary/unuseful columns

We do not need columns that are:

* non-numeric
* non-ordinal
* null

In [46]:
# Preview the first rows to see which columns are available before pruning.
dc_listings.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,160.0,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,350.0,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC
2,90%,100%,2,1,Private room,1.0,2.0,1.0,50.0,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD
3,100%,,1,2,Private room,1.0,1.0,1.0,95.0,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,50.0,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD


In [47]:
# Columns that are left out of the feature set used further down.
columns_to_drop = [
    'room_type',
    'host_response_rate',
    'host_acceptance_rate',
    'host_listings_count',
    'latitude',
    'longitude',
    'city',
    'zipcode',
    'state',
]

# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
dc_listings = dc_listings.drop(columns=columns_to_drop, errors='ignore')

dc_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews
0,4,1.0,1.0,2.0,160.0,$115.00,$100.00,1,1125,0
1,6,3.0,3.0,3.0,350.0,$100.00,,2,30,65
2,1,1.0,2.0,1.0,50.0,,,2,1125,1
3,2,1.0,1.0,1.0,95.0,,,1,1125,0
4,4,1.0,1.0,1.0,50.0,$15.00,$450.00,7,1125,0


## Remove nulls
* beds, bedrooms and bathrooms have  < 1 % missing rows --> remove rows
* cleaning_fee + security_deposit have too many missing values for the affected rows to be deleted --> drop columns

In [56]:
# Percentage of missing values per remaining column, sorted ascending.
missing_values = get_df_missing_values(dc_listings)

missing_values

Unnamed: 0,column_name,percent_missing
accommodates,accommodates,0.0
price,price,0.0
minimum_nights,minimum_nights,0.0
maximum_nights,maximum_nights,0.0
number_of_reviews,number_of_reviews,0.0
beds,beds,0.3
bedrooms,bedrooms,0.56
bathrooms,bathrooms,0.73


In [61]:
# Columns removed outright vs. columns whose null rows are removed.
sparse_columns = ['cleaning_fee', 'security_deposit']
room_columns = ['beds', 'bedrooms', 'bathrooms']

# errors='ignore' lets the cell run again after the columns are already gone.
dc_listings.drop(columns=sparse_columns, errors='ignore', inplace=True)

# Discard every row that lacks a value in any of the room-count columns.
dc_listings.dropna(axis=0, subset=room_columns, inplace=True)

# Recompute the report to confirm no nulls remain.
missing_values = get_df_missing_values(dc_listings)

missing_values

Unnamed: 0,column_name,percent_missing
accommodates,accommodates,0.0
bedrooms,bedrooms,0.0
bathrooms,bathrooms,0.0
beds,beds,0.0
price,price,0.0
minimum_nights,minimum_nights,0.0
maximum_nights,maximum_nights,0.0
number_of_reviews,number_of_reviews,0.0


# Normalize

In [6]:
# Summary statistics of the columns before normalization.
# NOTE(review): the saved output of this cell still lists host_listings_count,
# latitude and longitude and reports 3723 rows, so it appears to have been
# captured before the drop cells ran - re-run the notebook top to bottom to refresh it.
dc_listings.describe()

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude
count,3723.0,3723.0,3702.0,3696.0,3712.0,3723.0,3723.0,3723.0,3723.0,3723.0,3723.0
mean,13.517325,3.195004,1.210157,1.256358,1.643319,149.165995,2.250067,580306.9,15.306742,38.913967,-77.023294
std,64.534408,2.012216,0.839851,0.585539,1.182117,140.110699,3.622879,35195520.0,29.645586,0.021647,0.026951
min,1.0,1.0,0.0,0.0,1.0,10.0,1.0,1.0,0.0,38.825061,-77.110525
25%,1.0,2.0,1.0,1.0,1.0,85.0,1.0,120.0,1.0,38.901789,-77.039859
50%,1.0,2.0,1.0,1.0,1.0,115.0,2.0,1125.0,4.0,38.913375,-77.02641
75%,3.0,4.0,1.0,1.0,2.0,165.0,3.0,1125.0,16.0,38.926509,-77.002798
max,480.0,16.0,10.0,8.0,16.0,2822.0,180.0,2147484000.0,362.0,38.996382,-76.913137


In [64]:
# Standardize every column to zero mean and unit standard deviation.
column_means = dc_listings.mean()
column_stds = dc_listings.std()
normalized_listings = dc_listings.sub(column_means).div(column_stds)

# Restore price to its original scale; it is not left standardized.
normalized_listings['price'] = dc_listings['price']

normalized_listings

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,0.401366,-0.249467,-0.439151,0.297345,160.0,-0.341375,-0.016573,-0.516709
1,1.399275,2.129218,2.969147,1.141549,350.0,-0.065038,-0.016603,1.706535
2,-1.095499,-0.249467,1.264998,-0.546858,50.0,-0.065038,-0.016573,-0.482505
3,-0.596544,-0.249467,-0.439151,-0.546858,95.0,-0.341375,-0.016573,-0.516709
4,0.401366,-0.249467,-0.439151,-0.546858,50.0,1.316644,-0.016573,-0.516709
...,...,...,...,...,...,...,...,...
3718,0.401366,-0.249467,-0.439151,0.297345,135.0,0.211298,-0.016603,0.133163
3719,-0.596544,-0.249467,1.264998,-0.546858,79.0,0.211298,-0.016594,0.714626
3720,1.399275,0.939875,-0.439151,1.141549,275.0,-0.065038,60.571616,-0.106264
3721,-0.596544,-0.249467,-0.439151,-0.546858,179.0,-0.065038,-0.016604,1.125071


In [66]:
# Sanity check: standardized columns should show mean ~0 and std 1, while
# price keeps its original scale because it was restored after normalizing.
normalized_listings.describe()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
count,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0
mean,-1.374245e-16,1.567801e-16,1.006489e-16,3.3872240000000006e-17,148.843639,-2.3226680000000003e-17,9.677782e-19,-1.1613340000000002e-17
std,1.0,1.0,1.0,1.0,137.550045,1.0,1.0,1.0
min,-1.095499,-1.43881,-2.143301,-0.5468581,10.0,-0.3413745,-0.01660422,-0.5167086
25%,-0.5965439,-0.2494671,-0.4391515,-0.5468581,85.0,-0.3413745,-0.01660086,-0.4825048
50%,-0.5965439,-0.2494671,-0.4391515,-0.5468581,115.0,-0.06503806,-0.0165725,-0.3798936
75%,0.4013657,-0.2494671,-0.4391515,0.2973454,165.0,0.2112984,-0.0165725,0.03055137
max,6.388823,10.45462,11.48989,12.11619,2822.0,49.12285,60.57162,11.86505


In [89]:
# min - max of minimum_nights on the raw scale (describe() computed once).
raw_nights_stats = dc_listings.describe()['minimum_nights']
raw_nights_stats['min'] - raw_nights_stats['max']

-179.0

In [90]:
# min - max of minimum_nights on the standardized scale (describe() computed once).
normalized_nights_stats = normalized_listings.describe()['minimum_nights']
normalized_nights_stats['min'] - normalized_nights_stats['max']

-49.46422861750143

# Calc similarity

In [112]:
# Preview the normalized rows that the distance calculation below reads from.
normalized_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,0.401366,-0.249467,-0.439151,0.297345,160.0,-0.341375,-0.016573,-0.516709
1,1.399275,2.129218,2.969147,1.141549,350.0,-0.065038,-0.016603,1.706535
2,-1.095499,-0.249467,1.264998,-0.546858,50.0,-0.065038,-0.016573,-0.482505
3,-0.596544,-0.249467,-0.439151,-0.546858,95.0,-0.341375,-0.016573,-0.516709
4,0.401366,-0.249467,-0.439151,-0.546858,50.0,1.316644,-0.016573,-0.516709


In [122]:
# Compare the listings labelled 0 and 4 on the selected features only.
features = ['accommodates', 'bathrooms']
first_listing = normalized_listings.loc[0, features]
fifth_listing = normalized_listings.loc[4, features]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
print(first_fifth_distance)

0.0


# Save the normalized data

In [126]:
# Persist the normalized frame. No `index` argument is passed, so the row
# labels are written as an extra leading column in the CSV.
normalized_listings.to_csv('../data/normalized_listings.csv')