In [None]:
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime, date
import matplotlib.pyplot as plt
from matplotlib import font_manager

def prepare_data(name):
    """Load one listings spreadsheet and return it shuffled and feature-scaled.

    Reads ``{name}.xlsx``, keeps a fixed set of seven columns, shuffles the
    rows with a fixed seed, and standardises every kept column except
    ``nightly_price`` and ``id`` with scikit-learn's ``scale``.

    Args:
        name: Path of the spreadsheet without the ``.xlsx`` extension.

    Returns:
        The shuffled DataFrame restricted to the kept columns.
    """
    # Column names are spelled exactly as they appear in the spreadsheet
    # (including 'accomodates').
    kept_columns = [
        'accomodates',
        'bathrooms',
        'bedrooms',
        'beds',
        'nightly_price',
        'number_of_reviews',
        'id',
    ]
    # The target and the identifier must keep their original values.
    unscaled_columns = ('nightly_price', 'id')

    raw = pd.read_excel(f'{name}.xlsx')

    # frac=1 draws every row once, i.e. a reproducible shuffle.
    shuffled = raw[kept_columns].sample(frac=1, random_state=0)

    for column in kept_columns:
        if column in unscaled_columns:
            continue
        shuffled[column] = scale(shuffled[column])

    return shuffled

def predict(data, train_fraction=0.25, n_neighbors=5):
    """Fit a k-nearest-neighbours price model and predict the held-out rows.

    The split is positional: the first ``int(len(data) * train_fraction)``
    rows train the model and all remaining rows are predicted. No shuffling
    happens here, so the caller controls the row order.

    NOTE(review): the default keeps the existing behaviour of training on the
    first 25% of rows and predicting the other 75%. The original variable was
    named ``size_test``, so the split may have been intended the other way
    round -- confirm before changing the default.

    Args:
        data: DataFrame with the feature columns ``accomodates``,
            ``bathrooms``, ``bedrooms``, ``beds`` and the target column
            ``nightly_price``.
        train_fraction: Share of rows, taken from the top, used for fitting.
        n_neighbors: Number of neighbours used by the regressor.

    Returns:
        A new DataFrame with the held-out rows plus two columns:
        ``expected`` (the prediction rounded to the nearest 1000) and
        ``diff`` (``expected - nightly_price``).

    Raises:
        ValueError: If the training slice has fewer rows than ``n_neighbors``.
    """
    feature_cols = ['accomodates', 'bathrooms', 'bedrooms', 'beds']
    target_col = 'nightly_price'

    n_train = int(len(data) * train_fraction)
    if n_train < n_neighbors:
        # scikit-learn would fail later with a less specific ValueError.
        raise ValueError(
            f'Need at least {n_neighbors} training rows, got {n_train} '
            f'(len(data)={len(data)}, train_fraction={train_fraction}).'
        )

    # Slice first, then copy: only the held-out rows are mutated below, and
    # copying the slice (not the whole frame) avoids SettingWithCopyWarning.
    train_df = data.iloc[:n_train]
    test_df = data.iloc[n_train:].copy()

    knn = KNeighborsRegressor(algorithm='brute', n_neighbors=n_neighbors)
    knn.fit(train_df[feature_cols], train_df[target_col])
    predictions = knn.predict(test_df[feature_cols])

    test_df['expected'] = [round(x / 1000) * 1000 for x in predictions]
    test_df['diff'] = test_df['expected'] - test_df[target_col]

    return test_df

def draw_graph(result, how_many):
    """Draw a grouped bar chart of predicted vs. actual nightly prices.

    Only the first ``how_many`` rows of ``result`` are shown. Each listing
    gets two side-by-side bars (predicted, then actual) and a text label
    above them with the absolute difference.

    Args:
        result: DataFrame with the columns ``expected``, ``nightly_price``
            and ``id``.
        how_many: Number of leading rows to display.
    """
    CHART_SIZE_X = 20
    CHART_SIZE_Y = 20
    BAR_WIDTH = 0.4

    plt.figure(figsize=(CHART_SIZE_X, CHART_SIZE_Y))

    displayed = result[:how_many]
    positions = range(len(displayed))
    # Materialise both series once instead of rebuilding them per label.
    expected_prices = list(displayed['expected'])
    actual_prices = list(displayed['nightly_price'])

    # The labels contain Korean text, so a Korean-capable font is preferred.
    # The path only exists on machines with this font installed; fall back to
    # the default font instead of crashing when it cannot be loaded.
    font_fname = '/Library/Fonts/NanumSquareRoundOTFB.otf'
    try:
        font_family = font_manager.FontProperties(fname=font_fname).get_name()
    except (OSError, RuntimeError):
        import warnings

        warnings.warn(
            f'Font file {font_fname!r} could not be loaded; '
            'Korean labels may not render correctly.'
        )
        font_family = None

    if font_family:
        plt.rcParams['font.family'] = font_family

    # Offset the two series around each tick so neither bar hides the other.
    half_width = BAR_WIDTH / 2
    plt.bar([x - half_width for x in positions], expected_prices, width=BAR_WIDTH)
    plt.bar([x + half_width for x in positions], actual_prices, width=BAR_WIDTH)
    plt.xticks(positions, displayed['id'])

    for x, (expected, actual) in enumerate(zip(expected_prices, actual_prices)):
        diff = int(abs(expected - actual))

        # Place the label just above the taller of the two bars.
        position = max(expected, actual) * 1.01

        plt.text(x, position, f'차이: {diff}', color='black', fontweight='bold', horizontalalignment='center')

    plt.ylabel('Price')
    plt.xlabel('숙소')
    plt.title('Airbnb Price Comparison')
    plt.legend(['expected', 'actual'])
    plt.autoscale(enable=True, axis='x', tight=True)
    
def main():
    """Interactively pick a region, predict its prices and chart the result.

    Prompts for a region until a known one is entered, then prompts for how
    many listings to compare. A non-numeric or non-positive count falls back
    to the default of 10. The chosen spreadsheet is then passed through
    ``prepare_data``, ``predict`` and ``draw_graph``.
    """
    default_how_many = 10

    # Region label -> spreadsheet name (without the .xlsx extension).
    listing_files = {
        '춘천': 'listing-춘천-2018-12-03 20-15',
        '포항': 'listing-포항-2018-11-27 17-13',
        '홍대': 'listing-홍대-2018-12-05 19-12',
        '수원': 'listing-수원-2018-12-05 19-36',
        '강남': 'listing-강남-2018-12-05 19-50',
    }

    while True:
        city = input(f'{", ".join(listing_files.keys())} {len(listing_files.keys())} 지역의 데이터가 있습니다. 어떤 지역을 고르시겠습니까?').strip()

        if city not in listing_files:
            # Say why the prompt repeats instead of silently asking again.
            print('잘못된 지역입니다. 다시 입력해 주세요.')
            continue

        try:
            how_many = int(input('얼마나 많은 숙소의 가격 정보를 비교하시겠습니까?'))
            if how_many <= 0:
                # Zero or negative counts would give an empty or truncated chart.
                raise ValueError
        except ValueError:
            print('잘못된 숫자입니다. 기본값인 10개를 보여줍니다.')
            how_many = default_how_many

        listings = prepare_data(listing_files[city])
        result = predict(listings)
        draw_graph(result, how_many)
        break

In [None]:
import pandas as pd

# Load the listings stored in dc_airbnb.csv (path is relative to the working directory).
dc_listings = pd.read_csv('dc_airbnb.csv')
# Print the (rows, columns) shape as a quick size check.
print(dc_listings.shape)

# Last expression of the cell: shows the first rows of the frame as the cell output.
dc_listings.head()