### Data Processing

In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the New York City listings CSV into a DataFrame
listings = pd.read_csv('Data/New_York_City.csv')

In [None]:
# Add a constant city column (every row in this file is labelled 'New York City')
listings['city'] = 'New York City'
# Preview the last rows to confirm the new column is present
listings.tail()

In [None]:
# List every column available in the listings dataset
listings.columns

In [None]:
# Identify columns whose null count exceeds 1500
null_check = listings.isna().sum()
remove_column_indices = []  # never populated or read in this notebook; kept for compatibility
# Filter with a boolean mask: integer-indexing a label-indexed Series
# (null_check[i]) is deprecated positional access in recent pandas.
remove_columns = null_check[null_check > 1500].index.tolist()
remove_columns

In [None]:
# Show the 20 columns with the highest null counts
null_check.sort_values(ascending=False).head(20)

In [None]:
# Drop all rows with null values
# listings.dropna(inplace=True)

In [None]:
# Convert t/f flags to 1/0. Anything that is not 't' (including nulls) becomes 0.
def _flag_to_int(value):
    """Return 1 for 't' (or an already-converted 1), otherwise 0.

    Accepting 1 makes the cell safe to re-run: without it, a second run
    would see only 1/0 values, match none of them against 't', and zero
    out the whole column.
    """
    return 1 if value in ('t', 1) else 0


for flag_column in ('host_is_superhost', 'instant_bookable'):
    listings[flag_column] = listings[flag_column].apply(_flag_to_int)

listings.head()

In [None]:
# Convert the price strings to floats in a new column (stored as float despite the name)
# regex=False: '$' is a regex end-of-string anchor, and the default value of
# `regex` differs between pandas versions, so request literal replacement explicitly.
listings['price_int'] = (
    listings['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
listings['price_int'].dtype

In [None]:
# Convert host acceptance rate from a percent string to float
acceptance_rate = listings['host_acceptance_rate']
# The column is overwritten in place, so skip the string handling when it is
# already numeric (e.g. the cell is re-run); .str would raise on a float column.
if not pd.api.types.is_numeric_dtype(acceptance_rate):
    acceptance_rate = acceptance_rate.str.replace('%', '', regex=False).astype(float)
listings['host_acceptance_rate'] = acceptance_rate

listings['host_acceptance_rate'].dtype

In [None]:
# Flag short-term rentals: 1 when the minimum stay is under 30 nights,
# 0 when it is 30 nights or more. Rows matching neither mask are left unset.
min_nights = listings['minimum_nights']
listings.loc[min_nights < 30, 'short-term'] = 1
listings.loc[min_nights >= 30, 'short-term'] = 0

In [None]:
# Re-list the columns now that the derived columns have been added
listings.columns

In [None]:
# Columns carried forward into the modelling frame
model_columns = [
    'host_acceptance_rate',
    'host_is_superhost', 'host_listings_count', 'short-term',
    'host_total_listings_count', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified',
    'neighbourhood_cleansed', 'latitude',
    'longitude', 'property_type', 'room_type', 'accommodates',
    'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
    'maximum_minimum_nights', 'minimum_maximum_nights',
    'maximum_maximum_nights', 'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm', 'number_of_reviews', 'neighbourhood_group_cleansed',
    'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'city', 'price_int',
]

# Independent frame holding only the selected columns
model_data = listings[model_columns].copy()

In [None]:
# Bucket each listing by its review count
n_reviews = model_data['number_of_reviews']
review_bins = [
    (n_reviews == 0, 'No reviews'),
    ((n_reviews > 0) & (n_reviews < 10), 'Less than 10'),
    ((n_reviews >= 10) & (n_reviews < 50), 'Between 10 and 50'),
    ((n_reviews >= 50) & (n_reviews < 200), 'Between 50 and 200'),
    (n_reviews >= 200, '200 or greater'),
]
for in_bin, bin_label in review_bins:
    model_data.loc[in_bin, 'review_count_bin'] = bin_label

In [None]:
# Checks whether listing has A/C or a washer/dryer and adds dummy columns,
# plus a count of amenities per listing

def _parse_amenities(raw):
    """Split one raw amenities value into a list of amenity names.

    Quotes are removed and the text is split on ', ', as before. In addition:
    - enclosing list/set brackets are stripped so the first and last items
      can match (NOTE(review): presumes values look like '["A", "B"]' -
      confirm against the CSV);
    - a non-string value (e.g. a null) yields an empty list instead of raising;
    - empty items are dropped, so an empty list counts as 0 amenities, not 1.
    """
    if not isinstance(raw, str):
        return []
    cleaned = raw.replace('"', '').strip().strip('[]{}')
    return [item.strip() for item in cleaned.split(', ') if item.strip()]


# Initialize empty lists
air_conditioning = []
washer_dryer = []
num_amenities = []

# Loops through each listing's amenities to look for air conditioning or washer/dryer
listings_amenities = list(model_data['amenities'])
for amenities in listings_amenities:
    amenities_list = _parse_amenities(amenities)
    air_conditioning.append(int('Air conditioning' in amenities_list))
    washer_dryer.append(int('Washer' in amenities_list or 'Dryer' in amenities_list))
    num_amenities.append(len(amenities_list))

# Creates dataframe columns. The Series are built on model_data's own index so
# values stay attached to their rows even if the index is not 0..n-1.
model_data['air_conditioning'] = pd.Series(air_conditioning, index=model_data.index)
model_data['washer_dryer'] = pd.Series(washer_dryer, index=model_data.index)
model_data['num_amenities'] = pd.Series(num_amenities, index=model_data.index)

In [None]:
# Ratio of bedrooms to guest capacity. The 'accomodates' spelling in the new
# column name is referenced again when model_data_10 is built, so it is left as-is.
model_data['bedrooms_per_accomodates'] = model_data['bedrooms'] / model_data['accommodates']

In [None]:
# Summary statistics for the numeric model columns (null rows are not dropped here)
# model_data.dropna(inplace=True)
model_data.describe()

In [None]:
# listings.to_csv('Data/Consolidated_Data.csv')

### Data Investigation and Outlier Identification

In [None]:
# Mean price and listing count per neighborhood; show the 20 least-populated
avg_price_by_neighborhood = (
    listings
    .groupby('neighbourhood_cleansed')['price_int']
    .agg(['mean', 'count'])
)
avg_price_by_neighborhood.sort_values('count').head(20)

In [None]:
# Keep only the neighborhoods that have more than 5 listings
has_enough_listings = avg_price_by_neighborhood['count'] > 5
neighborhoods_retain = avg_price_by_neighborhood[has_enough_listings]

In [None]:
# Mean price and listing count within each review-count bin
price_by_review_bin = model_data.groupby('review_count_bin')['price_int']
price_by_review_bin.agg(['mean', 'count'])

In [None]:
# Assemble boxplot data for plotting prices per borough
borough_column = 'neighbourhood_group_cleansed'
# dropna(): a missing borough label can never satisfy the == filter below and
# would only contribute an empty box
burroughs = list(listings[borough_column].dropna().unique())

boxplot_data = []
for burrough in burroughs:
    burrough_data = model_data.loc[model_data[borough_column] == burrough]
    # Missing prices are removed so the box statistics are computed on real values
    boxplot_data.append(burrough_data['price_int'].dropna())

# Price by borough
plt.boxplot(boxplot_data)
# Label the boxes through xticks (boxes sit at positions 1..N by default);
# this avoids the `labels` keyword, which newer Matplotlib deprecates.
plt.xticks(range(1, len(burroughs) + 1), burroughs)
plt.ylabel('Price')
plt.title('Listing Price by Borough');

In [None]:
# Highest listing price per borough, plus a cutoff at half of that maximum
max_price_by_burrough = (
    listings
    .groupby('neighbourhood_group_cleansed', as_index=False)['price_int']
    .max()
    .rename(columns={'price_int': 'max_burrough_price'})
)
max_price_by_burrough['price_cutoff'] = max_price_by_burrough['max_burrough_price'] / 2

In [None]:
# Mean price and listing count per property type
price_by_type = listings.groupby('property_type')['price_int'].agg(['mean', 'count'])

# Property types with 5 or fewer listings are marked for removal later
is_rare_type = price_by_type['count'] <= 5
drop_property_types = price_by_type.loc[is_rare_type].index
list(drop_property_types)

In [None]:
# Mean price and listing count per bedroom count; show the rarest bedroom counts
price_by_rooms = listings.groupby('bedrooms')['price_int'].agg(['mean', 'count'])
price_by_rooms.sort_values('count').head()

In [None]:
# Price against number of reviews, coloured by whether a listing has more than 20 reviews
# 'greater_than_20' is not created anywhere else in this notebook, so derive it
# here when missing (same > 20 threshold as the review filter in Outlier Removal).
if 'greater_than_20' not in listings.columns:
    listings['greater_than_20'] = (listings['number_of_reviews'] > 20).astype(int)

plt.scatter(x=listings['number_of_reviews'], y=listings['price_int'], c=listings['greater_than_20'])
plt.xlabel('Number of Reviews')
plt.ylabel('Price');

### Outlier Removal

In [None]:
# Keep listings with at most 10 bedrooms and a minimum stay under 365 nights.
# Rows where either comparison is not True (e.g. a missing value) are dropped too.
bedrooms_ok = model_data['bedrooms'] <= 10
min_stay_ok = model_data['minimum_nights'] < 365
model_data = model_data.loc[bedrooms_ok & min_stay_ok]

In [None]:
# Remove listings whose property type is in drop_property_types
# (the types with 5 or fewer listings, i.e. a count below 6)
model_data = model_data[~model_data['property_type'].isin(drop_property_types)]

In [None]:
# Remove listings priced at or above 50% of the max price for their borough
borough_key = 'neighbourhood_group_cleansed'
# Drop any cutoff columns left over from a previous run of this cell; otherwise
# the merge would emit _x/_y suffixed duplicates and the filter below would fail.
cutoff_columns = [column for column in max_price_by_burrough.columns if column != borough_key]
model_data = pd.merge(
    model_data.drop(columns=cutoff_columns, errors='ignore'),
    max_price_by_burrough,
    how='left',
    on=borough_key,
)
model_data = model_data.loc[model_data['price_int'] < model_data['price_cutoff']]
model_data.head()

In [None]:
# Limit to listings with more than 20 reviews (a listing with exactly 20 is excluded)
model_data = model_data.loc[model_data['number_of_reviews'] > 20]

### Modeling

#### Linear Model

In [None]:
# Accumulates one result dict per linear_model() call
results_tracker = []

def linear_model(data, random_state=1):
    """Fit a LinearRegression predicting ``price_int`` and record its scores.

    Rows containing nulls are dropped from a copy, so the frame passed in is
    left untouched. Every remaining column except ``price_int`` is used as a
    predictor, so ``data`` must already be fully numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_int`` column (the target).
    random_state : int, default 1
        Seed passed to ``train_test_split``.

    Side effects
    ------------
    Prints the train and test ``score()`` values and appends a dict with keys
    ``'train score'``, ``'test score'`` and ``'features'`` to the module-level
    ``results_tracker``. Returns ``None``.
    """
    # Non-mutating null removal (previously dropna(inplace=True) on the caller's frame)
    clean = data.dropna()

    # Assign X and y
    y = clean['price_int']
    X = clean.drop(columns='price_int').values

    # Split the data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Instantiate and train the model
    reg = LinearRegression()
    reg.fit(X_train, y_train)

    # Score once and reuse the values for both the printout and the record
    train_score = reg.score(X_train, y_train)
    test_score = reg.score(X_test, y_test)

    print(f'Train score:\t {train_score}')
    print(f'Test score:\t {test_score}')

    # Save the results into the results_tracker.
    # Note: 'features' lists every column of the frame, including the target price_int.
    results = {'train score' : train_score,
              'test score' : test_score,
              'features' : list(clean.columns)}
    results_tracker.append(results)

#### Run Linear Models

In [None]:
# Feature set 1: price against bedrooms and guest capacity.
# .copy() gives an independent frame, so later in-place cleaning of it does not
# raise SettingWithCopyWarning or reach back into model_data.
model_data_1 = model_data[['price_int', 'bedrooms', 'accommodates']].copy()

In [None]:
# Fit and score the linear model on feature set 1; scores are printed and appended to results_tracker
linear_model(model_data_1)

In [None]:
# Feature set 2: price against guest capacity and overall review score.
# .copy() gives an independent frame, so later in-place cleaning of it does not
# raise SettingWithCopyWarning or reach back into model_data.
model_data_2 = model_data[['price_int', 'accommodates', 'review_scores_rating']].copy()

In [None]:
# Fit and score the linear model on feature set 2; scores are printed and appended to results_tracker
linear_model(model_data_2)

In [None]:
# Feature set 3: host, room, size and review columns plus city;
# non-numeric columns are expanded into dummy columns
model_3_columns = [
    'host_is_superhost', 'room_type', 'accommodates', 'bedrooms',
    'number_of_reviews', 'review_scores_rating', 'city', 'price_int',
]
model_data_3 = pd.get_dummies(model_data[model_3_columns])

In [None]:
# Fit and score the linear model on feature set 3; scores are printed and appended to results_tracker
linear_model(model_data_3)

In [None]:
# Feature set 4: all review sub-scores, review-count bin, host and size columns;
# non-numeric columns are expanded into dummy columns
model_4_columns = [
    'price_int', 'review_count_bin', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'number_of_reviews', 'host_acceptance_rate',
    'review_scores_communication', 'review_scores_location',
    'instant_bookable', 'review_scores_value', 'host_is_superhost',
    'accommodates', 'bedrooms', 'review_scores_rating', 'room_type',
]
model_data_4 = pd.get_dummies(model_data[model_4_columns])

In [None]:
# Fit and score the linear model on feature set 4; scores are printed and appended to results_tracker
linear_model(model_data_4)

In [None]:
# Feature set 5: city, size, review, host and booking columns;
# non-numeric columns are expanded into dummy columns
model_5_columns = [
    'price_int', 'city', 'accommodates', 'bedrooms',
    'review_scores_rating', 'number_of_reviews',
    'host_is_superhost', 'instant_bookable', 'room_type',
]
model_data_5 = pd.get_dummies(model_data[model_5_columns])

In [None]:
# Fit and score the linear model on feature set 5; scores are printed and appended to results_tracker
linear_model(model_data_5)

In [None]:
# Feature set 6: borough, size, A/C flag, review and host columns;
# non-numeric columns are expanded into dummy columns
model_6_columns = [
    'price_int', 'neighbourhood_group_cleansed',
    'accommodates', 'bedrooms', 'air_conditioning',
    'review_scores_rating', 'review_count_bin',
    'host_is_superhost', 'host_identity_verified', 'room_type',
]
model_data_6 = pd.get_dummies(model_data[model_6_columns])

In [None]:
# Fit and score the linear model on feature set 6; scores are printed and appended to results_tracker
linear_model(model_data_6)

In [None]:
# Feature set 7: host, room, size and review columns plus borough;
# non-numeric columns are expanded into dummy columns
model_7_columns = [
    'host_is_superhost', 'room_type', 'accommodates', 'bedrooms',
    'number_of_reviews', 'review_scores_rating', 'price_int',
    'neighbourhood_group_cleansed',
]
model_data_7 = pd.get_dummies(model_data[model_7_columns])

In [None]:
# Fit and score the linear model on feature set 7; scores are printed and appended to results_tracker
linear_model(model_data_7)

In [None]:
# Feature set 8: borough, review sub-scores, acceptance rate, booking and size columns;
# non-numeric columns are expanded into dummy columns.
# The column order matters: the heatmap cell relabels model_data_8's columns by position.
model_8_columns = [
    'price_int', 'neighbourhood_group_cleansed',
    'review_scores_cleanliness', 'host_acceptance_rate',
    'review_scores_communication', 'review_scores_location',
    'instant_bookable', 'short-term', 'review_scores_checkin',
    'review_scores_value', 'accommodates', 'bedrooms',
    'review_scores_rating', 'room_type',
]
model_data_8 = pd.get_dummies(model_data[model_8_columns])

In [None]:
# Fit and score the linear model on feature set 8; scores are printed and appended to results_tracker
linear_model(model_data_8)

In [None]:
# Feature set 9: borough, every review sub-score, review-count bin, host,
# booking, A/C and size columns; non-numeric columns are expanded into dummy columns
model_9_columns = [
    'price_int', 'neighbourhood_group_cleansed', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_count_bin', 'host_acceptance_rate',
    'review_scores_communication', 'review_scores_location',
    'instant_bookable', 'short-term', 'air_conditioning',
    'review_scores_value', 'host_is_superhost', 'accommodates',
    'bedrooms', 'review_scores_rating', 'room_type',
]
model_data_9 = pd.get_dummies(model_data[model_9_columns])

In [None]:
# Fit and score the linear model on feature set 9; scores are printed and appended to results_tracker
linear_model(model_data_9)

In [None]:
# Feature set 10: borough, review-count bin, amenity-derived and size columns;
# non-numeric columns are expanded into dummy columns
model_10_columns = [
    'price_int', 'neighbourhood_group_cleansed',
    'review_count_bin', 'air_conditioning', 'accommodates',
    'instant_bookable', 'short-term', 'num_amenities',
    'host_identity_verified', 'bedrooms', 'bedrooms_per_accomodates',
    'room_type',
]
model_data_10 = pd.get_dummies(model_data[model_10_columns])

In [None]:
# Fit and score the linear model on feature set 10; scores are printed and appended to results_tracker
linear_model(model_data_10)

In [None]:
# Feature set 11: borough, selected review scores, review-count bin, A/C,
# booking and capacity columns; non-numeric columns are expanded into dummy columns
model_11_columns = [
    'price_int', 'neighbourhood_group_cleansed',
    'review_scores_cleanliness', 'review_count_bin', 'air_conditioning',
    'review_scores_location', 'instant_bookable', 'short-term',
    'accommodates', 'review_scores_value', 'review_scores_rating',
    'room_type',
]
model_data_11 = pd.get_dummies(model_data[model_11_columns])

In [None]:
# Fit and score the linear model on feature set 11; scores are printed and appended to results_tracker
linear_model(model_data_11)

#### Review Results

In [None]:
# De-duplicate the tracked runs (repeat executions append identical dicts),
# then rank them from highest to lowest training score
results_summary = []
for tracked_result in results_tracker:
    if tracked_result not in results_summary:
        results_summary.append(tracked_result)
results_summary = sorted(
    results_summary,
    key=lambda result: result['train score'],
    reverse=True,
)

# Print best result
best_result = results_summary[0]
print(f'Best training score: {best_result["train score"]}')
print(f'Number of features: {len(best_result["features"])}')
print(f'Best features: {best_result["features"]}')

In [None]:
# Columns recorded for the run with the second-highest training score
# (results_summary is sorted by training score, best first)
print(results_summary[1]['features'])

In [None]:
# Largest value of 'accommodates' among the listings still in model_data
model_data['accommodates'].unique().max()

#### Random Forest Regressor

In [None]:
# Accumulates one result dict per random_forests() call
rfr_results_tracker = []

def random_forests(data, random_state=9):
    """Fit a RandomForestRegressor predicting ``price_int`` and record its scores.

    Rows containing nulls are dropped from a copy, so the frame passed in is
    left untouched. Every remaining column except ``price_int`` is used as a
    predictor, so ``data`` must already be fully numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_int`` column (the target).
    random_state : int, default 9
        Seed used for both ``train_test_split`` and the forest itself.

    Side effects
    ------------
    Prints the train and test ``score()`` values and appends a dict with keys
    ``'train score'``, ``'test score'`` and ``'features'`` to the module-level
    ``rfr_results_tracker``. Returns ``None``.
    """
    # Non-mutating null removal (previously dropna(inplace=True) on the caller's frame)
    clean = data.dropna()

    # Assign X and y
    y = clean['price_int']
    X = clean.drop(columns='price_int').values

    # Split the data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Instantiate and train the model
    rfr = RandomForestRegressor(random_state=random_state)
    rfr.fit(X_train, y_train)

    # Score once and reuse the values for both the printout and the record
    train_score = rfr.score(X_train, y_train)
    test_score = rfr.score(X_test, y_test)

    print(f'Train score:\t {train_score}')
    print(f'Test score:\t {test_score}')

    # Save the results into the rfr_results_tracker.
    # Note: 'features' lists every column of the frame, including the target price_int.
    results = {'train score' : train_score,
              'test score' : test_score,
              'features' : list(clean.columns)}
    rfr_results_tracker.append(results)

#### Run Random Forest Models

In [None]:
# Fit and score a random forest on feature set 8; scores are printed and appended to rfr_results_tracker
random_forests(model_data_8)

In [None]:
# Fit and score a random forest on feature set 9; scores are printed and appended to rfr_results_tracker
random_forests(model_data_9)

In [None]:
# Fit and score a random forest on feature set 11; scores are printed and appended to rfr_results_tracker
random_forests(model_data_11)

### Run and Export Final Model

In [None]:
# Using model_data_8
# dropna() returns a new frame, so model_data_8 itself is not modified through
# the `data` alias (previously dropna(inplace=True) on the shared object).
data = model_data_8.dropna()

# Assign X and y
y = data['price_int']
X = data.drop(columns='price_int').values

# Split the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

# Instantiate and train the model; `rfr` stays at notebook scope so it can be exported
rfr = RandomForestRegressor(random_state=9)
rfr.fit(X_train, y_train)

print(f'Train score:\t {rfr.score(X_train,y_train)}')
print(f'Test score:\t {rfr.score(X_test,y_test)}')

In [None]:
# from joblib import dump, load
# dump(rfr, 'random_forest_model_8.joblib')

In [None]:
# Rename columns for heatmap
# Relabel a separate, null-free frame. Assigning .columns on a plain alias of
# model_data_8 would rename model_data_8 itself, removing 'price_int' and
# breaking any later (re-)run of a cell that models on it.
heatmap_data = model_data_8.dropna()
# Labels are applied by position, so they must match the column order of
# model_data_8 exactly (21 columns) - TODO confirm against model_data_8.columns
# whenever that feature list changes.
heatmap_data.columns = ['Price','Cleanliness Score','Host Acceptance Rate',
                             'Communication Score', 'Location Score', 'Instant Bookable',
                             'Short-term', 'Checkin Score', 'Value Score','Accommodates',
                             'Bedrooms', 'Overall Score', 'Borough - Bronx', 'Borough - Brooklyn',
                             'Borough - Manhattan', 'Borough - Queens', 'Borough - Staten Island',
                             'Entire home', 'Hotel room', 'Private room', 'Shared room']

In [None]:
# Draw the pairwise correlation matrix as a Seaborn heatmap and save it to disk
correlation_matrix = heatmap_data.corr()
fig, ax = plt.subplots()
sn.heatmap(correlation_matrix, ax=ax)
ax.set_title('Correlation Heatmap')
fig.savefig('images/correlation_matrix.png')