### Data Processing

In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the raw listings CSV for each city (the Austin load is currently disabled)
nyc_listings = pd.read_csv('Data/New_York_City.csv')
denver_listings = pd.read_csv('Data/Denver.csv')
# austin_listings = pd.read_csv('Data/Austin.csv')
sf_listings = pd.read_csv('Data/San_Francisco.csv')

In [None]:
# Label every row of each city's frame with the name of that city
for city_frame, city_name in (
    (denver_listings, 'Denver'),
    (sf_listings, 'San Francisco'),
    (nyc_listings, 'New York City'),
):
    city_frame['city'] = city_name

In [None]:
# Choose the dataset to analyse. Only the New York City listings are used;
# the Denver / San Francisco merge below is commented out.
# NOTE: `listings` is bound to the same object as `nyc_listings` (no copy is
# made), so columns added to `listings` later also appear on `nyc_listings`.
# listings = pd.merge(denver_listings,sf_listings,how='outer')
listings = nyc_listings
listings.tail()

In [None]:
# List every column available in the selected listings dataset
listings.columns

In [None]:
# Identify columns whose number of null values exceeds 1500
null_check = listings.isna().sum()
remove_column_indices = []
# Select by boolean mask instead of looping with null_check[i]: integer keys on
# a label-indexed Series are deprecated as positional lookups in recent pandas,
# and the mask keeps the column names in their original order.
remove_columns = null_check[null_check > 1500].index.tolist()
remove_columns

In [None]:
# Show the 20 columns with the highest null counts, largest first
null_check.sort_values(ascending=False).head(20)

In [None]:
# Drop all rows with null values
# listings.dropna(inplace=True)

In [None]:
# Encode the 't'/'f' flag columns as integers: 't' becomes 1 and every other
# value (including missing ones) becomes 0
for flag_column in ('host_is_superhost', 'instant_bookable'):
    listings[flag_column] = listings[flag_column].eq('t').astype('int64')
listings.head()

In [None]:
# Convert the price strings to floats by stripping '$' and ',' first.
# regex=False is passed explicitly: '$' is the end-of-string anchor when read
# as a regular expression, and the default of `regex` differs between pandas
# versions, so a literal match must not depend on that default.
listings['price_int'] = (
    listings['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
listings['price_int'].dtype

In [None]:
# Turn the host acceptance rate strings into floats by removing the '%' sign
acceptance_rate_text = listings['host_acceptance_rate'].str.replace('%', '')
listings['host_acceptance_rate'] = acceptance_rate_text.astype(float)

listings['host_acceptance_rate'].dtype

In [None]:
# Flag rentals by minimum stay: 1 when it is under 30 nights (short-term),
# 0 when it is 30 nights or more (long-term). Rows matching neither
# comparison are left unset.
is_short_term = listings['minimum_nights'].lt(30)
is_long_term = listings['minimum_nights'].ge(30)
listings.loc[is_short_term, 'short-term'] = 1
listings.loc[is_long_term, 'short-term'] = 0

In [None]:
# Re-list the columns now that the derived ones (e.g. 'price_int', 'short-term') exist
listings.columns

In [None]:
# Columns carried forward into the modelling frame, in their original order
model_columns = [
    'host_acceptance_rate',
    'host_is_superhost',
    'host_listings_count',
    'short-term',
    'host_total_listings_count',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'amenities',
    'price',
    'minimum_nights',
    'maximum_nights',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'number_of_reviews',
    'neighbourhood_group_cleansed',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'city',
    'price_int',
]

# Independent frame holding only the modelling columns
model_data = listings[model_columns].copy()

In [None]:
# Ellen still working on this--might not be worth pursuing
# NOTE(review): len(list(...)) counts the items produced by iterating a cell
# value; if 'amenities' is stored as a string this is a character count rather
# than an amenity count -- confirm the column format before relying on it.
listings_amenities = list(model_data['amenities'])

# Only the first listing is inspected for now
number_of_amenities = [len(list(entry)) for entry in listings_amenities[:1]]
for amenities in listings_amenities[:1]:
    print(amenities)
number_of_amenities

In [None]:
# Discard every row that has a missing value in any column, then summarize
model_data = model_data.dropna()
model_data.describe()

In [None]:
# listings.to_csv('Data/Consolidated_Data.csv')

### Data Investigation

In [None]:
# Mean price and count of non-null prices per neighborhood, highest mean first
avg_price_by_neighborhood = (
    listings
    .groupby(['neighbourhood_cleansed'])['price_int']
    .agg(['mean', 'count'])
)
avg_price_by_neighborhood.sort_values(by='mean', ascending=False)

In [None]:
# Keep only the neighborhoods whose listing count is greater than 5
has_enough_listings = avg_price_by_neighborhood['count'] > 5
neighborhoods_retain = avg_price_by_neighborhood.loc[has_enough_listings]

#### Linear Model

In [None]:
# Collects one result dict per linear_model() call
results_tracker = []

def linear_model(data):
    """Fit a linear regression that predicts ``price_int`` and record its scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_int`` column, which is used as the target.
        Every other column is used as a feature. The frame passed in is not
        modified.

    Returns
    -------
    None
        The train and test scores are printed, and a dict with the keys
        ``'train score'``, ``'test score'`` and ``'features'`` is appended to
        the module-level ``results_tracker`` list.
    """
    # Drop rows with nulls into a new frame rather than in place, so the
    # caller's DataFrame (typically a slice of a larger frame) is left untouched
    data = data.dropna()

    # Assign X and y
    y = data['price_int']
    X = data.drop(columns='price_int').values

    # Split the data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Instantiate and train the model
    reg = LinearRegression()
    reg.fit(X_train, y_train)

    # Score each split once and reuse the values for printing and recording
    train_score = reg.score(X_train, y_train)
    test_score = reg.score(X_test, y_test)

    print(f'Train score:\t {train_score}')
    print(f'Test score:\t {test_score}')

    # Save the results into the results_tracker.
    # Note: 'features' lists every column of `data`, including the target 'price_int'.
    results = {'train score' : train_score,
               'test score' : test_score,
               'features' : list(data.columns)}
    results_tracker.append(results)

#### Run Linear Models

In [None]:
# Model 1: price from bedrooms and accommodates
model_1_columns = ['price_int', 'bedrooms', 'accommodates']
model_data_1 = model_data[model_1_columns]
linear_model(model_data_1)

In [None]:
# Model 2: price from accommodates and the overall review rating
model_2_columns = ['price_int', 'accommodates', 'review_scores_rating']
model_data_2 = model_data[model_2_columns]
linear_model(model_data_2)

In [None]:
# Model 3: host, room, size, review and city columns, one-hot encoded
model_3_columns = [
    'host_is_superhost',
    'room_type',
    'accommodates',
    'bedrooms',
    'number_of_reviews',
    'review_scores_rating',
    'city',
    'price_int',
]
model_data_3 = pd.get_dummies(model_data[model_3_columns])
linear_model(model_data_3)

In [None]:
# Model 4: neighbourhood plus all review scores and host/listing columns, one-hot encoded
model_4_columns = [
    'price_int',
    'neighbourhood_cleansed',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'number_of_reviews',
    'host_acceptance_rate',
    'review_scores_communication',
    'review_scores_location',
    'instant_bookable',
    'review_scores_value',
    'host_is_superhost',
    'accommodates',
    'bedrooms',
    'review_scores_rating',
    'room_type',
]
model_data_4 = pd.get_dummies(model_data[model_4_columns])
linear_model(model_data_4)

In [None]:
# Model 5: city, size, review, host and booking columns, one-hot encoded
model_5_columns = [
    'price_int',
    'city',
    'accommodates',
    'bedrooms',
    'review_scores_rating',
    'number_of_reviews',
    'host_is_superhost',
    'instant_bookable',
    'room_type',
]
model_data_5 = pd.get_dummies(model_data[model_5_columns])
linear_model(model_data_5)

In [None]:
# Model 6: the model 5 columns plus the 'short-term' flag, one-hot encoded
model_6_columns = [
    'price_int',
    'city',
    'accommodates',
    'bedrooms',
    'review_scores_rating',
    'number_of_reviews',
    'host_is_superhost',
    'instant_bookable',
    'room_type',
    'short-term',
]
model_data_6 = pd.get_dummies(model_data[model_6_columns])
linear_model(model_data_6)

In [None]:
# Model 7: host, room, size and review columns plus the neighbourhood group, one-hot encoded
model_7_columns = [
    'host_is_superhost',
    'room_type',
    'accommodates',
    'bedrooms',
    'number_of_reviews',
    'review_scores_rating',
    'price_int',
    'neighbourhood_group_cleansed',
]
model_data_7 = pd.get_dummies(model_data[model_7_columns])
linear_model(model_data_7)

In [None]:
# Model 8: neighbourhood group, all review scores, host/listing columns and
# the 'short-term' flag, one-hot encoded
model_8_columns = [
    'price_int',
    'neighbourhood_group_cleansed',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'number_of_reviews',
    'host_acceptance_rate',
    'review_scores_communication',
    'review_scores_location',
    'instant_bookable',
    'short-term',
    'review_scores_value',
    'host_is_superhost',
    'accommodates',
    'bedrooms',
    'review_scores_rating',
    'room_type',
]
model_data_8 = pd.get_dummies(model_data[model_8_columns])
linear_model(model_data_8)

#### Review Results

In [None]:
# De-duplicate the recorded runs (keeping the first occurrence of each), then
# rank them by training score, best first
unique_results = []
for result in results_tracker:
    if result not in unique_results:
        unique_results.append(result)
results_summary = sorted(unique_results,
                         key=lambda result: result['train score'],
                         reverse=True)

# Report the top-ranked run
print(f'Best training score: {results_summary[0]["train score"]}')
print(f'Best features: {results_summary[0]["features"]}')

#### Random Forest Regressor

In [None]:
# Collects one result dict per random_forests() call
rfr_results_tracker = []

def random_forests(data):
    """Fit a random forest regressor that predicts ``price_int`` and record its scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_int`` column, which is used as the target.
        Every other column is used as a feature. Rows with null values are
        not removed here.

    Returns
    -------
    None
        The train and test scores are printed, and a dict with the keys
        ``'train score'``, ``'test score'`` and ``'features'`` is appended to
        the module-level ``rfr_results_tracker`` list.
    """
    # Assign X and y
    y = data['price_int']
    X = data.drop(columns='price_int').values

    # Split the data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Instantiate and train the model
    rfr = RandomForestRegressor(random_state=9)
    rfr.fit(X_train, y_train)

    # Score each split once and reuse the values for printing and recording
    train_score = rfr.score(X_train, y_train)
    test_score = rfr.score(X_test, y_test)

    print(f'Train score:\t {train_score}')
    print(f'Test score:\t {test_score}')

    # Save the results into the rfr_results_tracker.
    # Note: 'features' lists every column of `data`, including the target 'price_int'.
    results = {'train score' : train_score,
               'test score' : test_score,
               'features' : list(data.columns)}
    rfr_results_tracker.append(results)

#### Run Random Forests Models

In [None]:
# Fit the random forest on the same frame that was passed to linear model 8
random_forests(model_data_8)