## Import Packages

In [1]:
import os
import pandas as pd, numpy as np, matplotlib.pyplot as plt, statsmodels.formula.api as smf
import seaborn as sns
os.getcwd()

'C:\\Users\\Kai Jing\\Desktop\\NUS\\Business (Accountancy)\\NUS BAC\\Sem 2.1\\DAO2702 - Programming for Business Analytics\\Group Project\\data2'

## Merging Data

In [None]:
# Load the twelve monthly 2017 listing exports and stack them into one frame.
# Each file is expected in the working directory as '<Month> 2017.csv'.
first_month = 'January'
remaining_months = ['February','March','April','May','June','July','August','September','October','November','December']

# Read every file first and concatenate once: calling pd.concat inside the
# loop copies the whole accumulated frame again for each added month.
monthly_frames = [pd.read_csv(str(month) + ' 2017.csv') for month in [first_month] + remaining_months]
df = pd.concat(monthly_frames, ignore_index=True)
df

## Selecting data variables to analyze

In [None]:
# Columns kept for the analysis ('accommodates' is added in front so the
# resulting column order matches the rest of the notebook).
col_update = ['amenities','bathrooms','beds','calendar_last_scraped','id','monthly_price','neighbourhood_cleansed','price','property_type','review_scores_communication','review_scores_location','review_scores_rating','review_scores_value','room_type']
# len(col_update)

# Select everything in one step instead of concatenating one Series at a time;
# .copy() keeps `data` independent of `df` so later column assignments are safe.
data = df[['accommodates'] + col_update].copy()

## Checking empty cells

In [None]:
# Number of missing values in each selected column.
data.isna().sum()

## Dropping rows with NaN data (missing data)

In [None]:
df_col = list(data.columns)
# df_col, len(df_col)

# Keep only rows that are complete in every column (a single dropna call
# replaces filtering the frame once per column).
data = data.dropna(subset=df_col)

# Should now report zero missing values for every column.
data.isna().sum()

In [None]:
# Renumber the rows 0..n-1 after the incomplete rows were removed.
data = data.reset_index(drop=True)
data

## Creating an average review score based on the review scores for communication, location, rating and value

In [None]:
# The overall rating is divided by 10 before it is averaged with the three
# sub-scores (presumably it is on a 0-100 scale while the others are 0-10 —
# TODO confirm against the data dictionary).
review_cols = ['review_scores_communication','review_scores_location','review_scores_rating','review_scores_value']
data['review_scores_rating'] = data['review_scores_rating'] / 10
data['review_score_average'] = data[review_cols].mean(axis=1)

# The individual scores are no longer needed once the average exists.
data = data.drop(columns=review_cols)

## Converting prices to integers

In [None]:
# Convert the currency strings (e.g. '$1,234.00') to whole-number prices.
# The '$' and thousands separators are stripped explicitly and the value is
# parsed as a number, rather than slicing fixed character positions: slicing
# silently produces a wrong price for any value not shaped exactly '$N.NN'.
# Fractional amounts are truncated, matching the previous behaviour.
for price_col in ('price', 'monthly_price'):
    cleaned = data[price_col].str.replace(r'[$,]', '', regex=True)
    data[price_col] = cleaned.astype(float).astype('int64')

##  Creating amenities count

In [None]:
# Number of comma-separated entries in the amenities string.
# NOTE(review): a string without any comma counts as 1, so an empty amenity
# set would also count as 1 — confirm whether such rows exist.
amenity_lists = data['amenities'].str.split(',')
data['amenities_count'] = amenity_lists.map(len)
data = data.drop(columns='amenities')

## Extracting the month in which the data was scraped

In [None]:
# Derive the abbreviated month name (e.g. 'Jan') from the scrape date; the
# date column itself is not kept.
scraped_on = pd.to_datetime(data['calendar_last_scraped'], format='%Y-%m-%d')
data['month_scraped'] = scraped_on.dt.strftime('%b')
data = data.drop(columns='calendar_last_scraped')

In [None]:
# Checking for correctness of amenities_count
# for i in range(len(data)):
#     data.loc[i,'amenities_count2'] = len(data['amenities'][i].split(','))
# print(sum(data['amenities_count'] == data['amenities_count2']))
# data.drop('amenities_count2',inplace=True,axis=1)

## Further data cleaning

In [None]:
#group by id
data_by_id = data.groupby('id')

#finding means of numerical variables
data2 = data_by_id[['price']].mean()
data2['mean_monthly_price'] = data_by_id[['monthly_price']].mean()
data2['mean_amenities'] = data_by_id[['amenities_count']].mean()
data2['mean_accommodates'] = data_by_id[['accommodates']].mean()
data2['mean_bathrooms'] = data_by_id[['bathrooms']].mean()
data2['mean_beds'] = data_by_id[['beds']].mean()
data2['mean_reviews'] = data_by_id[['review_score_average']].mean()

#extracting last month of catergorical variables
data2['neighbourhood'] = data_by_id[['neighbourhood_cleansed']].last()
data2['property_type'] = data_by_id[['property_type']].last()
data2['room_type'] = data_by_id[['room_type']].last()

data2.reset_index(inplace=True)
data2.head()

In [None]:
# Persist the per-listing table (without the row index) to 'data.csv'.
data2.to_csv('data.csv',index=False)

## Backward Stepwise Model

In [None]:
def back_sel(data, y, current_predictors):
    """Run one step of backward elimination by AIC.

    Fits one OLS model for every way of removing a single predictor from
    ``current_predictors`` and returns the fit with the lowest AIC.

    Args:
        data: DataFrame passed to ``smf.ols``.
        y: Name of the response column used on the left of the formula.
        current_predictors: Collection (set or list) of predictor names
            currently in the model. It is not modified.

    Returns:
        Tuple ``(model, predictors)``: the fitted results object with the
        lowest AIC and the reduced predictor collection it was fitted on.
        On ties the candidate visited last wins.

    Raises:
        ValueError: If ``current_predictors`` is empty, or if no candidate
            produced a comparable (non-NaN) AIC.
    """
    if not current_predictors:
        raise ValueError('back_sel needs at least one predictor to remove')

    # Start from +inf rather than a fixed large number so that models whose
    # AIC exceeds that number can still be selected.
    aic_k_best = float('inf')
    model_k_best = None
    predictors_k_best = None
    for predictor_to_remove in current_predictors:
        new_predictors = current_predictors.copy()
        new_predictors.remove(predictor_to_remove)
        if new_predictors:
            formula = y + ' ~ ' + ' + '.join(new_predictors)
        else:
            # Nothing left: fall back to the intercept-only model.
            formula = y + ' ~ 1'
        model = smf.ols(formula, data=data).fit()
        if model.aic <= aic_k_best:
            aic_k_best = model.aic
            model_k_best = model
            predictors_k_best = new_predictors

    if model_k_best is None:
        raise ValueError('no candidate model produced a comparable (non-NaN) AIC')
    return model_k_best, predictors_k_best

In [None]:
# Backward selection starts from the full model: every column of data2 except
# the listing id and the response ('price').
current_predictors = set(data2.columns) - {'id', 'price'}

full_formula = 'price ~ ' + ' + '.join(current_predictors)
model_full = smf.ols(full_formula, data=data2).fit()

# Best model seen so far (updated by the elimination loop).
aic_bwd, model_bwd, predictors_bwd = model_full.aic, model_full, current_predictors

In [None]:
# Remove one predictor per iteration until none remain, remembering the
# model with the lowest AIC along the way (ties go to the smaller model).
while current_predictors:
    model, current_predictors = back_sel(data2, 'price', current_predictors)
    print(len(current_predictors), model.aic, current_predictors)
    if not (model.aic <= aic_bwd):
        continue
    aic_bwd, model_bwd, predictors_bwd = model.aic, model, current_predictors
print('Best Model: ', model_bwd.aic, predictors_bwd)

In [None]:
# Regression summary of the lowest-AIC model found by backward selection.
model_bwd.summary()

In [None]:
# Pairwise correlation between the selected numeric predictors.
# Non-numeric predictors (e.g. neighbourhood) are excluded explicitly:
# DataFrame.corr raises on string columns in pandas >= 2.0 instead of
# silently dropping them.
data2[list(predictors_bwd)].select_dtypes(include='number').corr()

## Forward Stepwise Model

In [None]:
def forward_selection(data, y, predictors, current_predictors):
    """Run one step of forward selection by AIC.

    Fits one OLS model for every way of adding a single not-yet-used
    predictor to ``current_predictors`` and returns the fit with the lowest
    AIC.

    Args:
        data: DataFrame passed to ``smf.ols``.
        y: Name of the response column used on the left of the formula.
        predictors: Set of all candidate predictor names.
        current_predictors: Set of predictor names already in the model.
            It is not modified.

    Returns:
        Tuple ``(model, predictors)``: the fitted results object with the
        lowest AIC and the enlarged predictor set it was fitted on. On ties
        the candidate visited last wins.

    Raises:
        ValueError: If there is no predictor left to add, or if no candidate
            produced a comparable (non-NaN) AIC.
    """
    candidates = predictors - current_predictors
    if not candidates:
        raise ValueError('forward_selection has no predictor left to add')

    # Start from +inf rather than a fixed large number so that models whose
    # AIC exceeds that number can still be selected.
    aic_k_best = float('inf')
    model_k_best = None
    predictors_k_best = None
    for predictor_to_add in candidates:
        new_predictors = current_predictors.copy()
        new_predictors.add(predictor_to_add)
        formula = y + ' ~ ' + ' + '.join(new_predictors)
        model = smf.ols(formula, data=data).fit()
        if model.aic <= aic_k_best:
            aic_k_best = model.aic
            model_k_best = model
            predictors_k_best = new_predictors

    if model_k_best is None:
        raise ValueError('no candidate model produced a comparable (non-NaN) AIC')
    return model_k_best, predictors_k_best

# Step 1: start with Model 0 (intercept only). Candidate predictors are all
# columns of data2 except the listing id and the response.
predictors = set(data2.columns) - {'id', 'price'}
current_predictors = set()
model_0 = smf.ols('price ~ 1', data=data2).fit()
aic_fwd, model_fwd, predictors_fwd = model_0.aic, model_0, current_predictors

# Step 2: add one predictor per iteration until all are used, remembering the
# model with the lowest AIC along the way (ties go to the larger model).
while len(current_predictors) < len(predictors):
    model, current_predictors = forward_selection(data2, 'price', predictors, current_predictors)
    print(len(current_predictors), model.aic, current_predictors)
    if not (model.aic <= aic_fwd):
        continue
    aic_fwd, model_fwd, predictors_fwd = model.aic, model, current_predictors
print('Best Model: ', model_fwd.aic, predictors_fwd)

## Checking for multicollinearity 

In [None]:
# Pairwise correlation between the selected numeric predictors.
# Non-numeric predictors (e.g. neighbourhood) are excluded explicitly:
# DataFrame.corr raises on string columns in pandas >= 2.0 instead of
# silently dropping them.
data2[list(predictors_fwd)].select_dtypes(include='number').corr()

In [None]:
# Regression summary of the lowest-AIC model found by forward selection.
model_fwd.summary()

## Best Subset Model

In [None]:
# Candidate predictors: every column of data2 except the response and the id.
data2_predictors = [column for column in data2.columns if column not in ('price', 'id')]
p = len(data2_predictors)
import itertools
def best_subset(data, y, predictors, k):
    """Find the lowest-AIC OLS model that uses exactly ``k`` predictors.

    Args:
        data: DataFrame passed to ``smf.ols``.
        y: Name of the response column used on the left of the formula.
        predictors: Sequence of candidate predictor names.
        k: Number of predictors the model must contain. ``k == 0`` fits the
            intercept-only model.

    Returns:
        Tuple ``(model, predictors)``: the fitted results object and the
        predictors it uses (an empty set for ``k == 0``, otherwise a tuple).
        On ties the combination visited last wins.

    Raises:
        ValueError: If no combination of size ``k`` exists (for example
            ``k`` larger than the number of predictors), or if no candidate
            produced a comparable (non-NaN) AIC.
    """
    if k == 0:
        return smf.ols(y + ' ~ 1', data=data).fit(), set()

    # Start from +inf rather than a fixed large number so that models whose
    # AIC exceeds that number can still be selected.
    aic_k_best = float('inf')
    model_k_best = None
    predictors_k_best = None
    for predictor_set in itertools.combinations(predictors, k):
        formula_k = y + ' ~ ' + ' + '.join(predictor_set)
        model_k = smf.ols(formula_k, data=data).fit()
        if model_k.aic <= aic_k_best:
            aic_k_best = model_k.aic
            model_k_best = model_k
            predictors_k_best = predictor_set

    if model_k_best is None:
        raise ValueError('no model with k=%r predictors could be selected' % (k,))
    return model_k_best, predictors_k_best

In [None]:
# Compare the best model of every size k = 0..p and keep the one with the
# lowest AIC. The running minimum starts at +inf (not a fixed large number)
# so a winner is always recorded, whatever the scale of the AIC values.
aic_best = float('inf')
for k in range(p + 1):
    model_k, predictors_k = best_subset(data2, 'price', data2_predictors, k)
    print(k, model_k.aic, predictors_k)
    if model_k.aic <= aic_best:
        aic_best = model_k.aic
        k_best = k
        model_best = model_k
        predictors_best = predictors_k
print('Best Model: ', k_best, model_best.aic, predictors_best)

In [None]:
# Regression summary of the lowest-AIC model found by best-subset selection.
model_best.summary()

In [None]:
# Pairwise correlation between the selected numeric predictors.
# Non-numeric predictors (e.g. neighbourhood) are excluded explicitly:
# DataFrame.corr raises on string columns in pandas >= 2.0 instead of
# silently dropping them.
data2[list(predictors_best)].select_dtypes(include='number').corr()

## Removing variable with multicollinearity

In [None]:
# Remove 'mean_beds' from the candidate predictors. errors='ignore' keeps the
# cell re-runnable: a second run finds the column already gone instead of
# raising KeyError.
data2 = data2.drop(columns='mean_beds', errors='ignore')
data2.head()

## Running Best Subset Model again

In [None]:
# Rebuild the candidate list from the current columns of data2 (response and
# id excluded).
data2_predictors = [column for column in data2.columns if column not in ('price', 'id')]
p = len(data2_predictors)
import itertools
# NOTE(review): this redefines best_subset, which is already defined in the
# "Best Subset Model" section; consider deleting this copy and reusing that one.
def best_subset(data, y, predictors, k):
    """Find the lowest-AIC OLS model that uses exactly ``k`` predictors.

    Args:
        data: DataFrame passed to ``smf.ols``.
        y: Name of the response column used on the left of the formula.
        predictors: Sequence of candidate predictor names.
        k: Number of predictors the model must contain. ``k == 0`` fits the
            intercept-only model.

    Returns:
        Tuple ``(model, predictors)``: the fitted results object and the
        predictors it uses (an empty set for ``k == 0``, otherwise a tuple).
        On ties the combination visited last wins.

    Raises:
        ValueError: If no combination of size ``k`` exists (for example
            ``k`` larger than the number of predictors), or if no candidate
            produced a comparable (non-NaN) AIC.
    """
    if k == 0:
        return smf.ols(y + ' ~ 1', data=data).fit(), set()

    # Start from +inf rather than a fixed large number so that models whose
    # AIC exceeds that number can still be selected.
    aic_k_best = float('inf')
    model_k_best = None
    predictors_k_best = None
    for predictor_set in itertools.combinations(predictors, k):
        formula_k = y + ' ~ ' + ' + '.join(predictor_set)
        model_k = smf.ols(formula_k, data=data).fit()
        if model_k.aic <= aic_k_best:
            aic_k_best = model_k.aic
            model_k_best = model_k
            predictors_k_best = predictor_set

    if model_k_best is None:
        raise ValueError('no model with k=%r predictors could be selected' % (k,))
    return model_k_best, predictors_k_best

# Compare the best model of every size k = 0..p and keep the one with the
# lowest AIC. The running minimum starts at +inf (not a fixed large number)
# so a winner is always recorded, whatever the scale of the AIC values.
aic_best = float('inf')
for k in range(p + 1):
    model_k, predictors_k = best_subset(data2, 'price', data2_predictors, k)
    print(k, model_k.aic, predictors_k)
    if model_k.aic <= aic_best:
        aic_best = model_k.aic
        k_best = k
        model_best = model_k
        predictors_best = predictors_k
print('Best Model: ', k_best, model_best.aic, predictors_best)

In [None]:
# Regression summary of the lowest-AIC model from the re-run best-subset search.
model_best.summary()

In [None]:
# Pairwise correlation between the selected numeric predictors.
# Non-numeric predictors (e.g. neighbourhood) are excluded explicitly:
# DataFrame.corr raises on string columns in pandas >= 2.0 instead of
# silently dropping them.
data2[list(predictors_best)].select_dtypes(include='number').corr()

# Comparing price and numerical predictors 

### Price per Amenity 

In [None]:
# Drop listings whose mean bathroom count is zero (or missing) so that the
# later price-per-bathroom ratio never divides by zero.
# The mask is built on a separate Series rather than calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form triggers a pandas warning and does not update data2 under Copy-on-Write.
bathrooms_nonzero = data2['mean_bathrooms'].replace(0, np.nan)
data2 = data2[bathrooms_nonzero.notna()].reset_index(drop=True)
data2

In [None]:
# Mean price divided by the mean number of amenities, per listing.
data2['price_per_amenity'] = data2['price'].div(data2['mean_amenities'])
data2

#### Rank neighbourhoods by Price per Amenity

In [None]:
# Mean price per amenity in each neighbourhood, cheapest first.
rank_amenities = (
    data2.groupby('neighbourhood')['price_per_amenity']
    .mean()
    .sort_values()
    .reset_index()
)

# Score the neighbourhoods 22, 21, 20, ... in that order (cheapest gets the
# highest score). Assigned as one vector: the element-wise chained form
# frame[col][i] = value raises SettingWithCopyWarning and is a no-op under
# Copy-on-Write.
# NOTE(review): 22 is presumably the number of neighbourhoods — confirm.
rank_amenities['amenity_rank'] = 22 - np.arange(len(rank_amenities))
rank_amenities

In [None]:
# Scatter of price against mean amenity count with a fitted regression line;
# the y-axis is capped at 2250.
ax = sns.regplot(x='mean_amenities', y='price', data=data2)
ax.set_ylim(0, 2250)

In [None]:
# Plot Neighbourhood against Price per Amenity
plt.figure(figsize=(15,10))
plt.bar(rank_amenities['neighbourhood'],rank_amenities['price_per_amenity'])
plt.title("Price per Amenity in each Neighbourhood")
plt.xlabel('Neighbourhood')
plt.ylabel('Price per Amenity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

### Price per Accommodates

In [None]:
# Mean price divided by the mean number of guests accommodated, per listing.
data2['price_per_accommodates'] = data2['price'].div(data2['mean_accommodates'])
data2[['price_per_accommodates']]

#### Rank neighbourhoods by Price per Accommodates

In [None]:
# Mean price per guest accommodated in each neighbourhood, cheapest first.
rank_accommodates = (
    data2.groupby('neighbourhood')['price_per_accommodates']
    .mean()
    .sort_values()
    .reset_index()
)

# Score the neighbourhoods 22, 21, 20, ... in that order (cheapest gets the
# highest score). Assigned as one vector: the element-wise chained form
# frame[col][i] = value raises SettingWithCopyWarning and is a no-op under
# Copy-on-Write.
# NOTE(review): 22 is presumably the number of neighbourhoods — confirm.
rank_accommodates['accommodates_rank'] = 22 - np.arange(len(rank_accommodates))
rank_accommodates

In [None]:
# Scatter of price against mean guests accommodated with a fitted regression line.
sns.regplot(x='mean_accommodates', y='price', data=data2)

In [None]:
# Plot Neighbourhood against Price per Accommodates
plt.figure(figsize=(15,10))
plt.bar(rank_accommodates['neighbourhood'],rank_accommodates['price_per_accommodates'])
plt.title("Price per Accommodates in each Neighbourhood")
plt.xlabel('Neighbourhood')
plt.ylabel('Price per Accommodates')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

### Price per bathroom

In [None]:
# Mean price divided by the mean number of bathrooms, per listing.
data2['price_per_bathroom'] = data2['price'].div(data2['mean_bathrooms'])
data2[['price_per_bathroom']]

#### Rank neighbourhoods by Price per Bathroom

In [None]:
# Mean price per bathroom in each neighbourhood, cheapest first.
rank_bathrooms = (
    data2.groupby('neighbourhood')['price_per_bathroom']
    .mean()
    .sort_values()
    .reset_index()
)

# Score the neighbourhoods 22, 21, 20, ... in that order (cheapest gets the
# highest score). Assigned as one vector: the element-wise chained form
# frame[col][i] = value raises SettingWithCopyWarning and is a no-op under
# Copy-on-Write.
# NOTE(review): 22 is presumably the number of neighbourhoods — confirm.
rank_bathrooms['bathrooms_rank'] = 22 - np.arange(len(rank_bathrooms))
rank_bathrooms

In [None]:
# Scatter of price against mean bathroom count with a fitted regression line.
sns.regplot(x='mean_bathrooms', y='price', data=data2)

In [None]:
# Plot Neighbourhood against Price per Bathroom
plt.figure(figsize=(15,10))
plt.bar(rank_bathrooms['neighbourhood'],rank_bathrooms['price_per_bathroom'])
plt.title("Price per Bathroom in each Neighbourhood")
plt.xlabel('Neighbourhood')
plt.ylabel('Price per Bathroom')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

### Price vs Review Score

In [None]:
# Sanity check: print the Python type of one mean_reviews value (row label 1).
print(type(data2['mean_reviews'][1]))

In [None]:
# rank_review = (data2.groupby('neighbourhood')['mean_reviews']).count('10.000000')
# # .filter(lambda x: int(x)>9.99999)
# # rank_review['number of max review score'] = rank_review['neighbourhood'][:191].value_counts()
# rank_review
# Count, per neighbourhood, the listings whose mean review score exceeds 9.4.
# Rows are sorted by score first so that ties in the counts keep the same
# order as before.
rank_review = data2[['neighbourhood', 'mean_reviews']].sort_values('mean_reviews', ascending=False).reset_index(drop=True)
rank_review['mean_reviews'] = rank_review['mean_reviews'].astype(float)
rank_review = rank_review[rank_review['mean_reviews'] > 9.4]
rank_review = rank_review['neighbourhood'].value_counts().to_frame(name='max_review_count').reset_index()
rank_review.columns = ['neighbourhood', 'max_review_count']

# Score the neighbourhoods 22, 21, 20, ... from the highest count down.
# Assigned as one vector: the element-wise chained form frame[col][i] = value
# raises SettingWithCopyWarning and is a no-op under Copy-on-Write.
# NOTE(review): 22 is presumably the number of neighbourhoods — confirm.
rank_review['review_rank'] = 22 - np.arange(len(rank_review))
rank_review

### Ranking of Neighbourhood

In [None]:
# Combine the four per-neighbourhood scores into one table.
overall_rank = rank_accommodates[['neighbourhood','accommodates_rank']]
overall_rank = pd.merge(overall_rank, rank_amenities[['neighbourhood','amenity_rank']],on='neighbourhood')
overall_rank = pd.merge(overall_rank, rank_bathrooms[['neighbourhood', 'bathrooms_rank']],on = 'neighbourhood')
# Outer join: neighbourhoods with no listing above the review threshold have
# no review score yet and are filled in below.
overall_rank = pd.merge(overall_rank, rank_review[['neighbourhood','review_rank']],on='neighbourhood',how='outer')

# Default review score for neighbourhoods missing from rank_review.
# Re-assigned rather than fillna(..., inplace=True) on a column selection,
# which does not update the frame under Copy-on-Write.
# NOTE(review): the default of 5 is a modelling choice — confirm.
overall_rank['review_rank'] = overall_rank['review_rank'].fillna(5)

# Sum the four scores; the cast makes the sum numeric even if a score column
# arrives with object dtype.
rank_cols = ['accommodates_rank', 'amenity_rank', 'bathrooms_rank', 'review_rank']
overall_rank['overall_ranking'] = overall_rank[rank_cols].astype(float).sum(axis=1)
overall_rank = overall_rank.sort_values('overall_ranking',ascending=False).reset_index(drop=True)
overall_rank

### Top 3 Neighbourhoods

In [None]:
# The three highest-scoring neighbourhoods (overall_rank is sorted best first).
top_neighbourhood, second_neighbourhood, third_neighbourhood = (
    overall_rank.loc[position, 'neighbourhood'] for position in range(3)
)

# One price boxplot per neighbourhood, sharing the y-axis for comparison.
f, axarr = plt.subplots(1,3, sharey=True)
axarr[1].set_title('Prices of Top 3 Neighbourhoods')
for panel, neighbourhood_name in zip(axarr, (top_neighbourhood, second_neighbourhood, third_neighbourhood)):
    panel.boxplot(data2.loc[data2['neighbourhood'] == neighbourhood_name, 'price'])
    panel.set_xlabel(neighbourhood_name)
    panel.set_xticks([])

In [None]:
# Average listing price in each neighbourhood.
data3 = data2.groupby('neighbourhood')['price'].mean().to_frame(name='mean_price').reset_index()

fig, ax = plt.subplots(figsize=(15, 10))
ax.bar(data3.iloc[:, 0], data3.iloc[:, 1])
ax.set_title("Average Price in each Neighbourhood")
ax.set_xlabel('Neighbourhood')
ax.set_ylabel('Average Price')
# Vertical tick labels so the neighbourhood names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()