# Data Analysis of Airbnb Data - New York City
## Importing the Libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import random

np.random.seed(42)
random.seed(42)

## Reading the dataset

In [None]:
# Load the Airbnb listings CSV into a DataFrame.
# NOTE(review): the path is absolute and tied to one Windows user profile; it has to be
# adjusted before this cell can run on any other machine.
newyork = pd.read_csv('C:/Users/Harraj/Documents/GitHub/AirbnbSoen499/data/AB_NYC_2019.csv')
# Preview the first five rows.
newyork.head(5)


## Checking for Null values in the dataset

In [None]:
#finding out if there are any null or empty values
newyork.isnull().sum()

## Summary of the dataset


In [None]:
#obtaining the description of the dataframe
newyork.describe()

In [None]:
#obtain information about the dataframe
newyork.info()

## Data Processing

In [None]:
# Remove the columns that the rest of the analysis does not use.
newyork = newyork.drop(columns=['host_name', 'id', 'host_id', 'name'])

In [None]:
# 'last_review' and 'reviews_per_month' are null in more than 10,000 rows, so both columns are dropped.
newyork = newyork.drop(columns=['last_review', 'reviews_per_month'])

In [None]:
newyork.head()

In [None]:
newyork.isnull().sum()

# Data Analysis

## Price Distribution

In [None]:

# Wide figure for the distribution of listing prices.
plt.figure(figsize=(20,6))
# rug=True additionally draws one tick per observation along the axis.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot — confirm
# against the installed seaborn version before upgrading the library.
sb.distplot(newyork['price'], rug=True)

In [None]:
print(sorted(newyork['neighbourhood_group'].unique()))

In [None]:
newyork.neighbourhood_group.value_counts()
#Number of Airbnb listings in each neighbourhood group.

## Price Distribution in Neighbourhood Groups

In [None]:
# Violin plot of price per neighbourhood group, using only listings priced below 1000
# (presumably to keep extreme prices from dominating the plot).
sub_df = newyork.loc[newyork['price'] < 1000]
fig, ax = plt.subplots(figsize=(10, 8))
plot_2 = sb.violinplot(x='neighbourhood_group', y='price', data=sub_df)
plot_2.set_title('Density and distribution of prices for each neighberhood_group')

In [None]:
# Average of every numeric column for each (neighbourhood group, room type) pair.
# numeric_only=True keeps text columns such as 'neighbourhood' out of the aggregation;
# recent pandas versions raise a TypeError when asked to average them.
room_types_neighbourhoods = (
    newyork.groupby(['neighbourhood_group', 'room_type']).mean(numeric_only=True)
)
print(room_types_neighbourhoods)

## Price Distribution between different room types

In [None]:
# Box plot of price for each room type, limited to listings priced below 1000.
sub_df = newyork.loc[newyork['price'] < 1000]
plt.figure(figsize=(12, 6))
sb.boxplot(data=sub_df, x='room_type', y='price')


## Top 10 neighbourhoods with the highest number of Airbnb listings

In [None]:
# Count the listings per neighbourhood (sorted most frequent first) and keep the first ten.
top10_freq_neighbourhood = newyork['neighbourhood'].value_counts().iloc[:10]
print(top10_freq_neighbourhood)


## Price distribution across the top 10 neighbourhoods in New York

In [None]:
# Keep only the listings in the ten neighbourhoods with the most listings. The names are
# derived from the data rather than typed by hand, so the selection cannot drift out of
# sync with the dataset.
top10_freq_neighbourhood_data = newyork[
    newyork['neighbourhood'].isin(newyork['neighbourhood'].value_counts().head(10).index)
]
# One panel per room type; each point is the price of one listing in that neighbourhood.
t = sb.catplot(x="neighbourhood", y="price", col="room_type", data=top10_freq_neighbourhood_data)
# Rotate the neighbourhood names on the x axis by 45 degrees.
t.set_xticklabels(rotation=45)

## Number of listings in each neighbourhood group

In [None]:
# Bar chart of the number of listings in each neighbourhood group.
fig, ax = plt.subplots(figsize=(10, 8))
# Name the column through x= and pass the frame as data=: newer seaborn releases no
# longer accept the plotted variable as a bare positional argument. Passing ax draws
# the bars on the figure created above.
sb.countplot(x='neighbourhood_group', data=newyork, ax=ax)

## Longitude vs Latitude (representing price of airbnb in newyork)

In [None]:
# Scatter every listing at its longitude/latitude, coloured by price with the 'cool'
# colormap; alpha=0.5 makes overlapping points semi-transparent.
newyork.plot.scatter(x='longitude', y='latitude', c='price', figsize=(10,10), cmap='cool', alpha=0.5);


## Longitude vs Latitude
### Dividing the data into costly, medium, reasonable, cheap, and very cheap.

In [None]:
def _price_category(price):
    """Return the price band label for a single price value.

    The bands are tested from the most expensive downwards, so every price
    falls into exactly one band. A price of exactly 3000 is 'medium' rather
    than slipping past every range, and anything below 100 is 'very cheap'.
    """
    if price > 3000:
        return 'costly'
    if price >= 1000:
        return 'medium'
    if price >= 500:
        return 'reasonable'
    if price >= 100:
        return 'cheap'
    return 'very cheap'


# Label every listing with its price band.
newyork['Category'] = newyork['price'].apply(_price_category)

plt.figure(figsize=(10,8))

# Longitude goes on x and latitude on y, matching the price-coloured map above. The
# columns are passed by name as keywords because newer seaborn releases reject
# positional x/y arguments.
sb.scatterplot(x='longitude', y='latitude', hue='Category', data=newyork)

## Relationship between Price and total number of reviews

In [None]:
# Scatter of price against the log of the review count; 1 is added before taking the
# log so that listings with zero reviews map to 0.
plt.figure(figsize=(20, 8))
plt.title('Price vs log(reviews)')
plt.scatter(x=np.log(newyork['number_of_reviews'] + 1), y=newyork['price'])
plt.xlabel("Reviews")
plt.ylabel("Price")

## Room type Vs Availability

In [None]:
# Scatter of the 'availability_365' value of each listing against its room type.
plt.figure(figsize=(20, 8))
plt.title('Availability vs Room Type')
plt.scatter(x=newyork['availability_365'], y=newyork['room_type'])
plt.xlabel("Availability")
plt.ylabel("Room Type")

## Availability of Airbnb listings in each neighbourhood group

In [None]:
# Violin plot of 'availability_365' per neighbourhood group, using only listings
# priced below 1000.
sub_df = newyork.loc[newyork['price'] < 1000]
fig, ax = plt.subplots(figsize=(10, 8))
plot_2 = sb.violinplot(x='neighbourhood_group', y='availability_365', data=sub_df)

## Correlation

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=(20,10))
title = 'Correlation matrix of numerical variables'
# Select the numeric columns before calling corr(): the frame still holds text columns
# (for example 'Category'), and recent pandas versions raise when corr() meets
# non-numeric data instead of silently skipping it.
sb.heatmap(newyork.select_dtypes(include='number').corr(), square=True, cmap='RdYlGn')
plt.title(title)
# Switch matplotlib's interactive mode off.
plt.ioff()

## Regression

In [None]:
newyork.head()

In [None]:
# Remove, in place, the columns that are not carried into the regression.
newyork.drop(columns=['latitude', 'longitude', 'neighbourhood', 'number_of_reviews'], inplace=True)
# Examine the result of the change.
newyork.head(5)

In [None]:

#Encode the input Variables
def Encode(newyork):
    """Replace the categorical columns with integer codes, in place.

    The 'neighbourhood_group' and 'room_type' columns, when present, are
    overwritten with the integer codes produced by ``Series.factorize`` (codes
    are assigned in order of first appearance). All other columns are left
    untouched.

    Args:
        newyork: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    categorical = ('neighbourhood_group', 'room_type')
    # Resolve the target columns up front, in the frame's own column order.
    targets = [name for name in newyork.columns if name in categorical]
    for name in targets:
        codes, _ = newyork[name].factorize()
        newyork[name] = codes
    return newyork

newyork_encode = Encode(newyork.copy())

In [None]:
newyork_encode.head()

In [None]:

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split

# Features are selected by position (columns 0, 1, 3, 4 and 5); the target is 'price'.
# NOTE(review): the positional selection presumably skips the 'price' column itself —
# confirm against the column order of newyork_encode.
x = newyork_encode.iloc[:, [0, 1, 3, 4, 5]]
y = newyork_encode['price']

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.1, random_state=353)

# Fit a linear regression on the training rows and predict the held-out prices.
reg = LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)

# R^2 score of the predictions on the test set.
r2_score(y_test, y_pred)