In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



: 

In [None]:
# Read the NYC Airbnb listings CSV into the working DataFrame
DATA_PATH = 'data/NYC_air_bnb_dataset.csv'
data = pd.read_csv(DATA_PATH)

: 

In [None]:
# Print a structural overview of the DataFrame (columns, non-null counts,
# dtypes, memory usage) to spot missing data and mis-typed columns early.
data.info()

: 

In [None]:
# Preview the first rows (pandas' default for head() is 5) to sanity-check
# that the file was parsed as expected.
data.head()

: 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() selects by default.
data.describe()

: 

In [None]:
# Count the missing entries in every column
data.isna().sum()

: 

In [None]:
# Data visualization
# Distribution of room types: one bar per room type, height = number of listings.
fig, ax = plt.subplots(figsize=(10, 6))
# Pass the frame and the column by keyword. In recent seaborn releases the
# only positional parameter of countplot is `data`, so handing it the Series
# positionally no longer means "use this as x" and produces a wrong plot or
# an error depending on the version.
sns.countplot(data=data, x='room_type', ax=ax)
ax.set_title('Distribution of Room Types')
plt.show()

: 

In [None]:
# Histogram of listing prices (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['price'], bins=30, kde=True, ax=ax)
ax.set_title('Price Distribution')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
plt.show()

: 

In [None]:
# Correlation matrix
# Restrict to numeric columns before correlating: the frame also holds text
# columns (e.g. 'name', 'room_type'), and pandas >= 2.0 raises on those
# instead of silently dropping them. select_dtypes works on every pandas
# version and reproduces the old "numeric only" behaviour.
numeric_columns = data.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

: 

In [None]:
# Mean listing price per neighbourhood group, most expensive first
price_by_group = data.groupby('neighbourhood_group')['price']
average_price_by_neighborhood = price_by_group.mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=average_price_by_neighborhood.index,
    y=average_price_by_neighborhood.values,
    ax=ax,
)
ax.set_title('Average Price by Neighborhood Group')
# Tilt the group names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

: 

In [None]:
# Scatter of each listing's review count against its price
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='number_of_reviews', y='price', ax=ax)
ax.set_title('Number of Reviews vs. Price')
ax.set_xlabel('Number of Reviews')
ax.set_ylabel('Price')
plt.show()

: 

In [None]:
# Scatter of yearly availability (availability_365) against price
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='availability_365', y='price', ax=ax)
ax.set_title('Availability vs. Price')
ax.set_xlabel('Availability in 365 Days')
ax.set_ylabel('Price')
plt.show()

: 

In [None]:
# Pairwise scatter/histogram grid over a hand-picked set of numeric columns
pairplot_columns = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'availability_365',
]
sns.pairplot(data[pairplot_columns])
plt.show()

: 

In [None]:
# Box plots of price for each room type, split by neighbourhood group
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=data,
    x='room_type',
    y='price',
    hue='neighbourhood_group',
    ax=ax,
)
ax.set_title('Price by Room Type and Neighborhood')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

: 

In [None]:
# Scatter of the minimum-nights requirement against price
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='minimum_nights', y='price', ax=ax)
ax.set_title('Minimum Nights vs. Price')
ax.set_xlabel('Minimum Nights')
ax.set_ylabel('Price')
plt.show()

: 

In [None]:
# Number of Listings by Host
# 'calculated_host_listings_count' is repeated on every listing row of a
# host, so counting values over all rows would count a host with N listings
# N times. Keep one row per host first so each bar is a number of hosts.
one_row_per_host = data.drop_duplicates(subset='host_id')
host_listings_count = one_row_per_host['calculated_host_listings_count'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=host_listings_count.index, y=host_listings_count.values, ax=ax)
ax.set_title('Number of Listings per Host')
ax.set_xlabel('Number of Listings')
ax.set_ylabel('Count')
plt.show()

: 

In [None]:
# Scatter of review frequency (reviews_per_month) against price
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='reviews_per_month', y='price', ax=ax)
ax.set_title('Reviews per Month vs. Price')
ax.set_xlabel('Reviews per Month')
ax.set_ylabel('Price')
plt.show()

: 

In [None]:

# Scatter of yearly availability against the total number of reviews
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='availability_365', y='number_of_reviews', ax=ax)
ax.set_title('Availability vs. Number of Reviews')
ax.set_xlabel('Availability in 365 Days')
ax.set_ylabel('Number of Reviews')
plt.show()


: 

In [None]:
# Listings per host_id, then keep only the hosts that appear more than once
host_counts = data['host_id'].value_counts()
multiple_listings_hosts = host_counts.loc[host_counts.gt(1)]

# Subset of listings that belong to those multi-listing hosts
belongs_to_multi_host = data['host_id'].isin(multiple_listings_hosts.index)
multi_listings_data = data.loc[belongs_to_multi_host]

fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=multi_listings_data,
    x='calculated_host_listings_count',
    y='price',
    ax=ax,
)
ax.set_title('Number of Listings by Host vs. Price')
ax.set_xlabel('Number of Listings by Host')
ax.set_ylabel('Price')
plt.show()


: 

In [None]:
# Review totals per month, bucketed by each listing's last-review date.
# NOTE(review): this sums every listing's lifetime 'number_of_reviews' into
# the month of its *last* review, so it is not a count of reviews written in
# that month -- confirm this is the intended metric.

# Build a separate date-indexed Series instead of re-indexing `data` in place.
# Mutating `data` made this cell fail with a KeyError when re-run and left
# every later cell with a non-unique date index.
reviews_by_date = (
    data.assign(last_review=pd.to_datetime(data['last_review']))
        .dropna(subset=['last_review'])  # listings without a parsed last-review date cannot be bucketed
        .set_index('last_review')['number_of_reviews']
)

# Resample the data to monthly frequency. An offset object is used rather
# than the 'M' string alias, which newer pandas releases deprecate.
monthly_reviews = reviews_by_date.resample(pd.offsets.MonthEnd()).sum()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=monthly_reviews.index, y=monthly_reviews.values, ax=ax)
ax.set_title('Monthly Reviews Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Reviews')
plt.show()


: 

In [None]:
from textblob import TextBlob


def _name_polarity(listing_name):
    """Return TextBlob's sentiment polarity for a listing name (coerced to str)."""
    return TextBlob(str(listing_name)).sentiment.polarity


# Score every listing title and store the result alongside the listing
data['name_sentiment'] = data['name'].apply(_name_polarity)

# Histogram of the polarity scores with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['name_sentiment'], bins=30, kde=True, ax=ax)
ax.set_title('Sentiment Analysis of Listing Names')
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Count')
plt.show()


: 

In [None]:
import folium
from folium.plugins import MarkerCluster

# Create a map centered around New York City
ny_map = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

# Group the markers into clusters: adding one standalone marker per listing
# makes the rendered map extremely heavy once the dataset has more than a
# few thousand rows.
marker_cluster = MarkerCluster().add_to(ny_map)

# Skip listings without coordinates.
# NOTE(review): folium is expected to reject NaN locations -- confirm.
located_listings = data.dropna(subset=['latitude', 'longitude'])
marker_fields = located_listings[['latitude', 'longitude', 'price', 'room_type']]

# Add a marker for each listing. itertuples avoids building a Series per
# row, which is what makes iterrows slow on large frames.
for listing in marker_fields.itertuples(index=False):
    folium.Marker(
        location=[listing.latitude, listing.longitude],
        popup=f"Price: ${listing.price}, Type: {listing.room_type}",
    ).add_to(marker_cluster)

# Display the map
ny_map


: 

: 