# Airbnb Hotel Booking Analysis - NYC Dataset

This notebook analyzes the NYC Airbnb dataset and answers the research questions (Q1–Q9).

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, ttest_ind

In [None]:
# Read the Airbnb workbook into a DataFrame and preview its first rows.
file_path = '/content/1730285881-Airbnb_Open_Data.xlsx'
df = pd.read_excel(io=file_path)
df.head(n=5)

In [None]:
# Data Cleaning
# Normalise column labels to snake_case (e.g. "service fee" -> "service_fee")
# so every later cell can rely on a single naming scheme.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")


def _to_number(values):
    """Convert a column to numeric, tolerating currency formatting.

    Text such as "$1,200 " has "$", thousands separators and whitespace
    removed before conversion; anything still unparseable becomes NaN.
    Columns that are already numeric are returned unchanged.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    stripped = values.astype(str).str.replace(r"[$,\s]", "", regex=True)
    return pd.to_numeric(stripped, errors='coerce')


# Missing or unparseable prices and service fees are stored as 0, so later
# cells never see NaN in these columns. 0 is therefore a "missing"
# placeholder, not a real amount.
df['price'] = _to_number(df['price']).fillna(0)
if 'service_fee' in df.columns:
    df['service_fee'] = _to_number(df['service_fee']).fillna(0)
# Ratings are only coerced to numeric; missing ratings stay NaN.
if 'review_scores_rating' in df.columns:
    df['review_scores_rating'] = pd.to_numeric(df['review_scores_rating'], errors='coerce')
df.info()

### Q1: What are the different property types in the dataset?

In [None]:
# Distinct room types, in order of first appearance.
pd.unique(df['room_type'])

### Q2: Which neighborhood group has the highest number of listings?

In [None]:
# Listings per neighbourhood group; the largest group is the first row.
df['neighbourhood_group'].value_counts(sort=True, ascending=False)

### Q3: Which neighborhood group has the highest average prices for Airbnb listings?

In [None]:
# Average listing price per neighbourhood group, highest first.
# The cleaning step stores 0 for missing/unparseable prices; those placeholder
# rows are excluded so they do not drag the averages down.
priced_listings = df[df['price'] > 0]
priced_listings.groupby('neighbourhood_group')['price'].mean().sort_values(ascending=False)

### Q4: Is there a relationship between the construction year of property and price?

In [None]:
# Pearson correlation between construction year and price.
if 'construction_year' in df.columns:
    # Build aligned pairs: coerce the year to numeric (the cleaning step never
    # touches this column), treat the 0 price placeholder written by the
    # cleaning step as missing, and drop incomplete rows.
    year_price = pd.DataFrame({
        'construction_year': pd.to_numeric(df['construction_year'], errors='coerce'),
        'price': df['price'].where(df['price'] > 0),
    }).dropna()
    if len(year_price) < 2:
        # pearsonr needs at least two observations.
        print("Not enough complete rows to compute a correlation")
    else:
        corr, pval = pearsonr(year_price['construction_year'], year_price['price'])
        print("Correlation:", corr, "p-value:", pval)
else:
    print("No construction_year column in dataset")

### Q5: Who are the top 10 hosts by calculated host listings count?

In [None]:
# Top 10 hosts ranked by their calculated listings count.
if 'calculated_host_listings_count' in df.columns:
    # A host appears once per listing, so collapse to one value per host first.
    top_hosts = (
        df.groupby('host_id')['calculated_host_listings_count']
        .max()
        .sort_values(ascending=False)
        .head(10)
    )
else:
    top_hosts = None
    print("No calculated_host_listings_count column in dataset")
# Jupyter only echoes a top-level last expression; a bare expression inside
# the `if` body above would produce no output. None renders nothing.
top_hosts

### Q6: Are hosts with verified identities more likely to receive positive reviews?

In [None]:
# Welch t-test: do verified hosts receive different review ratings?
# Accept either rating column name. NOTE(review): 'review_rate_number' is
# assumed to be the normalised form of the "review rate number" column the
# Q8 heading refers to — confirm against the loaded workbook.
rating_col = next(
    (name for name in ('review_scores_rating', 'review_rate_number') if name in df.columns),
    None,
)
if 'host_identity_verified' in df.columns and rating_col is not None:
    # Normalise the flag so 't'/'f', booleans and spelled-out labels all match.
    # NOTE(review): 'verified'/'unconfirmed' are assumed label spellings — verify.
    status = df['host_identity_verified'].astype(str).str.strip().str.lower()
    ratings = pd.to_numeric(df[rating_col], errors='coerce')
    verified = ratings[status.isin(['t', 'true', 'verified'])].dropna()
    not_verified = ratings[status.isin(['f', 'false', 'unconfirmed'])].dropna()
    if len(verified) < 2 or len(not_verified) < 2:
        # An empty or single-element group would make the test return NaN.
        print("Not enough rated listings in both groups to run the test")
    else:
        tstat, pval = ttest_ind(verified, not_verified, equal_var=False)
        # The means show the direction of any difference; the test only shows
        # whether the difference is significant.
        print("Mean rating (verified):", verified.mean(), "| (not verified):", not_verified.mean())
        print("t-statistic:", tstat, "p-value:", pval)
else:
    print("Required columns not available")

### Q7: Is there a correlation between the price of a listing and its service fee?

In [None]:
# Pearson correlation between listing price and service fee.
if 'service_fee' in df.columns:
    # The cleaning step stores 0 for missing prices and fees. Keep only rows
    # where both values are real amounts so placeholder pairs do not distort
    # the correlation.
    # NOTE(review): this also drops genuine zero fees, if any exist — confirm.
    has_both = (df['price'] > 0) & (df['service_fee'] > 0)
    price_fee = df.loc[has_both, ['price', 'service_fee']]
    if len(price_fee) < 2:
        # pearsonr needs at least two observations.
        print("Not enough complete rows to compute a correlation")
    else:
        corr, pval = pearsonr(price_fee['price'], price_fee['service_fee'])
        print("Correlation:", corr, "p-value:", pval)
else:
    print("No service_fee column in dataset")

### Q8: What is the average review rate number for listings, and does it vary based on the neighborhood group and room type?

In [None]:
# Overall average rating, then the average per (neighbourhood group, room type).
# Accept either rating column name. NOTE(review): 'review_rate_number' is
# assumed to be the normalised form of the "review rate number" column named
# in this question — confirm against the loaded workbook.
rating_col = next(
    (name for name in ('review_scores_rating', 'review_rate_number') if name in df.columns),
    None,
)
if rating_col is not None:
    ratings = pd.to_numeric(df[rating_col], errors='coerce')
    # The question asks for the overall average as well as the breakdown.
    print("Overall average rating:", ratings.mean())
    avg_rating_by_group = ratings.groupby([df['neighbourhood_group'], df['room_type']]).mean()
else:
    avg_rating_by_group = None
    print("No review_scores_rating column in dataset")
# Jupyter only echoes a top-level last expression; a bare expression inside
# the `if` body above would produce no output. None renders nothing.
avg_rating_by_group

### Q9: Are hosts with a higher calculated host listings count more likely to maintain higher availability throughout the year?

In [None]:
# Pearson correlation between a host's listings count and yearly availability.
if 'availability_365' in df.columns and 'calculated_host_listings_count' in df.columns:
    # Neither column is handled by the cleaning step, so coerce both to numeric
    # and drop rows where either value is missing; pearsonr cannot work with NaN.
    host_activity = (
        df[['calculated_host_listings_count', 'availability_365']]
        .apply(pd.to_numeric, errors='coerce')
        .dropna()
    )
    if len(host_activity) < 2:
        # pearsonr needs at least two observations.
        print("Not enough complete rows to compute a correlation")
    else:
        corr, pval = pearsonr(
            host_activity['calculated_host_listings_count'],
            host_activity['availability_365'],
        )
        print("Correlation:", corr, "p-value:", pval)
else:
    print("Required columns not available")