In [227]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [228]:
# Load the raw listings table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('AB_NYC_2019.csv')

In [229]:
# Keep only fully populated rows and renumber them 0..n-1 so that later
# column-wise joins on the index line up.
# NOTE(review): dropna() without `subset` discards a row when ANY column is
# NaN, including 'last_review' and 'reviews_per_month', which are dropped
# further down and never used as features -- confirm this row loss is intended.
df = df.dropna().reset_index(drop=True)

In [230]:
# Names of the columns that are cast to int in the following step
int_columns = [
    'host_id',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'availability_365',
]

In [231]:
# Cast every column listed in int_columns to int with a single astype call
df = df.astype({col: int for col in int_columns})

In [232]:
# Bag-of-Words over the listing names: English stop words are removed and the
# vocabulary is capped at 600 terms (max_features=600), not 50.
vectorizer = CountVectorizer(stop_words='english', max_features=600)
bow_matrix = vectorizer.fit_transform(df['name'])

In [233]:
# Display the sparse document-term matrix summary (dtype, stored elements, shape)
bow_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 168746 stored elements and shape (38821, 600)>

In [234]:
# Vocabulary terms, in the same order as the columns of bow_matrix
vocab = vectorizer.get_feature_names_out()
# Column sums give the total number of occurrences of each term; the
# 1 x n_terms matrix result is flattened to a 1-D array
word_counts = bow_matrix.sum(axis=0).getA1()

In [235]:
# Pair each term with its total count, ordered from rarest to most frequent
word_freq_df = (
    pd.DataFrame({'word': vocab, 'count': word_counts})
    .sort_values(by='count', ascending=True)
)

In [236]:
# Display the term/count table (sorted ascending by count)
word_freq_df

Unnamed: 0,word,count
111,bth,23
462,row,24
157,convenience,24
444,rest,24
443,residence,24
...,...,...
166,cozy,4262
49,apartment,5376
425,private,6178
83,bedroom,6444


In [237]:
# Get the vocabulary learned by the vectorizer.
# NOTE: despite the variable name, this is every term the vectorizer kept
# (max_features=600 above), not 50, and it is not ranked by frequency -- the
# displayed array is in alphabetical order.
top_50_words = vectorizer.get_feature_names_out()

In [238]:
# Wrap the vocabulary array in a single-column DataFrame named 'word'
top_50_df = pd.DataFrame({'word': top_50_words})

In [239]:
# Save the single-column vocabulary DataFrame as a CSV file, without the index.
# NOTE: the file holds the whole vocabulary, not only 50 words.
top_50_df.to_csv('top_50_words.csv', index=False)

In [240]:
# Display the vocabulary array
top_50_words

array(['10', '10min', '12', '15', '15min', '15mins', '1b', '1ba', '1bd',
       '1bdr', '1bdrm', '1bedroom', '1br', '1st', '20', '20min', '20mins',
       '25', '2b', '2ba', '2bath', '2bd', '2bdrm', '2bed', '2bedroom',
       '2br', '2nd', '30', '3bd', '3br', '3rd', '45', '4br', '5min',
       '5th', 'abode', 'ac', 'access', 'adorable', 'affordable', 'air',
       'airbnb', 'airport', 'airports', 'airy', 'alcove', 'amazing',
       'amenities', 'apart', 'apartment', 'apple', 'apt', 'area', 'art',
       'artist', 'artistic', 'artists', 'artsy', 'astoria', 'authentic',
       'available', 'ave', 'avenue', 'away', 'awesome', 'backyard',
       'balcony', 'barclays', 'basement', 'bath', 'bathroom', 'baths',
       'bay', 'bd', 'bdr', 'bdrm', 'beach', 'beautiful', 'beautifully',
       'beauty', 'bed', 'bedford', 'bedrm', 'bedroom', 'bedrooms', 'beds',
       'bedstuy', 'best', 'big', 'bk', 'bklyn', 'bldg', 'block', 'blocks',
       'blue', 'blueground', 'boerum', 'bohemian', 'boho', 'bout

In [241]:
# Display the sparse matrix summary again to check its shape before densifying
bow_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 168746 stored elements and shape (38821, 600)>

In [242]:
# Create a dense DataFrame with one column per vocabulary term
# (600 columns per the matrix shape, not 50).
# NOTE(review): .toarray() materialises the whole matrix as dense int64, which
# accounts for most of processed_df's memory footprint -- consider a smaller
# dtype or a sparse frame if memory becomes a concern.
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=top_50_words)

In [243]:
# Concatenate the original dataset with the Bag-of-Words features column-wise.
# Both frames carry a fresh 0..n-1 index (df is reset here, bow_df was built
# with the default index), so axis=1 aligns the rows one-to-one.
processed_df = pd.concat([df.reset_index(drop=True), bow_df], axis=1)

In [244]:
# The free-text 'name' column is now represented by the Bag-of-Words columns,
# so remove it from the combined frame
processed_df = processed_df.drop(columns=['name'])

In [245]:
# Rank the terms from most to least frequent. head(600) keeps at most 600
# rows, i.e. the whole 600-term vocabulary here rather than a top 50.
# NOTE: this rebinds top_50_df, which previously held the word-only frame.
ranked_words = word_freq_df.sort_values(by='count', ascending=False)
top_50_df = ranked_words.head(600)

# Save the ranked terms and their frequencies as a CSV file
top_50_df.to_csv('top_50_words_with_frequencies.csv', index=False)

In [246]:
# Display the first rows of the combined dataset (original columns followed by
# one count column per vocabulary term)
processed_df.head()

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,...,women,wonderful,woodside,world,wyndham,xl,yankee,yard,york,zen
0,2539,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,...,0,0,0,0,0,0,0,0,0,0
1,2595,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,...,0,0,0,0,0,0,0,0,0,0
2,3831,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,...,0,0,0,0,0,0,0,0,0,0
3,5022,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,...,0,0,0,0,0,0,0,0,0,0
4,5099,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,...,0,0,0,0,0,0,0,0,0,0


In [247]:
# Total memory footprint of processed_df in bytes; deep=True also measures the
# contents of object (string) columns
processed_df.memory_usage(deep=True).sum()


np.int64(200893891)

In [248]:
# (rows, columns) of the combined frame before the unused columns are dropped
processed_df.shape

(38821, 615)

In [249]:
# Columns that are removed before the feature matrix is built
columns_to_drop = [
    'host_name', 'host_id', 'id', 'minimum_nights', 'last_review',
    'reviews_per_month', 'calculated_host_listings_count', 'availability_365',
]
processed_df = processed_df.drop(columns=columns_to_drop)

# Display the updated DataFrame
processed_df.head()


Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,10,10min,12,...,women,wonderful,woodside,world,wyndham,xl,yankee,yard,york,zen
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,45,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,270,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,74,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [250]:
# (rows, columns) after dropping the unused columns
processed_df.shape

(38821, 607)

In [251]:
# from sklearn.cluster import KMeans

# # Use only latitude and longitude to create geographical clusters
# geo_data = processed_df[['latitude', 'longitude']].values

# # Fit a k-means clustering model (e.g., 10 clusters)
# kmeans = KMeans(n_clusters=10, random_state=42)
# processed_df['geo_cluster'] = kmeans.fit_predict(geo_data)

# # Now, use 'geo_cluster' as an additional feature for similarity computation


In [252]:
# processed_df['geo_cluster'].unique()

In [253]:
# from sklearn.preprocessing import StandardScaler

# # Normalize the 'price' column
# scaler = StandardScaler()
# processed_df['price_scaled'] = scaler.fit_transform(processed_df[['price']])


In [254]:
# processed_df['price_scaled'].unique()

In [281]:
import numpy as np
import pandas as pd
import faiss

# 1. Prepare the Data
# Keep only the numeric columns. NOTE: select_dtypes drops every non-numeric
# column, so the string columns that were never encoded ('neighbourhood_group',
# 'neighbourhood', 'room_type') do not contribute to the similarity.
data = processed_df.select_dtypes(include=[np.number]).values

# L2-normalise each row so that the inner product equals cosine similarity.
# Rows with zero norm are kept as all-zero vectors instead of turning into NaN.
# NOTE(review): raw 'price' and 'number_of_reviews' are on a far larger scale
# than latitude/longitude and the word counts, so they dominate the cosine
# score -- consider scaling the features before normalising.
row_norms = np.linalg.norm(data, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
# FAISS works on float32, C-contiguous matrices; convert explicitly instead of
# relying on the Python wrapper to do it.
data_normalized = np.ascontiguousarray(data / row_norms, dtype='float32')

# 2. Create and populate the FAISS index
d = data_normalized.shape[1]  # Dimensionality of the data
index = faiss.IndexFlatIP(d)  # Inner Product (cosine similarity on unit vectors)
index.add(data_normalized)    # Add data to the index

# Save the model
faiss.write_index(index, 'airbnb_recommender.index')

# 3. Query the Model
def recommend(new_data, k=3):
    """
    Given a new listing, recommend k similar listings.

    The query is L2-normalised here, so callers may pass either raw or
    already-normalised feature vectors. The search runs against the
    module-level FAISS ``index``.

    Args:
        new_data (array-like): Feature vector(s) for the new listing, either
            1-D with shape (d,) or 2-D with shape (n_queries, d).
        k (int): Number of recommendations.

    Returns:
        indices (numpy array): Row positions of the top k similar listings,
            one row per query.
        distances (numpy array): Inner-product (cosine) scores matching
            ``indices``.
    """
    # Accept a single 1-D vector as well as a batch of row vectors
    queries = np.atleast_2d(np.asarray(new_data, dtype='float64'))

    # Normalize the new data; a zero vector stays zero instead of becoming NaN
    norms = np.linalg.norm(queries, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    # FAISS expects float32, C-contiguous input
    queries = np.ascontiguousarray(queries / norms, dtype='float32')

    # Perform a search
    distances, indices = index.search(queries, k)
    return indices, distances

# Example: recommend for the first listing in the dataset.
# The row is already unit length; recommend() normalises it again, which
# leaves it effectively unchanged.
new_listing = data_normalized[0].reshape(1, -1)
recommended_indices, recommended_distances = recommend(new_listing)

# Output the recommendations (the query row itself is expected as the top hit)
print("Recommended Indices:", recommended_indices[0])
print("Recommended Distances:", recommended_distances[0])

# No second faiss.write_index here: the index is persisted right after it is
# built above, and searching does not modify it.


Recommended Indices: [    0  3730 24384]
Recommended Distances: [0.99999994 0.9999118  0.999911  ]


In [274]:
# Shape of the example query: one row with one value per indexed feature
new_listing.shape

(1, 604)

In [275]:
# Read the stored query vector; header=None means the first line is data,
# not column names
query_frame = pd.read_csv('new_listing.csv', header=None)

# Convert the DataFrame to a 2-D numpy array (rows x columns); no reshape is
# applied, so the shape comes straight from the file layout
loaded_new_listing = query_frame.to_numpy()

# Verify the shape
print(loaded_new_listing.shape)  # Should be (1, 604)



(1, 604)


In [276]:
# Display the loaded query vector
loaded_new_listing

array([[ 0.13932279, -0.25283771,  0.44449031,  0.84795075,  0.        ,
         0.        ,  0.00341916,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.00341916,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.00341916,
         0.        ,  0.        ,  0.        ,  0. 

In [277]:
# Pass the loaded vector to the recommendation function. recommend()
# L2-normalises its input, so the stored vector does not have to be unit length.
recommended_indices, recommended_distances = recommend(loaded_new_listing)

# Output the recommendations for the single query (row 0 of each result)
print("Recommended Indices:", recommended_indices[0])
print("Recommended Distances:", recommended_distances[0])

Recommended Indices: [  41 2919  637]
Recommended Distances: [0.9999299 0.9998989 0.9998839]


In [260]:
# Shape of the un-normalised numeric feature matrix
data.shape

(38821, 604)

In [261]:
# Shape of the full frame, including the non-numeric columns left out of `data`
processed_df.shape

(38821, 607)

In [262]:
# Dimensionality of the indexed vectors (number of columns in data_normalized)
d

604

In [278]:
# Reload the persisted FAISS index from disk. This rebinds the global `index`
# that recommend() searches. Then report its size and dimensionality.
index = faiss.read_index('airbnb_recommender.index')
print(f"Loaded index has {index.ntotal} vectors with {index.d} dimensions")


Loaded index has 38821 vectors with 604 dimensions


In [279]:
# Report how many vectors the current `index` object holds.
# NOTE(review): the message says "before saving", but this cell sits after the
# cell that reloads the index from disk -- confirm the wording is still accurate.
print(f"Number of vectors before saving: {index.ntotal}")


Number of vectors before saving: 38821
