# Precomputing similarity matrices and saving the trained model

In [91]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import joblib

In [92]:
# Load the listings table used for every feature below.
# An earlier revision read './NewYork/full_merged_translated.csv' instead
# (presumably the dataset before near-duplicate removal — TODO confirm).
listings = pd.read_csv('./full_removed_toosimilar.csv')

In [93]:
# Text features: TF-IDF over the listing descriptions (English stop words removed).
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
# A missing description (NaN) makes the vectorizer raise, so treat it as empty text.
tfidf_matrix = vectorizer.fit_transform(listings['description'].fillna(''))
# Pairwise cosine similarity between every pair of descriptions (one row/column per listing).
desc_cosine_sim = cosine_similarity(tfidf_matrix)

In [94]:
def convert_str_to_array(data, remove_newline=False):
    """Parse a bracketed, space-separated number string into a 1-D NumPy array.

    Args:
        data: Text such as ``"[0.1 0.2 0.3]"``.
        remove_newline: When true, newline characters are replaced by spaces
            before parsing.

    Returns:
        The parsed values as a float ``numpy.ndarray``.
    """
    text = data
    if remove_newline:
        text = text.replace('\n', ' ')
    # Drop any leading/trailing square brackets, then split on whitespace.
    return np.fromstring(text.strip('[]'), sep=' ')

In [95]:
# Parse the stringified image vectors into NumPy arrays (stored back in the column).
# The isinstance guard keeps the cell re-runnable: after the first run the column
# already holds arrays, and calling the string parser on them would raise.
listings['photo_vector'] = listings['photo_vector'].apply(
    lambda x: convert_str_to_array(x, remove_newline=True) if isinstance(x, str) else x
)
# Stack the per-listing vectors into one 2-D matrix and compare every pair of listings.
img_vectors = np.array(listings['photo_vector'].tolist())
img_cosine_sim = cosine_similarity(img_vectors)

In [96]:
# Parse the stringified polarity vectors into NumPy arrays (stored back in the column).
# The isinstance guard keeps the cell re-runnable once the column already holds arrays.
listings['polarity'] = listings['polarity'].apply(
    lambda x: convert_str_to_array(x) if isinstance(x, str) else x
)
# One row of polarity values per listing.
polarity = np.array(listings['polarity'].tolist())

In [97]:
# Scale price and coordinates to [0, 1]. Each feature group gets its own scaler so
# that both sets of fitted ranges remain available afterwards; re-fitting a single
# scaler would discard the price ranges as soon as the coordinates are fitted.
price_scaler = MinMaxScaler()
price_features = price_scaler.fit_transform(listings[['price']])

location_scaler = MinMaxScaler()
location_features = location_scaler.fit_transform(listings[['latitude', 'longitude']])

# Backward-compatible alias: `scaler` used to end up fitted on latitude/longitude.
scaler = location_scaler

In [98]:
# Build the final feature matrix by placing all feature groups side by side
# (column-wise): description similarities, image similarities, polarity values,
# scaled price, scaled latitude/longitude. Each similarity matrix contributes
# one column per listing.
feature_groups = (
    desc_cosine_sim,
    img_cosine_sim,
    polarity,
    price_features,
    location_features,
)
combined_features = np.concatenate(feature_groups, axis=1)

In [99]:
# Persist the stacked feature matrix as a compressed .npz archive,
# stored under the key 'combined_features'.
np.savez_compressed('combined_features.npz', combined_features=combined_features)

In [100]:
def train_knn_model(features, n_neighbors=11, metric='euclidean'):
    """Fit a nearest-neighbour index on the given feature matrix.

    Args:
        features: 2-D array-like with one row per listing.
        n_neighbors: Default number of neighbours returned by queries. The
            default of 11 corresponds to 10 recommendations plus the query
            listing itself.
        metric: Distance metric passed to ``NearestNeighbors``.

    Returns:
        The fitted ``NearestNeighbors`` instance.
    """
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    knn.fit(features)
    return knn

In [101]:
# Fit the nearest-neighbour model on the stacked feature matrix.
knn_model = train_knn_model(combined_features)

In [81]:
def get_knn_recommendations(knn_model, listing_index, listings, k=10, similarity_threshold=0.98):
    """Return up to ``k`` nearest listings, dropping near-duplicates of the query.

    Relies on the notebook-level ``combined_features``, ``desc_cosine_sim`` and
    ``img_cosine_sim`` arrays, all indexed by row position in ``listings``.

    Args:
        knn_model: Fitted ``NearestNeighbors`` model.
        listing_index: Row position (not the 'id' column) of the query listing.
        listings: DataFrame whose rows align with the fitted features.
        k: Maximum number of neighbours to consider.
        similarity_threshold: A neighbour is discarded when its description OR
            image cosine similarity to the query exceeds this value.

    Returns:
        DataFrame with columns id, name, description, picture_url, price. It may
        hold fewer than ``k`` rows once near-duplicates are filtered out.
    """
    # Ask for k + 1 neighbours explicitly (the query is normally among them) rather
    # than relying on the model's fitted default, capped at the fitted sample count.
    n_query = min(k + 1, knn_model.n_samples_fit_)
    _, indices = knn_model.kneighbors([combined_features[listing_index]], n_neighbors=n_query)

    # Remove the query by value, not by position: with exact duplicates (distance 0)
    # the query is not guaranteed to be the first hit.
    neighbors = np.asarray(indices[0])
    neighbors = neighbors[neighbors != listing_index][:k]

    recommendations = listings.iloc[neighbors].copy()
    recommendations['desc_similarity'] = desc_cosine_sim[listing_index, neighbors]
    recommendations['img_similarity'] = img_cosine_sim[listing_index, neighbors]

    too_similar = (
        (recommendations['desc_similarity'] > similarity_threshold)
        | (recommendations['img_similarity'] > similarity_threshold)
    )
    filtered_recommendations = recommendations[~too_similar]

    return filtered_recommendations[['id', 'name', 'description', 'picture_url', 'price']]


In [42]:
def get_knn_recommendations(knn_model, listing_index, listings, k=10, similarity_threshold=None):
    """Return up to ``k`` nearest listings for the listing at ``listing_index``.

    NOTE: this notebook defines ``get_knn_recommendations`` in two cells;
    whichever cell was executed last is the one in effect.

    Relies on the notebook-level ``combined_features`` array (and, when a
    threshold is given, on ``desc_cosine_sim`` / ``img_cosine_sim``), all indexed
    by row position in ``listings``.

    Args:
        knn_model: Fitted ``NearestNeighbors`` model.
        listing_index: Row position (not the 'id' column) of the query listing.
        listings: DataFrame whose rows align with the fitted features.
        k: Maximum number of neighbours to return.
        similarity_threshold: Optional. When given, neighbours whose description
            OR image cosine similarity to the query exceeds it are dropped. The
            default ``None`` performs no filtering.

    Returns:
        DataFrame with columns id, name, description, picture_url, price.
    """
    # Ask for k + 1 neighbours explicitly (the query is normally among them) rather
    # than relying on the model's fitted default, capped at the fitted sample count.
    n_query = min(k + 1, knn_model.n_samples_fit_)
    _, indices = knn_model.kneighbors([combined_features[listing_index]], n_neighbors=n_query)

    # Remove the query by value, not by position: with exact duplicates (distance 0)
    # the query is not guaranteed to be the first hit.
    neighbors = np.asarray(indices[0])
    neighbors = neighbors[neighbors != listing_index][:k]

    if similarity_threshold is not None:
        too_similar = (
            (desc_cosine_sim[listing_index, neighbors] > similarity_threshold)
            | (img_cosine_sim[listing_index, neighbors] > similarity_threshold)
        )
        neighbors = neighbors[~too_similar]

    recommendations = listings.iloc[neighbors][['id', 'name', 'description', 'picture_url', 'price']]
    return recommendations

In [21]:
# Show the (rows, columns) shape of the listings frame.
listings.shape

(21411, 55)

# Remove the most similar listings

In [29]:
def remove_similar_listings(listings, desc_cosine_sim, img_cosine_sim, similarity_threshold=0.98):
    """Greedily drop listings that near-duplicate an earlier, kept listing.

    Rows are visited in order. For each row that has not been dropped, every
    other row whose description similarity AND image similarity to it both
    exceed ``similarity_threshold`` is marked for removal. Rows that are already
    marked are skipped and do not remove anything themselves.

    Args:
        listings: DataFrame with one row per listing.
        desc_cosine_sim: Square description-similarity matrix aligned with ``listings``.
        img_cosine_sim: Square image-similarity matrix aligned with ``listings``.
        similarity_threshold: Strict lower bound that both similarities must exceed.

    Returns:
        A copy of the surviving rows with a fresh 0..n-1 index.
    """
    n_rows = len(listings)
    dropped = set()

    for row in range(n_rows):
        if row in dropped:
            continue
        both_high = (desc_cosine_sim[row] > similarity_threshold) & (
            img_cosine_sim[row] > similarity_threshold
        )
        # A row always matches itself, so exclude it from its own duplicates.
        dropped.update(match for match in np.where(both_high)[0] if match != row)

    survivors = [row for row in range(n_rows) if row not in dropped]
    return listings.iloc[survivors].reset_index(drop=True)


In [30]:
# Drop listings whose description AND image similarity to a kept listing both exceed 0.98.
removed_too_similar = remove_similar_listings(listings, desc_cosine_sim, img_cosine_sim, similarity_threshold=0.98)

In [31]:
# Show the (rows, columns) shape after near-duplicate removal.
removed_too_similar.shape

(20671, 55)

In [32]:
# Persist the de-duplicated listings (without the index column).
# NOTE: this writes to the same path that the load cell at the top of the notebook reads.
removed_too_similar.to_csv('./full_removed_toosimilar.csv', index=False)

In [89]:
# Query listing, given as a row position: it is passed as `listing_index`,
# which indexes rows positionally rather than matching the 'id' column.
id_listing = 20253 # 19853
recommended_listings = get_knn_recommendations(knn_model, id_listing, listings, k=10, similarity_threshold=0.98)

In [90]:
# Print each recommendation's name, price, description and picture URL,
# one field per line, each followed by a blank line.
DISPLAY_FIELDS = ('name', 'price', 'description', 'picture_url')
for _, row in recommended_listings.iterrows():
    for field in DISPLAY_FIELDS:
        print(f"{row[field]}\n")

Elegant 1 bedroom suite in midtown Manhattan

200.0

Furnished one bedroom with private bath and  fully equipped kitchen in Midtown. 3 blocks from Grand Central Station. Sleeps up to 4. All utilities, cable, wifi, local calls, complimentary weekly linen service, toiletries, in-room coffee and tea supplies and the use of our gym and business center included in price. Complimentary housekeeping service is provided. 24 Hour Reception and Concierge Desk. Doorman.Laundry room.Supermarket across street. Approx 9.5% NYC taxes to be paid directly to host.

https://a0.muscache.com/pictures/ee54ff7c-c7fb-4b57-8ea6-427aed24f1e5.jpg

Beautiful One bedroom in Midtown Manhattan

200.0

Furnished one bedroom with private bath and  fully equipped kitchen in Midtown. 3 blocks from Grand Central Station. Sleeps up to 4.  All utilities, cable, wifi, local calls, complimentary weekly linen service, toiletries, in-room coffee and tea supplies and the use of our gym and business center included in price.Com

In [102]:
# Serialize the fitted nearest-neighbour model to disk with joblib.
joblib.dump(knn_model, 'knn_model.pkl')

['knn_model.pkl']