In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Content-Based Filtering
def content_based_filtering(df, keywords):
    # Concatenate the selected columns into a single text column
    selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
    df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)
    
    # Initialize TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Fit and transform the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])
    
    # Transform user input into a document
    user_document = ' '.join(keywords)
    user_vector = tfidf_vectorizer.transform([user_document])
    
    # Calculate cosine similarity between user input and house data
    cos_sim = cosine_similarity(user_vector, tfidf_matrix)
    
    # Get indices of houses sorted by relevance
    sorted_indices = cos_sim.argsort()[0][::-1]
    
    # Return top 5 most relevant houses
    top_recommendations = []
    for i in range(5):
        house_index = sorted_indices[i]
        relevance_score = cos_sim[0][house_index]
        recommendation = {
            'Category': df.iloc[house_index]['attributes.category_name'],
            'Subject': df.iloc[house_index]['attributes.subject'],
            'Body': df.iloc[house_index]['attributes.body'],
            'Region': df.iloc[house_index]['attributes.region_name'],
            'Property Type': df.iloc[house_index]['attributes.property_type_name'],
            'Relevance Score': relevance_score
        }
        top_recommendations.append(recommendation)
    return top_recommendations

# Collaborative Filtering
def collaborative_filtering(data, state):
    # Create User-Item Matrix
    user_item_matrix = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    
    # Initialize and fit NMF model
    num_components = 20  # Number of latent factors
    model = NMF(n_components=num_components, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    
    # Calculate RMSE
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    
    # Return reconstructed matrix and RMSE
    reconstructed_matrix = pd.DataFrame(np.round(X_reconstructed, 1), index=user_item_matrix.index, columns=user_item_matrix.columns)
    return reconstructed_matrix, rmse

# Hybrid Filtering
def hybrid_filtering(df, data, keywords, state):
    # Content-Based Filtering
    content_based_recommendations = content_based_filtering(df, keywords)
    
    # Collaborative Filtering
    reconstructed_matrix, rmse = collaborative_filtering(data, state)
    
    # Post-processing (e.g., combine recommendations, filter duplicates)
    # For simplicity, we'll just return both sets of recommendations
    return content_based_recommendations, reconstructed_matrix, rmse

# Load data
file_path_house = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
file_path_response = r"C:\Users\User\Desktop\response.csv"
df_house = pd.read_csv(file_path_house)
df_response = pd.read_csv(file_path_response)

# Get user input for keywords and state
keywords = input("Enter keywords to search for house: ").split()
state = input("Enter the state (Johor, Melaka, or Negeri Sembilan): ")

# Perform hybrid filtering
content_based_recommendations, reconstructed_matrix, rmse = hybrid_filtering(df_house, df_response, keywords, state)

# Display recommendations
print("\nContent-Based Recommendations:")
for i, recommendation in enumerate(content_based_recommendations, start=1):
    print(f"{i}. {recommendation}")

print("\nReconstructed Matrix:")
print(reconstructed_matrix)

print("\nRoot Mean Square Error (RMSE):", rmse)


Enter keywords to search for house: apartment
Enter the state (Johor, Melaka, or Negeri Sembilan): melaka

Content-Based Recommendations:
1. {'Category': 'apartment condominium', 'Subject': 'apartment sell', 'Body': 'selling furnished renovation price nego', 'Region': 'negeri sembilan', 'Property Type': 'apartment', 'Relevance Score': 0.7361519199382442}
2. {'Category': 'apartment condominium', 'Subject': 'apartment sale', 'Body': 'apartment sale cempaka court 3rd floor', 'Region': 'negeri sembilan', 'Property Type': 'apartment', 'Relevance Score': 0.7047959428563069}
3. {'Category': 'apartment condominium', 'Subject': 'ara ria apartment', 'Body': 'negotiable', 'Region': 'negeri sembilan', 'Property Type': 'apartment', 'Relevance Score': 0.6513482558543854}
4. {'Category': 'apartment condominium', 'Subject': 'apartment dahlia seremban', 'Body': 'apartment dahlia q sale 3rd floor non bumi lot bedroom ,1 bathroom ,1 toilet fully tile', 'Region': 'negeri sembilan', 'Property Type': 'apart

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Concatenate the selected columns into a single text column
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and trail data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of trails sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with RMSE
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse_melaka:.6f}")


Enter keywords to search for house: condo
Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subje

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Calculate RMSE for Johor
rmse_johor = np.sqrt(np.mean((X_johor - X_reconstructed_johor)**2))

# Concatenate the selected columns into a single text column for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and trail data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of trails sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE Johor: {rmse_johor:.6f}, RMSE Melaka: {rmse_melaka:.6f}, RMSE Negeri Sembilan: {rmse_negeri_sembilan:.6f}")


Enter keywords to search for house: 3 bedrooms
Combined Recommendations:
1. Category: house, Subject: taman desa bertam double storey sale, Body: taman desa bertam double storey sale bedrooms/bathrooms bed bath size x sqft ownership non bumi lot renovation detail basic unit renovation selling price rm 540k, Region: melaka, Property Type: 2-storey terraced house (Relevance Score: 0.24) RMSE Johor: 0.000080, RMSE Melaka: 0.000043, RMSE Negeri Sembilan: 0.000086
2. Category: house, Subject: 22x70sf double storey house sale tun aminah skudai full loan, Body: taman ungku tun aminah sale double storey bedrooms,3 bathroom land size 22x70sqft 1540sqft freehold non- bumi lot  call detail viewing appointment  alan lin , Region: johor, Property Type: 2-storey terraced house (Relevance Score: 0.21) RMSE Johor: 0.000080, RMSE Melaka: 0.000043, RMSE Negeri Sembilan: 0.000086
3. Category: house, Subject: seri alam jalan lembah 2.5 storey house sale, Body: sale jalan lembah seri alam 2.5 storey terrac

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and trail data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of trails sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")


Enter keywords to search for house: 3 bedrooms
Combined Recommendations:
1. Category: house, Subject: taman desa bertam double storey sale, Body: taman desa bertam double storey sale bedrooms/bathrooms bed bath size x sqft ownership non bumi lot renovation detail basic unit renovation selling price rm 540k, Region: melaka, Property Type: 2-storey terraced house (Relevance Score: 0.24) RMSE: 0.000043
2. Category: house, Subject: 22x70sf double storey house sale tun aminah skudai full loan, Body: taman ungku tun aminah sale double storey bedrooms,3 bathroom land size 22x70sqft 1540sqft freehold non- bumi lot  call detail viewing appointment  alan lin , Region: johor, Property Type: 2-storey terraced house (Relevance Score: 0.21) RMSE: 0.000080
3. Category: house, Subject: seri alam jalan lembah 2.5 storey house sale, Body: sale jalan lembah seri alam 2.5 storey terrace house land area 1,870 sqft bedrooms,4 bathroom freehold international lot gated guarded maintenance fee rm selling price

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]


# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")

Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  10000
Enter keywords to search for house:  terrace


Combined Recommendations:
1. Category: house, Subject: bukit baru storey terrace, Body: freehold, Region: melaka, Property Type: 2-storey terraced house (Relevance Score: 0.31) RMSE: 0.000043


NameError: name 'rmse_johor' is not defined

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]


# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")



Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  1000
Enter keywords to search for house:  terrace


Combined Recommendations:
1. Category: house, Subject: bukit baru storey terrace, Body: freehold, Region: melaka, Property Type: 2-storey terraced house (Relevance Score: 0.31) RMSE: 0.000043


NameError: name 'rmse_johor' is not defined

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Initialize default RMSE values
rmse_johor = 0.0
rmse_melaka = 0.0
rmse_negeri_sembilan = 0.0

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")



Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  10000
Enter keywords to search for house:  terrace


Combined Recommendations:
1. Category: house, Subject: bukit baru storey terrace, Body: freehold, Region: melaka, Property Type: 2-storey terraced house (Relevance Score: 0.31) RMSE: 0.000000
2. Category: house, Subject: kota masai single storey terrace terrace full loan, Body: single storey terrace house jalan bacang kota masai intermediate lot free hold 20x70 non bumi facing road bedroom bathroom selling price rm330k  alvin ng, Region: johor, Property Type: 1-storey terraced house (Relevance Score: 0.29) RMSE: 0.000000
3. Category: house, Subject: double story terrace freehold non bumi lot, Body: double storey terrace house beliza tiara sendayan property detail double storey terrace intermediate 2071sqft bedroom bathroom non bumi lot freehold brand new unit full loan asking price 410k, Region: negeri sembilan, Property Type: 2-storey terraced house (Relevance Score: 0.28) RMSE: 0.000000
4. Category: house, Subject: bukit beruang freehold storey terrace taman kerjasama, Body: bukit be

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")

Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Negeri Sembilan
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  1000
Enter keywords to search for house:  condo


Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subject: room condo sale, Body: 1room condo sal

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")


Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  10000
Enter keywords to search for house:  condo


Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subject: room condo sale, Body: 1room condo sal

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Calculate RMSE for Johor
rmse_johor = np.sqrt(np.mean((X_johor - X_reconstructed_johor)**2))

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")


Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  10000
Enter keywords to search for house:  condo


Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subject: room condo sale, Body: 1room condo sal

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Calculate RMSE for Johor
rmse_johor = np.sqrt(np.mean((X_johor - X_reconstructed_johor)**2))

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")


Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  1000
Enter keywords to search for house:  condo


Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subject: room condo sale, Body: 1room condo sal

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Calculate RMSE for Johor
rmse_johor = np.sqrt(np.mean((X_johor - X_reconstructed_johor)**2))

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ")
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")


Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  10000
Enter keywords to search for house:  condo


Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subject: room condo sale, Body: 1room condo sal

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Calculate RMSE for Johor
rmse_johor = np.sqrt(np.mean((X_johor - X_reconstructed_johor)**2))

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Prompt the user to input preferences
location = input("Enter your preferred location (Johor, Melaka, or Negeri Sembilan): ").lower()
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")



Enter your preferred location (Johor, Melaka, or Negeri Sembilan):  Johor
Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  1000
Enter keywords to search for house:  condo


Combined Recommendations:
1. Category: apartment condominium, Subject: cash back rm90k super value casa lago condo melaka raya, Body: rm208k introduction strategic location land area sqft bedroom bathroom leasehold non bumi lot nearby atlantis residence condo admiral residence condo wave residence condo ocean palm condo, Region: melaka, Property Type: condominium (Relevance Score: 0.64) RMSE: 0.000043
2. Category: apartment condominium, Subject: fully furnish room freehold condo bachang kampung lapan melaka, Body: best stay long term investment introduction strategic location land area sq.ft bedroom bathroom fully furnish non bumi lot freehold nearby atlantis residence condo wave residence condo kenanga residence condo admiral residence condo sri melaka residence condo sell lease contact z service derrickyeeproperty, Region: melaka, Property Type: service residence (Relevance Score: 0.62) RMSE: 0.000043
3. Category: apartment condominium, Subject: room condo sale, Body: 1room condo sal

In [49]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)


# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print user input keywords for debugging
print("User Input Keywords:", keywords)

# Extract region name from DataFrame
house_region = df.iloc[house_index]['attributes.region_name']

# Print region name extracted from DataFrame for debugging
print("Region Extracted from DataFrame:", house_region)

# Print DataFrame rows containing 'melaka'
print("Rows with 'negeri sembilan' region:")
print(df[df['attributes.region_name'] == 'negeri sembilan'])

# Print DataFrame rows containing 'melaka'
print("Rows with 'melaka' region:")
print(df[df['attributes.region_name'] == 'melaka'])

# Print DataFrame rows containing 'johor'
print("Rows with 'johor' region:")
print(df[df['attributes.region_name'] == 'johor'])

# Extract region name from DataFrame
house_region = df.iloc[house_index]['attributes.region_name']


# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    
    # Add the print statement to check the value of house_region
    print("House Region:", house_region)
    
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
        
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")


Unique Regions: ['johor' 'melaka' 'negeri sembilan']


Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  12312
Enter keywords to search for house:  flat negeri sembilan


User Input Keywords: ['flat', 'negeri', 'sembilan']
Region Extracted from DataFrame: johor
Rows with 'negeri sembilan' region:
      attributes.category_name  \
15656                    house   
15657                    house   
15658                    house   
15659                    house   
15660                    house   
...                        ...   
25248                    house   
25249                    house   
25250                    house   
25251                    house   
25252                    house   

                                      attributes.subject  \
15656  banglo tingkat fasa 1d sq ft taman juasseh sen...   
15657  taman juasseh sentosa banglo tingkat sq ft fas...   
15658  fasa 1d banglo tingkat sq ft taman juasseh sen...   
15659  banglo tingkat sq ft fasa 1d taman juasseh sen...   
15660        open facing| nice view bandar seri sendayan   
...                                                  ...   
25248                          fully renovat

In [54]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)


# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print user input keywords for debugging
print("User Input Keywords:", keywords)

# Initialize a variable to store the house region
house_region = None

# Iterate through the sorted indices to find the first house with a matching region
for index in sorted_indices:
    current_region = df.iloc[index]['attributes.region_name']
    if current_region.lower() in unique_regions:
        house_region = current_region
        break

# Check if a matching region was found
if house_region:
    print("Region Extracted from DataFrame:", house_region)
else:
    print("No matching region found for the selected houses.")

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    
    # Extract region name from DataFrame
    house_region = df.iloc[house_index]['attributes.region_name']
    
    # Add the print statement to check the value of house_region
    print("House Region:", house_region)
    
    if house_region.lower() == 'johor':
        rmse = rmse_johor
    elif house_region.lower() == 'melaka':
        rmse = rmse_melaka
    elif house_region.lower() == 'negeri sembilan':
        rmse = rmse_negeri_sembilan
        
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")






Unique Regions: ['johor' 'melaka' 'negeri sembilan']


Enter your preferred number of bedroom:  2
Enter your preferred price range(RM):  123123
Enter keywords to search for house:  flat melaka


User Input Keywords: ['flat', 'melaka']
Region Extracted from DataFrame: johor
Combined Recommendations:
House Region: johor
1. Category: apartment condominium, Subject: flat austin perdana, Body: high floor, Region: johor, Property Type: flat (Relevance Score: 0.59) RMSE: 0.000080
House Region: melaka
2. Category: apartment condominium, Subject: cheng ria flat sale, Body: near shop easy access highway good environment, Region: melaka, Property Type: flat (Relevance Score: 0.57) RMSE: 0.000043
House Region: johor
3. Category: apartment condominium, Subject: tun aminah low cost flat sale, Body: sale tun aminah flat blok flat low cost flat third floor bedroom bathroom 494sqft freehold basic renovated selling price rm150k, Region: johor, Property Type: flat (Relevance Score: 0.56) RMSE: 0.000080
House Region: johor
4. Category: apartment condominium, Subject: flat sale, Body: sale flat camar perling  level  r b  sq ft  non-bumi  freehold  balcony  fully tile  table top  cabinet harga rm16

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)


# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print user input keywords for debugging
print("User Input Keywords:", keywords)

# Initialize a list to store the house regions
house_regions = []

# Iterate through the sorted indices to find the region for each house
for index in sorted_indices:
    current_region = df.iloc[index]['attributes.region_name']
    if current_region.lower() in unique_regions:
        house_regions.append(current_region)
    else:
        house_regions.append(None)

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    
    # Extract region name from the list of house regions
    house_region = house_regions[i]
    
    if house_region:
        if house_region.lower() == 'johor':
            rmse = rmse_johor
        elif house_region.lower() == 'melaka':
            rmse = rmse_melaka
        elif house_region.lower() == 'negeri sembilan':
            rmse = rmse_negeri_sembilan
        
        print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f}) RMSE: {rmse:.6f}")
    else:
        print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: No matching region found, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f})")


KeyError: 'traffic\n.1'

In [69]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Create User-Item Matrix for Negeri Sembilan
user_item_matrix_negeri_sembilan = pd.pivot_table(data, values=['traffic\n.2', 'air_pollution\n.2', 'sound_pollution.2', 'crime_rate.2', 'overall_rating.2'], index='Username')

# Fill missing values with 0 for Negeri Sembilan
user_item_matrix_negeri_sembilan.fillna(0, inplace=True)

# Convert Negeri Sembilan user-item matrix to numpy array
X_negeri_sembilan = user_item_matrix_negeri_sembilan.values

# Initialize and fit NMF model for Negeri Sembilan
model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)
X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Calculate RMSE for Negeri Sembilan
rmse_negeri_sembilan = np.sqrt(np.mean((X_negeri_sembilan - X_reconstructed_negeri_sembilan)**2))

# Create User-Item Matrix for Johor
user_item_matrix_johor = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')

# Fill missing values with 0 for Johor
user_item_matrix_johor.fillna(0, inplace=True)

# Convert Johor user-item matrix to numpy array
X_johor = user_item_matrix_johor.values

# Initialize and fit NMF model for Johor
model_johor = NMF(n_components=num_components, init='random', random_state=20)
W_johor = model_johor.fit_transform(X_johor)  # Factorization of X_johor into W_johor and H_johor

# Reconstruction of X from the factorized matrices
X_reconstructed_johor = np.dot(W_johor, model_johor.components_)

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)

# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of properties sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Print user input keywords for debugging
print("User Input Keywords:", keywords)

# Print top 5 most relevant houses with combined results
print("Combined Recommendations:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    
    # Extract region name from DataFrame
    house_region = df.iloc[house_index]['attributes.region_name']
    
    # Check if the extracted region contains the user-input region
    if house_region.lower() in [region.lower() for region in unique_regions]:
        # Check which region the house belongs to and assign the corresponding RMSE
        if house_region.lower() == 'melaka':
            rmse = rmse_melaka
        elif house_region.lower() == 'negeri sembilan':
            rmse = rmse_negeri_sembilan
        elif house_region.lower() == 'johor':
            rmse = rmse_johor
        else:
            rmse = "N/A"  # If region not found, set RMSE as Not Available
        
        print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, RMSE: {rmse}")
    else:
        print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region} (Relevance Score: {relevance_score:.2f}")


Unique Regions: ['johor' 'melaka' 'negeri sembilan']


Enter your preferred number of bedroom:  
Enter your preferred price range(RM):  
Enter keywords to search for house:  bungalow johor


User Input Keywords: ['bungalow', 'johor']
Combined Recommendations:
1. Category: house, Subject: bungalow banglo taman desa molek a++, Body: fully renovated bungalow beside river nice, Region: melaka, RMSE: 4.3213730497077075e-05
2. Category: house, Subject: freehold bungalow sale, Body:  house sale  krubong perdana  single storey bungalow  freehold  land size sqft  60x100 sqft  bedroom bathroom  facing southwest  fully renovated furnished, Region: melaka, RMSE: 4.3213730497077075e-05
3. Category: house, Subject: 1.5 storey bungalow seremban south taman senawang perdana seremban, Body: direct owner, Region: negeri sembilan, RMSE: 8.572222462193857e-05
4. Category: house, Subject: taman suria bungalow town jb johor double storey larkin sale, Body: sale taman suria johor double storey bungalow bedroom bathroom land size x built sqft facing southwest international lot freehold unblock view unit ~selling price rm 1.3 mil ~kindly contact m kimi 010-8859619, Region: johor, RMSE: 7.992878224

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)

# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Get user input for keywords
keywords_input = input("Enter keywords to search for house: ")
keywords = keywords_input.split(',')

# Filter the dataset to include only properties from the specified region
if 'johor' in unique_regions and 'johor' in keywords_input.lower():
    df_region = df[df['attributes.region_name'].str.lower() == 'johor']
    X_region = X_melaka  # Using collaborative filtering data for Johor
elif 'melaka' in unique_regions and 'melaka' in keywords_input.lower():
    df_region = df[df['attributes.region_name'].str.lower() == 'melaka']
    X_region = X_melaka  # Using collaborative filtering data for Melaka
elif 'negeri sembilan' in unique_regions and 'negeri sembilan' in keywords_input.lower():
    df_region = df[df['attributes.region_name'].str.lower() == 'negeri sembilan']
    X_region = X_melaka  # Using collaborative filtering data for Negeri Sembilan
else:
    print("Region specified in the user input is not available in the dataset.")
    # You might want to handle this case differently based on your requirements
    exit()

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df_region['text'] = df_region[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix_region = tfidf_vectorizer.fit_transform(df_region['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data in the specified region
cos_sim_region = cosine_similarity(user_vector, tfidf_matrix_region)

# Get indices of properties in the specified region sorted by relevance
sorted_indices_region = cos_sim_region.argsort()[0][::-1]

# Print top 5 most relevant houses in the specified region
print("Content-Based Recommendations in the specified region:")
for i in range(5):
    house_index_region = sorted_indices_region[i]
    house_category_region = df_region.iloc[house_index_region]['attributes.category_name']
    house_subject_region = df_region.iloc[house_index_region]['attributes.subject']
    house_body_region = df_region.iloc[house_index_region]['attributes.body']
    relevance_score_region = cos_sim_region[0][house_index_region]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_region}, Subject: {house_subject_region}, Body: {house_body_region}, Relevance Score: {relevance_score_region:.2f}")

# Using collaborative filtering for hybrid recommendations
# Transform user input into the latent feature space
user_latent_representation = tfidf_vectorizer.transform([user_document])
user_latent_representation = model_melaka.transform(user_latent_representation)

# Calculate cosine similarity between user input and reconstructed properties data in the specified region
cos_sim_hybrid = cosine_similarity(user_latent_representation, X_reconstructed_region)

# Get indices of properties in the specified region sorted by relevance for hybrid filtering
sorted_indices_hybrid = cos_sim_hybrid.argsort()[0][::-1]

# Print top 5 most relevant houses in the specified region using hybrid filtering
print("Hybrid Recommendations in the specified region:")
for i in range(5):
    house_index_hybrid = sorted_indices_hybrid[i]
    house_category_hybrid = df_region.iloc[house_index_hybrid]['attributes.category_name']
    house_subject_hybrid = df_region.iloc[house_index_hybrid]['attributes.subject']
    house_body_hybrid = df_region.iloc[house_index_hybrid]['attributes.body']
    relevance_score_hybrid = cos_sim_hybrid[0][house_index_hybrid]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_hybrid}, Subject: {house_subject_hybrid}, Body: {house_body_hybrid}, Relevance Score: {relevance_score_hybrid:.2f}")


Unique Regions: ['johor' 'melaka' 'negeri sembilan']


Enter your preferred number of bedroom:  
Enter your preferred price range(RM):  
Enter keywords to search for house:  flat,negeri sembilan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_region['text'] = df_region[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)


Content-Based Recommendations in the specified region:
1. Category: apartment condominium, Subject: flat bilik untuk dijual, Body: flat seri rasah bilik untuk dijual 59k berdekatan uitm s3 contact, Relevance Score: 0.59
2. Category: apartment condominium, Subject: flat rasah jaya ground floor, Body: flat rasah jaya seremban sale ground floor hot selling basic unit 100k call info 017-2256630 arasi, Relevance Score: 0.49
3. Category: apartment condominium, Subject: flat nibong sikamat n9 murah untuk dijual, Body: wts flat termurah taman bukit nibong sikamat seremban tenure freehold bumi bilik bilik air tingkat saiz sqft asking price rm75k nego, Relevance Score: 0.47
4. Category: apartment condominium, Subject: flat murah tingkat cempaka seremban utk dijual, Body: 2nd floor flat sale seremban lokasi cempaka b jalan s2 g5 garden avenue seremban seremban n.9 harga rm95,000 information interested buying selling property kindly contact mr chew, Relevance Score: 0.47
5. Category: apartment con

ValueError: X has 22292 features, but NMF is expecting 5 features as input.

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv("response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Reconstruction of X from the factorized matrices
X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)

# Calculate RMSE for Melaka
rmse_melaka = np.sqrt(np.mean((X_melaka - X_reconstructed_melaka)**2))

# Load data for content-based filtering
file_path = "mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)

# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Get user input for keywords
keywords_input = input("Enter keywords to search for house: ")
keywords = keywords_input.split(',')

# Filter the dataset to include only properties from the specified region
df_region = df[df['attributes.region_name'].str.lower().isin(keywords)]
X_region = X_melaka  # Using collaborative filtering data for Melaka

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df_region['text'] = df_region[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix_region = tfidf_vectorizer.fit_transform(df_region['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data in the specified region
cos_sim_region = cosine_similarity(user_vector, tfidf_matrix_region)

# Get indices of properties in the specified region sorted by relevance
sorted_indices_region = cos_sim_region.argsort()[0][::-1]

# Print top 5 most relevant houses in the specified region
print("Content-Based Recommendations in the specified region:")
for i in range(5):
    house_index_region = sorted_indices_region[i]
    house_category_region = df_region.iloc[house_index_region]['attributes.category_name']
    house_subject_region = df_region.iloc[house_index_region]['attributes.subject']
    house_body_region = df_region.iloc[house_index_region]['attributes.body']
    relevance_score_region = cos_sim_region[0][house_index_region]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_region}, Subject: {house_subject_region}, Body: {house_body_region}, Relevance Score: {relevance_score_region:.2f}")

# Using collaborative filtering for hybrid recommendations
# Transform user input into the latent feature space using the TF-IDF vectorizer
user_latent_representation = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and reconstructed properties data in the specified region
cos_sim_hybrid = cosine_similarity(user_latent_representation, X_region)

# Get indices of properties in the specified region sorted by relevance for hybrid filtering
sorted_indices_hybrid = cos_sim_hybrid.argsort()[0][::-1]

# Print top 5 most relevant houses in the specified region using hybrid filtering
print("Hybrid Recommendations in the specified region:")
for i in range(5):
    house_index_hybrid = sorted_indices_hybrid[i]
    house_category_hybrid = df_region.iloc[house_index_hybrid]['attributes.category_name']
    house_subject_hybrid = df_region.iloc[house_index_hybrid]['attributes.subject']
    house_body_hybrid = df_region.iloc[house_index_hybrid]['attributes.body']
    relevance_score_hybrid = cos_sim_hybrid[0][house_index_hybrid]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_hybrid}, Subject: {house_subject_hybrid}, Body: {house_body_hybrid}, Relevance Score: {relevance_score_hybrid:.2f}")


Unique Regions: ['johor' 'melaka' 'negeri sembilan']


Enter your preferred number of bedroom:  
Enter your preferred price range(RM):  
Enter keywords to search for house:  johor,bungalow


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_region['text'] = df_region[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)


Content-Based Recommendations in the specified region:
1. Category: house, Subject: double storey bungalow taman universiti desa skudai, Body: bungalow desa skudai near taman universiti mutiara rini double storey bungalow freehold bumilot northwest land size 7,600 sqft 80x95 build 3,700 sqft room bathroom selling price rm 1.38m please contact jack, Relevance Score: 0.60
2. Category: house, Subject: bungalow taman daya sale non bumi lot, Body: sale bungalow non bumi lot sale bungalow land size sqft built size sqft renovation yes status leasehold non bumi lot asking price rm800k nego bank value match bedroom bathroom welcome call arrange viewing owner welcome list  kindly contact azuandy aradi 019-7133228, Relevance Score: 0.59
3. Category: house, Subject: taman suria bungalow town jb johor double storey larkin sale, Body: sale taman suria johor double storey bungalow bedroom bathroom land size x built sqft facing southwest international lot freehold unblock view unit ~selling price rm 1

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 19040 while Y.shape[1] == 5

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

# Load data for collaborative filtering
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Preprocess the data if necessary
# For example, handle missing values or clean the data

# Create User-Item Matrix for Melaka
user_item_matrix_melaka = pd.pivot_table(data, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')

# Fill missing values with 0 for Melaka
user_item_matrix_melaka.fillna(0, inplace=True)

# Convert Melaka user-item matrix to numpy array
X_melaka = user_item_matrix_melaka.values

# Initialize and fit NMF model for Melaka
num_components = 20  # Number of latent factors
model_melaka = NMF(n_components=num_components, init='random', random_state=20)
W_melaka = model_melaka.fit_transform(X_melaka)  # Factorization of X_melaka into W_melaka and H_melaka

# Load data for content-based filtering
file_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
df = pd.read_csv(file_path)

# Print unique regions for debugging
unique_regions = df['attributes.region_name'].unique()
print("Unique Regions:", unique_regions)

# Prompt the user to input preferences
bedroom_size = input("Enter your preferred number of bedroom: ")
price_range = input("Enter your preferred price range(RM): ")

# Get user input for keywords
keywords_input = input("Enter keywords to search for house: ")
keywords = keywords_input.split(',')

# Filter the dataset to include only properties from the specified region
# Filter the dataset to include only properties from the specified region
if 'johor' in unique_regions and 'johor' in keywords_input.lower():
    df_region = df[df['attributes.region_name'].str.lower() == 'johor']
    # Create User-Item Matrix for Johor
    user_item_matrix_johor = pd.pivot_table(df_region, values=['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')
    # Fill missing values with 0 for Johor
    user_item_matrix_johor.fillna(0, inplace=True)

elif 'melaka' in unique_regions and 'melaka' in keywords_input.lower():
    df_region = df[df['attributes.region_name'].str.lower() == 'melaka']
    X_region = X_melaka  # Using collaborative filtering data for Melaka
elif 'negeri sembilan' in unique_regions and 'negeri sembilan' in keywords_input.lower():
    df_region = df[df['attributes.region_name'].str.lower() == 'negeri sembilan']
    # Create User-Item Matrix for Negeri Sembilan
    user_item_matrix_negeri_sembilan = pd.pivot_table(df_region, values=['traffic\n.1', 'air_pollution\n.1', 'sound_pollution.1', 'crime_rate.1', 'overall_rating.1'], index='Username')
    # Fill missing values with 0 for Negeri Sembilan
    user_item_matrix_negeri_sembilan.fillna(0, inplace=True)
    # Convert Negeri Sembilan user-item matrix to numpy array
    X_negeri_sembilan = user_item_matrix_negeri_sembilan.values
    # Initialize and fit NMF model for Negeri Sembilan
    model_negeri_sembilan = NMF(n_components=num_components, init='random', random_state=20)
    W_negeri_sembilan = model_negeri_sembilan.fit_transform(X_negeri_sembilan)  # Factorization of X_negeri_sembilan into W_negeri_sembilan and H_negeri_sembilan
else:
    print("Region specified in the user input is not available in the dataset.")
    # You might want to handle this case differently based on your requirements
    exit()

# Concatenate the selected columns into a single text column for content-based filtering
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df_region['text'] = df_region[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize TfidfVectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix_region = tfidf_vectorizer.fit_transform(df_region['text'])

# Transform user input into a document
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and properties data in the specified region
cos_sim_region = cosine_similarity(user_vector, tfidf_matrix_region)

# Get indices of properties in the specified region sorted by relevance
sorted_indices_region = cos_sim_region.argsort()[0][::-1]

# Initialize variables for storing latent representations of properties in each region
X_reconstructed_johor = None
X_reconstructed_melaka = None
X_reconstructed_negeri_sembilan = None

# Reconstruct properties data in the specified region using NMF components
if 'johor' in unique_regions and 'johor' in keywords_input.lower():
    X_reconstructed_johor = np.dot(W_johor, model_johor.components_)
elif 'melaka' in unique_regions and 'melaka' in keywords_input.lower():
    X_reconstructed_melaka = np.dot(W_melaka, model_melaka.components_)
elif 'negeri sembilan' in unique_regions and 'negeri sembilan' in keywords_input.lower():
    X_reconstructed_negeri_sembilan = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_)

# Select the reconstructed properties data based on the specified region
X_reconstructed_region = X_reconstructed_johor if 'johor' in keywords_input.lower() else X_reconstructed_melaka if 'melaka' in keywords_input.lower() else X_reconstructed_negeri_sembilan

# Calculate cosine similarity between user input and reconstructed properties data in the specified region
cos_sim_hybrid = cosine_similarity(user_vector, X_reconstructed_region)

# Get indices of properties in the specified region sorted by relevance for hybrid filtering
sorted_indices_hybrid = cos_sim# Get indices of properties in the specified region sorted by relevance for hybrid filtering
sorted_indices_hybrid = cos_sim_hybrid.argsort()[0][::-1]

# Print top 5 most relevant houses in the specified region using hybrid filtering
print("Hybrid Recommendations in the specified region:")
for i in range(5):
    house_index_hybrid = sorted_indices_hybrid[i]
    house_category_hybrid = df_region.iloc[house_index_hybrid]['attributes.category_name']
    house_subject_hybrid = df_region.iloc[house_index_hybrid]['attributes.subject']
    house_body_hybrid = df_region.iloc[house_index_hybrid]['attributes.body']
    relevance_score_hybrid = cos_sim_hybrid[0][house_index_hybrid]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_hybrid}, Subject: {house_subject_hybrid}, Body: {house_body_hybrid}, Relevance Score: {relevance_score_hybrid:.2f}")

# Print top 5 most relevant houses in the specified region using content-based filtering
print("Content-Based Recommendations in the specified region:")
for i in range(5):
    house_index_region = sorted_indices_region[i]
    house_category_region = df_region.iloc[house_index_region]['attributes.category_name']
    house_subject_region = df_region.iloc[house_index_region]['attributes.subject']
    house_body_region = df_region.iloc[house_index_region]['attributes.body']
    relevance_score_region = cos_sim_region[0][house_index_region]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_region}, Subject: {house_subject_region}, Body: {house_body_region}, Relevance Score: {relevance_score_region:.2f}")

# Using collaborative filtering for hybrid recommendations
# Transform user input into the latent feature space using the TF-IDF vectorizer
user_latent_representation = tfidf_vectorizer.transform([user_document])

# Reconstruct properties data in the specified region using NMF components
X_reconstructed_region = np.dot(W_johor, model_johor.components_) if 'johor' in unique_regions and 'johor' in keywords_input.lower() else X_reconstructed_johor
X_reconstructed_region = np.dot(W_melaka, model_melaka.components_) if 'melaka' in unique_regions and 'melaka' in keywords_input.lower() else X_reconstructed_melaka
X_reconstructed_region = np.dot(W_negeri_sembilan, model_negeri_sembilan.components_) if 'negeri sembilan' in unique_regions and 'negeri sembilan' in keywords_input.lower() else X_reconstructed_negeri_sembilan

# Calculate cosine similarity between user input and reconstructed properties data in the specified region
cos_sim_hybrid = cosine_similarity(user_latent_representation, X_reconstructed_region)

# Get indices of properties in the specified region sorted by relevance for hybrid filtering
sorted_indices_hybrid = cos_sim_hybrid.argsort()[0][::-1]

# Print top 5 most relevant houses in the specified region using hybrid filtering
print("Hybrid Recommendations in the specified region:")
for i in range(5):
    house_index_hybrid = sorted_indices_hybrid[i]
    house_category_hybrid = df_region.iloc[house_index_hybrid]['attributes.category_name']
    house_subject_hybrid = df_region.iloc[house_index_hybrid]['attributes.subject']
    house_body_hybrid = df_region.iloc[house_index_hybrid]['attributes.body']
    relevance_score_hybrid = cos_sim_hybrid[0][house_index_hybrid]
    
    # Display the recommendation
    print(f"{i+1}. Category: {house_category_hybrid}, Subject: {house_subject_hybrid}, Body: {house_body_hybrid}, Relevance Score: {relevance_score_hybrid:.2f}")



Unique Regions: ['johor' 'melaka' 'negeri sembilan']


Enter your preferred number of bedroom:  
Enter your preferred price range(RM):  
Enter keywords to search for house:  johor,condo


KeyError: 'traffic'

In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read data from CSV file
file_path = r"C:\Users\User\Desktop\response.csv"
data = pd.read_csv(r"C:\Users\User\Desktop\response.csv")

# Display column names
print("Column Names:")
print(df.columns)

# Concatenate the selected columns into a single text column
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
df = pd.read_csv(file_path)
df['text'] = df[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)



# Collaborative Filtering
def collaborative_filtering(data):
    # Preprocess the data if necessary
    # Create User-Item Matrix
    user_item_matrix = pd.pivot_table(data, values=['traffic\n', 'air_pollution\n', 'sound_pollution', 'crime_rate', 'overall_rating'], index='Username')
    # Fill missing values with 0
    user_item_matrix.fillna(0, inplace=True)
    # Convert to numpy array
    X = user_item_matrix.values
    # Initialize and fit NMF model
    num_components = 20  # Number of latent factors
    model = NMF(n_components=num_components, init='random', random_state=20)
    W = model.fit_transform(X)
    # Reconstruction of X from the factorized matrices
    X_reconstructed = np.dot(W, model.components_)
    return X_reconstructed

# Content-Based Filtering
def content_based_filtering(df, user_input):
    # Initialize TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()
    # Fit and transform the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])
    # Transform user input into a document
    user_document = ' '.join(user_input)
    user_vector = tfidf_vectorizer.transform([user_document])
    # Calculate cosine similarity between user input and properties data
    cos_sim = cosine_similarity(user_vector, tfidf_matrix)
    return cos_sim

# Hybrid Filtering
def hybrid_filtering(data, df, user_input):
    # Collaborative Filtering
    X_reconstructed = collaborative_filtering(data)
    # Content-Based Filtering
    cos_sim = content_based_filtering(df, user_input)
    # Combine recommendations from both methods
    hybrid_scores = (X_reconstructed + cos_sim) / 2
    # Get indices of properties sorted by relevance
    sorted_indices = hybrid_scores.argsort()[0][::-1]
    return sorted_indices

# Get user input for keywords
keywords = input("Enter keywords to search for house: ").split()

# Get hybrid recommendations
sorted_indices = hybrid_filtering(data, df, keywords)

# Print top 5 most relevant houses
print("Top 5 most relevant houses:")
for i in range(5):
    house_index = sorted_indices[i]
    house_category = df.iloc[house_index]['attributes.category_name']
    house_subject = df.iloc[house_index]['attributes.subject']
    house_body = df.iloc[house_index]['attributes.body']
    house_region = df.iloc[house_index]['attributes.region_name']
    house_property_type = df.iloc[house_index]['attributes.property_type_name']
    relevance_score = hybrid_scores[0][house_index]
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Relevance Score: {relevance_score:.2f})")


Column Names:
Index(['Timestamp', 'Username', 'Name', 'Gender', 'Marital Status',
       'Please select your age group', 'Occupation',
       'In which state are you currently staying, or have you been staying?',
       'Where do you live in Johor', 'traffic\n', 'air_pollution_johor\n',
       'sound_pollution_johor', 'crime_rate_johor', 'overall_rating_johor',
       'Review_johor', 'Where do you live in Melaka', 'traffic_melaka\n',
       'air_pollution_melaka\n', 'sound_pollution_melaka', 'crime_rate_melaka',
       'overall_rating_melaka', 'Review_melaka',
       'Where do you live in Negeri Sembilan', 'traffic_ns\n',
       'air_pollution_ns\n', 'sound_pollution_ns', 'crime_rate_ns',
       'overall_rating_ns', 'Review_ns'],
      dtype='object')


KeyError: "None of [Index(['attributes.category_name', 'attributes.subject', 'attributes.body',\n       'attributes.region_name', 'attributes.property_type_name'],\n      dtype='object')] are in the [columns]"

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# Concatenate selected columns into a single text column
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
house_data['text'] = house_data[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# User input for keywords
keywords = input("Enter keywords to search for a house: ").split()

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(house_data['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Preprocess ratings data if necessary

# Function to calculate collaborative filtering scores
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    return 1 / (1 + rmse)

# Calculate collaborative filtering scores for each region
collab_scores = {}
for region in ['Johor', 'Melaka', 'Negeri Sembilan']:
    user_item_matrix = pd.pivot_table(ratings_data, values=[f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']], index='Username')
    collab_scores[region] = calculate_collab_score(user_item_matrix)

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for region, index in zip(['Johor', 'Melaka', 'Negeri Sembilan'], [sorted_indices[:5], sorted_indices[5:10], sorted_indices[10:15]]):
    for i, house_index in enumerate(index):
        content_score = cos_sim[0][house_index]
        collab_score = collab_scores[region]
        combined_score = (content_weight * content_score) + (collab_weight * collab_score)
        combined_scores[(region, i+1)] = {
            "House Index": house_index,
            "Combined Score": combined_score
        }

# Sort combined scores and print the top 5 houses
print("\nTop 5 Houses based on Hybrid Filtering:")
sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1]['Combined Score'], reverse=True)[:5]
for i, (region, house_data) in enumerate(sorted_combined_scores):
    house_index = house_data['House Index']
    house_category = house_data['attributes.category_name']
    house_subject = house_data['attributes.subject']
    house_body = house_data['attributes.body']
    house_region = house_data['attributes.region_name']
    house_property_type = house_data['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Combined Score: {house_data['Combined Score']:.2f})")


Enter keywords to search for a house:  melaka,flat


KeyError: 'traffic.johor'

In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# Concatenate selected columns into a single text column
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
house_data['text'] = house_data[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# User input for keywords
keywords = input("Enter keywords to search for a house: ").split()

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(house_data['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Function to calculate collaborative filtering scores
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    return 1 / (1 + rmse)

# Calculate collaborative filtering scores for each region
collab_scores = {}
for region in ['Johor', 'Melaka', 'Negeri Sembilan']:
    metrics = [f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']]
    if all(metric in ratings_data.columns for metric in metrics):
        user_item_matrix = pd.pivot_table(ratings_data, values=metrics, index='Username')
        collab_scores[region] = calculate_collab_score(user_item_matrix)

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for region, index in zip(['Johor', 'Melaka', 'Negeri Sembilan'], [sorted_indices[:5], sorted_indices[5:10], sorted_indices[10:15]]):
    for i, house_index in enumerate(index):
        content_score = cos_sim[0][house_index]
        if region in collab_scores:
            collab_score = collab_scores[region]
            combined_score = (content_weight * content_score) + (collab_weight * collab_score)
        else:
            combined_score = content_score
        combined_scores[(region, i+1)] = {
            "House Index": house_index,
            "Combined Score": combined_score
        }

# Sort combined scores and print the top 5 houses
print("\nTop 5 Houses based on Hybrid Filtering:")
sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1]['Combined Score'], reverse=True)[:5]
for i, (region, house_data) in enumerate(sorted_combined_scores):
    house_index = house_data['House Index']
    house_category = house_data['attributes.category_name']
    house_subject = house_data['attributes.subject']
    house_body = house_data['attributes.body']
    house_region = house_data['attributes.region_name']
    house_property_type = house_data['attributes.property_type_name']
    relevance_score = cos_sim[0][house_index]
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Combined Score: {house_data['Combined Score']:.2f})")


Enter keywords to search for a house:  johor



Top 5 Houses based on Hybrid Filtering:


KeyError: 'attributes.category_name'

In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# Concatenate selected columns into a single text column
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
house_data['text'] = house_data[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# User input for keywords
keywords = input("Enter keywords to search for a house: ").split()

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(house_data['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords)
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Function to calculate collaborative filtering scores
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    return 1 / (1 + rmse)

# Calculate collaborative filtering scores for each region
collab_scores = {}
for region in ['Johor', 'Melaka', 'Negeri Sembilan']:
    metrics = [f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']]
    if all(metric in ratings_data.columns for metric in metrics):
        user_item_matrix = pd.pivot_table(ratings_data, values=metrics, index='Username')
        collab_scores[region] = calculate_collab_score(user_item_matrix)

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for region, index in zip(['Johor', 'Melaka', 'Negeri Sembilan'], [sorted_indices[:5], sorted_indices[5:10], sorted_indices[10:15]]):
    for i, house_index in enumerate(index):
        content_score = cos_sim[0][house_index]
        if region in collab_scores:
            collab_score = collab_scores[region]
            combined_score = (content_weight * content_score) + (collab_weight * collab_score)
        else:
            combined_score = content_score
        combined_scores[(region, i+1)] = {
            "House Index": house_index,
            "Combined Score": combined_score
        }

# Sort combined scores and print the top 5 houses
print("\nTop 15 Houses based on Hybrid Filtering:")
for i, (region, data) in enumerate(combined_scores.items()):
    house_index = data['House Index']
    house_category = house_data.iloc[house_index]['attributes.category_name']
    house_subject = house_data.iloc[house_index]['attributes.subject']
    house_body = house_data.iloc[house_index]['attributes.body']
    house_region = house_data.iloc[house_index]['attributes.region_name']
    house_property_type = house_data.iloc[house_index]['attributes.property_type_name']
    print(f"{i+1}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Combined Score: {data['Combined Score']:.2f})")


Enter keywords to search for a house:  apartment,johor



Top 15 Houses based on Hybrid Filtering:
1. Category: apartment condominium, Subject: apartment sell, Body: selling furnished renovation price nego, Region: negeri sembilan, Property Type: apartment (Combined Score: 0.61)
2. Category: apartment condominium, Subject: apartment sale, Body: apartment sale cempaka court 3rd floor, Region: negeri sembilan, Property Type: apartment (Combined Score: 0.58)
3. Category: apartment condominium, Subject: ara ria apartment, Body: negotiable, Region: negeri sembilan, Property Type: apartment (Combined Score: 0.54)
4. Category: apartment condominium, Subject: villa krystall room apartment, Body: bumi lot call, Region: johor, Property Type: apartment (Combined Score: 0.53)
5. Category: apartment condominium, Subject: bukit saujana apartment, Body: apartment saujana jb pahlawan bukit saujana apartment tingkat keluasan sq foot 3 room 2 bathroom, Region: johor, Property Type: apartment (Combined Score: 0.50)
6. Category: apartment condominium, Subject: 

In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# User input for keywords and region
keywords_region = input("Enter keywords and region (e.g., 'apartment,johor'): ")
keywords, region = keywords_region.split(',')

# Filter houses from the specified region
region_filter = house_data['attributes.region_name'].str.lower() == region.strip().lower()
region_houses = house_data[region_filter]

# Concatenate selected columns into a single text column
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
region_houses['text'] = region_houses[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(region_houses['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords.split())
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Function to calculate collaborative filtering scores
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    return 1 / (1 + rmse)

# Calculate collaborative filtering scores for the specified region
collab_scores = {}
metrics = [f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']]
if all(metric in ratings_data.columns for metric in metrics):
    user_item_matrix = pd.pivot_table(ratings_data, values=metrics, index='Username')
    collab_scores[region] = calculate_collab_score(user_item_matrix)

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for i, house_index in enumerate(sorted_indices[:15]):
    content_score = cos_sim[0][house_index]
    if region in collab_scores:
        collab_score = collab_scores[region]
        combined_score = (content_weight * content_score) + (collab_weight * collab_score)
    else:
        combined_score = content_score
    combined_scores[i+1] = {
        "House Index": house_index,
        "Combined Score": combined_score
    }

# Sort combined scores and print the top 15 houses
print("\nTop 15 Houses based on Hybrid Filtering:")
for i, data in combined_scores.items():
    house_index = data['House Index']
    house_category = region_houses.iloc[house_index]['attributes.category_name']
    house_subject = region_houses.iloc[house_index]['attributes.subject']
    house_body = region_houses.iloc[house_index]['attributes.body']
    house_region = region_houses.iloc[house_index]['attributes.region_name']
    house_property_type = region_houses.iloc[house_index]['attributes.property_type_name']
    print(f"{i}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Combined Score: {data['Combined Score']:.2f})")


Enter keywords and region (e.g., 'apartment,johor'):  flat,melaka


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_houses['text'] = region_houses[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)



Top 15 Houses based on Hybrid Filtering:
1. Category: apartment condominium, Subject: cheng ria flat sale, Body: near shop easy access highway good environment, Region: melaka, Property Type: flat (Combined Score: 0.62)
2. Category: apartment condominium, Subject: banda hilir flat sale, Body: banda hilir flat sale view please contact 014-3384930, Region: melaka, Property Type: flat (Combined Score: 0.56)
3. Category: apartment condominium, Subject: 3rd floor taman merdeka jaya flat batu berendam melaka, Body: flat taman merdeka jaya melaka sale rm110k level sqft bedroom bathroom leasehold non bumi chester property sdn bhd 016-666 raymond 010-277 lawrence ren59994, Region: melaka, Property Type: flat (Combined Score: 0.52)
4. Category: apartment condominium, Subject: malim flat good invesment rental go, Body: sale malim flat lvl lh 65+- 668sqft 3bedroom 1bathroom rm88k, Region: melaka, Property Type: flat (Combined Score: 0.49)
5. Category: apartment condominium, Subject: nearby infine

In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# User input for keywords and region
keywords_region = input("Enter keywords and region (e.g., 'apartment,johor'): ")
keywords, region = keywords_region.split(',')

# Filter houses from the specified region
region_filter = house_data['attributes.region_name'].str.lower() == region.strip().lower()
region_houses = house_data[region_filter]

# Concatenate selected columns into a single text column
selected_columns = ['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']
region_houses['text'] = region_houses[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(region_houses['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords.split())
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Function to calculate collaborative filtering scores and RMSE
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    collab_score = 1 / (1 + rmse)
    return collab_score, rmse

# Calculate collaborative filtering scores and RMSE for the specified region
collab_scores = {}
metrics = [f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']]
if all(metric in ratings_data.columns for metric in metrics):
    user_item_matrix = pd.pivot_table(ratings_data, values=metrics, index='Username')
    collab_scores[region], rmse = calculate_collab_score(user_item_matrix)
    print(f"RMSE for {region}: {rmse:.2f}")

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for i, house_index in enumerate(sorted_indices[:15]):
    content_score = cos_sim[0][house_index]
    if region in collab_scores:
        collab_score = collab_scores[region]
        combined_score = (content_weight * content_score) + (collab_weight * collab_score)
    else:
        combined_score = content_score
    combined_scores[i+1] = {
        "House Index": house_index,
        "Combined Score": combined_score
    }

# Sort combined scores and print the top 15 houses
print("\nTop 15 Houses based on Hybrid Filtering:")
for i, data in combined_scores.items():
    house_index = data['House Index']
    house_category = region_houses.iloc[house_index]['attributes.category_name']
    house_subject = region_houses.iloc[house_index]['attributes.subject']
    house_body = region_houses.iloc[house_index]['attributes.body']
    house_region = region_houses.iloc[house_index]['attributes.region_name']
    house_property_type = region_houses.iloc[house_index]['attributes.property_type_name']
    print(f"{i}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Combined Score: {data['Combined Score']:.2f})")


Enter keywords and region (e.g., 'apartment,johor'):  flat,negeri sembilan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_houses['text'] = region_houses[selected_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)



Top 15 Houses based on Hybrid Filtering:
1. Category: apartment condominium, Subject: flat bilik untuk dijual, Body: flat seri rasah bilik untuk dijual 59k berdekatan uitm s3 contact, Region: negeri sembilan, Property Type: flat (Combined Score: 0.59)
2. Category: apartment condominium, Subject: flat rasah jaya ground floor, Body: flat rasah jaya seremban sale ground floor hot selling basic unit 100k call info 017-2256630 arasi, Region: negeri sembilan, Property Type: flat (Combined Score: 0.50)
3. Category: apartment condominium, Subject: flat nibong sikamat n9 murah untuk dijual, Body: wts flat termurah taman bukit nibong sikamat seremban tenure freehold bumi bilik bilik air tingkat saiz sqft asking price rm75k nego, Region: negeri sembilan, Property Type: flat (Combined Score: 0.48)
4. Category: apartment condominium, Subject: flat murah tingkat cempaka seremban utk dijual, Body: 2nd floor flat sale seremban lokasi cempaka b jalan s2 g5 garden avenue seremban seremban n.9 harga rm9

In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# User input for keywords and region
keywords_region = input("Enter keywords and region (e.g., 'apartment,johor'): ")
keywords, region = keywords_region.split(',')

# Filter houses from the specified region
region_filter = house_data['attributes.region_name'].str.lower() == region.strip().lower()
region_houses = house_data[region_filter]

# Concatenate selected columns into a single text column
region_houses['text'] = region_houses[['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(region_houses['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords.split())
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Function to calculate collaborative filtering scores and RMSE
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    collab_score = 1 / (1 + rmse)
    return collab_score, rmse

# Calculate collaborative filtering scores and RMSE for the specified region
collab_scores = {}
metrics = [f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']]
if all(metric in ratings_data.columns for metric in metrics):
    user_item_matrix = pd.pivot_table(ratings_data, values=metrics, index='Username')
    collab_scores[region], rmse = calculate_collab_score(user_item_matrix)
    print(f"Relevant Score: {cos_sim.max().max():.2f}, RMSE: {rmse:.2f}")

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for i, house_index in enumerate(sorted_indices[:15]):
    content_score = cos_sim[0][house_index]
    if region in collab_scores:
        collab_score = collab_scores[region]
        combined_score = (content_weight * content_score) + (collab_weight * collab_score)
    else:
        combined_score = content_score
    combined_scores[i+1] = {
        "House Index": house_index,
        "Content Score": content_score,
        "Combined Score": combined_score
    }

# Sort combined scores and print the top 15 houses
print("\nTop 15 Houses based on Hybrid Filtering:")
for i, data in combined_scores.items():
    house_index = data['House Index']
    house_category = region_houses.iloc[house_index]['attributes.category_name']
    house_subject = region_houses.iloc[house_index]['attributes.subject']
    house_body = region_houses.iloc[house_index]['attributes.body']
    house_region = region_houses.iloc[house_index]['attributes.region_name']
    house_property_type = region_houses.iloc[house_index]['attributes.property_type_name']
    print(f"{i}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Content Score: {data['Content Score']:.2f}, Combined Score: {data['Combined Score']:.2f})")


Enter keywords and region (e.g., 'apartment,johor'):  flat,johor


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_houses['text'] = region_houses[['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']].fillna('').apply(lambda x: ' '.join(x), axis=1)



Top 15 Houses based on Hybrid Filtering:
1. Category: apartment condominium, Subject: flat austin perdana, Body: high floor, Region: johor, Property Type: flat (Content Score: 0.68, Combined Score: 0.68)
2. Category: apartment condominium, Subject: tun aminah low cost flat sale, Body: sale tun aminah flat blok flat low cost flat third floor bedroom bathroom 494sqft freehold basic renovated selling price rm150k, Region: johor, Property Type: flat (Content Score: 0.65, Combined Score: 0.65)
3. Category: apartment condominium, Subject: skudai flat taman ungku tun aminah sale, Body: sale flat tun aminah low cost flat highest floor build sqft bedroom bathroom freehold non bumi selling price rm135k nego, Region: johor, Property Type: flat (Content Score: 0.61, Combined Score: 0.61)
4. Category: apartment condominium, Subject: flat sale, Body: sale flat camar perling  level  r b  sq ft  non-bumi  freehold  balcony  fully tile  table top  cabinet harga rm160k , Region: johor, Property Type: f

In [31]:
# Load house data
house_data_path = r"C:\Users\User\Desktop\dataset\mudah_all_dataV2.csv"
house_data = pd.read_csv(house_data_path)

# User input for keywords and region
keywords_region = input("Enter keywords and region (e.g., 'apartment,johor'): ")
keywords, region = keywords_region.split(',')

# Filter houses from the specified region
region_filter = house_data['attributes.region_name'].str.lower() == region.strip().lower()
region_houses = house_data[region_filter]

# Concatenate selected columns into a single text column
region_houses['text'] = region_houses[['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize and fit TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(region_houses['text'])

# Transform user input into a document vector
user_document = ' '.join(keywords.split())
user_vector = tfidf_vectorizer.transform([user_document])

# Calculate cosine similarity between user input and house data
cos_sim = cosine_similarity(user_vector, tfidf_matrix)

# Get indices of houses sorted by relevance
sorted_indices = cos_sim.argsort()[0][::-1]

# Load user ratings data
ratings_data_path = r"C:\Users\User\Desktop\response.csv"
ratings_data = pd.read_csv(ratings_data_path)

# Function to calculate collaborative filtering scores and RMSE
def calculate_collab_score(user_item_matrix):
    user_item_matrix.fillna(0, inplace=True)
    X = user_item_matrix.values
    model = NMF(n_components=20, init='random', random_state=20)
    W = model.fit_transform(X)
    X_reconstructed = np.dot(W, model.components_)
    rmse = np.sqrt(np.mean((X - X_reconstructed)**2))
    collab_score = 1 / (1 + rmse)
    return collab_score, rmse

# Calculate collaborative filtering scores and RMSE for the specified region
collab_scores = {}
metrics = [f'{metric}.{region.lower()}' for metric in ['traffic', 'air_pollution', 'sound_pollution', 'crime_rate', 'overall_rating']]
if all(metric in ratings_data.columns for metric in metrics):
    user_item_matrix = pd.pivot_table(ratings_data, values=metrics, index='Username')
    collab_scores[region], rmse = calculate_collab_score(user_item_matrix)
    print(f"Relevant Score: {cos_sim.max().max():.2f}, RMSE: {rmse:.2f}")

# Weight for each method (content-based and collaborative)
content_weight = 0.7
collab_weight = 0.3

# Combine scores for each house
combined_scores = {}
for i, house_index in enumerate(sorted_indices[:15]):
    content_score = cos_sim[0][house_index]
    if region in collab_scores:
        collab_score = collab_scores[region]
        combined_score = (content_weight * content_score) + (collab_weight * collab_score)
    else:
        combined_score = content_score
    combined_scores[i+1] = {
        "House Index": house_index,
        "Content Score": content_score,
        "Combined Score": combined_score
    }

# Sort combined scores and print the top 15 houses
print("\nTop 15 Houses based on Hybrid Filtering:")
for i, data in combined_scores.items():
    house_index = data['House Index']
    house_category = region_houses.iloc[house_index]['attributes.category_name']
    house_subject = region_houses.iloc[house_index]['attributes.subject']
    house_body = region_houses.iloc[house_index]['attributes.body']
    house_region = region_houses.iloc[house_index]['attributes.region_name']
    house_property_type = region_houses.iloc[house_index]['attributes.property_type_name']
    print(f"{i}. Category: {house_category}, Subject: {house_subject}, Body: {house_body}, Region: {house_region}, Property Type: {house_property_type} (Content Score: {data['Content Score']:.2f}, Combined Score: {data['Combined Score']:.2f})")


Enter keywords and region (e.g., 'apartment,johor'):  bungalow,negeri sembilan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_houses['text'] = region_houses[['attributes.category_name', 'attributes.subject', 'attributes.body', 'attributes.region_name', 'attributes.property_type_name']].fillna('').apply(lambda x: ' '.join(x), axis=1)



Top 15 Houses based on Hybrid Filtering:
1. Category: house, Subject: 1.5 storey bungalow seremban south taman senawang perdana seremban, Body: direct owner, Region: negeri sembilan, Property Type: bungalow house (Content Score: 0.53, Combined Score: 0.53)
2. Category: house, Subject: bungalow lavender height, Body: sq.ft| http //www.wasap.my/60182222293, Region: negeri sembilan, Property Type: bungalow house (Content Score: 0.48, Combined Score: 0.48)
3. Category: house, Subject: bungalow house sale seremban rasah, Body: bungalow sale seremban rasah storey bungalow big land space freehold land size x 7,760 sqft room bathroom extra land back side renovated price rm 730k detail kindly contact eric teh 016-360, Region: negeri sembilan, Property Type: bungalow house (Content Score: 0.44, Combined Score: 0.44)
4. Category: land, Subject: bungalow land rasah kemayan seremban2, Body: bungalow land sale size sq meter, Region: negeri sembilan, Property Type: residential (Content Score: 0.44, 