In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load dataset
dataset1 = pd.read_csv("preprocessed_price.csv", index_col=None)
df2 = dataset1

# Encode categorical variables
df2 = pd.get_dummies(df2, drop_first=True)

# Feature and target separation
X = df2.drop('price', axis=1)
Y = df2['price']

# Select top k features
def selectkbest(X, Y, n):
    selector = SelectKBest(score_func=f_regression, k=n)
    X_new = selector.fit_transform(X, Y)
    selected_features = X.columns[selector.get_support()]
    print("Selected Features:", selected_features.tolist())
    return X_new

# Splitting and scaling data
def split_scalar(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# Train and return R2 score for each model
def train_model(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return r2_score(y_test, y_pred), model

# Select top 2 features
kbest = selectkbest(X, Y, 2)
# Split data
X_train, X_test, y_train, y_test = split_scalar(kbest, Y)

# Train models and collect results
models = {
    "Linear Regression": LinearRegression(),
    "SVM Linear": SVR(kernel='poly'),
    "SVM Non-Linear": SVR(kernel='rbf'),
    "Decision Tree": DecisionTreeRegressor(random_state=30),
    "Random Forest": RandomForestRegressor(random_state=30, n_estimators=100)
}

results = {}
trained_models = {}

for name, model in models.items():
    r2, trained_model = train_model(model, X_train, y_train, X_test)
    results[name] = r2
    trained_models[name] = trained_model

# Find the best model
best_model_name = max(results, key=results.get)
print(best_model_name)
best_model = trained_models[best_model_name]
print(best_model)

print(f"The best model is {best_model_name} with an R2 score of {results[best_model_name]:.2f}.")

# Prediction example
#input_data = [1,419]  # Replace with the actual input for the model
#input_numpy_array = np.asarray(input_data)
#reshape = input_numpy_array.reshape(1, -1)

#prediction = best_model.predict(reshape)
#print(f"The predicted price is: {prediction[0]:.2f}")


Selected Features: ['total_sqft', 'bath']
Random Forest
RandomForestRegressor(random_state=30)
The best model is Random Forest with an R2 score of 0.55.


In [39]:
import pickle

In [40]:
filename="final_model.sav"

In [41]:
pickle.dump(model,open(filename,'wb'))

In [42]:
loaded_model=pickle.load(open("final_model.sav",'rb'))

In [43]:
result=loaded_model.predict([[2600,5]])

In [44]:
print(result)

[684.8]
