# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

# Read the preprocessed price data from disk
dataset1 = pd.read_csv("preprocessed_price.csv", index_col=None)

# One-hot encode the categorical columns, dropping the first level of each
df2 = pd.get_dummies(dataset1, drop_first=True)

# Target is the 'price' column; every other column is a feature
Y = df2['price']
X = df2.drop(columns='price')

# Function to select top k features
def selectkbest(X, Y, n):
    """Keep the ``n`` features of ``X`` that score highest against ``Y``.

    Features are ranked with ``f_regression`` via ``SelectKBest``. The
    selector is fitted on exactly the data passed in.

    Args:
        X: Feature matrix.
        Y: Regression target aligned with the rows of ``X``.
        n: Number of features to keep (forwarded as ``k``).

    Returns:
        The reduced feature matrix produced by the fitted selector.
    """
    top_k = SelectKBest(score_func=f_regression, k=n)
    top_k.fit(X, Y)
    return top_k.transform(X)

# Splitting and scaling data
def split_scalar(X, Y):
    """Split into train/test partitions and standardise the features.

    20% of the rows are held out for testing with a fixed seed
    (``random_state=42``). The ``StandardScaler`` is fitted on the training
    features only and then applied to both partitions.

    Args:
        X: Feature matrix.
        Y: Target values aligned with the rows of ``X``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the two feature
        partitions are scaled and the targets are returned unscaled.
    """
    partitions = train_test_split(X, Y, test_size=0.2, random_state=42)
    train_features, test_features, train_target, test_target = partitions

    # Fit on the training partition only so test statistics never influence scaling.
    standardizer = StandardScaler().fit(train_features)

    return (
        standardizer.transform(train_features),
        standardizer.transform(test_features),
        train_target,
        test_target,
    )

# Regression model functions
def Linear(X_train, y_train, X_test, y_true=None):
    """Tune a linear regression with grid search and score it on the test set.

    Runs a 5-fold ``GridSearchCV`` over ``fit_intercept`` on the training
    data, prints the best parameters, predicts on ``X_test`` and returns the
    R2 score of those predictions.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Test feature matrix to predict on.
        y_true: True targets for ``X_test``. When omitted, the module-level
            ``y_test`` is used so existing three-argument calls keep working.

    Returns:
        R2 score of the refitted best estimator on the test data.
    """
    params = {'fit_intercept': [True, False]}
    grid = GridSearchCV(LinearRegression(), param_grid=params, cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    y_pred = grid.predict(X_test)

    # Prefer the explicit argument; the global is only read as a fallback.
    expected = y_test if y_true is None else y_true
    return r2_score(expected, y_pred)

def svm_linear(X_train, y_train, X_test, y_true=None):
    """Tune a support vector regressor with grid search and score it.

    Despite the name, the grid covers the ``linear``, ``rbf`` and ``poly``
    kernels. The search spans 72 parameter combinations with 5-fold
    cross-validation; the best parameters are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Test feature matrix to predict on.
        y_true: True targets for ``X_test``. When omitted, the module-level
            ``y_test`` is used so existing three-argument calls keep working.

    Returns:
        R2 score of the refitted best estimator on the test data.
    """
    params = {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.1, 0.5, 1],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }
    grid = GridSearchCV(SVR(), param_grid=params, cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    y_pred = grid.predict(X_test)

    # Prefer the explicit argument; the global is only read as a fallback.
    expected = y_test if y_true is None else y_true
    return r2_score(expected, y_pred)


def Decision(X_train, y_train, X_test, y_true=None):
    """Tune a decision-tree regressor with grid search and score it.

    Searches 108 parameter combinations with 5-fold cross-validation, prints
    the best parameters, and returns the R2 score on the test data.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Test feature matrix to predict on.
        y_true: True targets for ``X_test``. When omitted, the module-level
            ``y_test`` is used so existing three-argument calls keep working.

    Returns:
        R2 score of the refitted best estimator on the test data.
    """
    params = {
        'max_depth': [5, 10, 20, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 'auto' was removed from scikit-learn in 1.3. For regressors it meant
        # "consider all features", which None expresses on every version.
        'max_features': [None, 'sqrt', 'log2']
    }
    grid = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid=params, cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    y_pred = grid.predict(X_test)

    # Prefer the explicit argument; the global is only read as a fallback.
    expected = y_test if y_true is None else y_true
    return r2_score(expected, y_pred)

def random(X_train, y_train, X_test, y_true=None):
    """Tune a random-forest regressor with grid search and score it.

    Searches 324 parameter combinations with 5-fold cross-validation, prints
    the best parameters, and returns the R2 score on the test data.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Test feature matrix to predict on.
        y_true: True targets for ``X_test``. When omitted, the module-level
            ``y_test`` is used so existing three-argument calls keep working.

    Returns:
        R2 score of the refitted best estimator on the test data.
    """
    params = {
        'n_estimators': [100, 200, 500],
        'max_depth': [10, 20, 50, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 'auto' was removed from scikit-learn in 1.3. For regressors it meant
        # "consider all features", which None expresses on every version.
        'max_features': [None, 'sqrt', 'log2']
    }
    grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=params, cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    y_pred = grid.predict(X_test)

    # Prefer the explicit argument; the global is only read as a fallback.
    expected = y_test if y_true is None else y_true
    return r2_score(expected, y_pred)

# Reduce the feature matrix to its two highest-scoring columns.
# NOTE(review): selection is fitted on all rows before the train/test split,
# so test rows influence which features are kept - confirm this is intended.
kbest = selectkbest(X, Y, 2)

# Hold out a test set and standardise both partitions
X_train, X_test, y_train, y_test = split_scalar(kbest, Y)

# One list of test-set R2 scores per model
acclin, accsvml, accdes, accrf = [], [], [], []

for i in range(1):  # Single pass; the loop is kept only for consistency
    # Linear regression
    r2_lin = Linear(X_train, y_train, X_test)
    acclin.append(r2_lin)

    # Support vector regression
    r2_sl = svm_linear(X_train, y_train, X_test)
    accsvml.append(r2_sl)

    # Decision tree
    r2_d = Decision(X_train, y_train, X_test)
    accdes.append(r2_d)

    # Random forest
    r2_r = random(X_train, y_train, X_test)
    accrf.append(r2_r)

# Selecting the best model
def selectk_regression(acclin, accsvml, accsvmnl, accdes, accrf):
    """Pick the model with the highest mean R2 score.

    Models whose score collection is empty are left out of the comparison.
    Averaging an empty list yields NaN, and a NaN in first position would
    otherwise be returned by ``max`` regardless of the other scores.

    Args:
        acclin: R2 scores for linear regression.
        accsvml: R2 scores for the SVM model.
        accsvmnl: Accepted for interface compatibility; not used in the
            comparison.
        accdes: R2 scores for the decision tree.
        accrf: R2 scores for the random forest.

    Returns:
        Tuple ``(model_name, mean_r2)`` for the best-scoring model.

    Raises:
        ValueError: If every compared model has an empty score collection.
    """
    candidates = {
        "Linear Regression": acclin,
        "SVM Linear": accsvml,
        "Decision Tree": accdes,
        "Random Forest": accrf,
    }
    # np.size handles lists, arrays and bare scalars alike.
    results = {
        name: np.mean(scores)
        for name, scores in candidates.items()
        if np.size(scores) > 0
    }
    if not results:
        raise ValueError("no R2 scores were supplied for any model")

    best_model = max(results, key=results.get)
    return best_model, results[best_model]

# Get the result
# No separate non-linear SVM scores are collected in this script, so an empty
# list fills the `accsvmnl` slot; the previously referenced name was never
# defined and raised NameError here.
result = selectk_regression(acclin, accsvml, [], accdes, accrf)

# Print the best model and its R2 score
print(f"The best model is {result[0]} with an R2 score of {result[1]:.2f}.")
