In [8]:
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('full data.csv') #place the path of the data

# Remove commas from 'Land Price (GHS)' and convert it to float
data['Land Price (GHS)'] = data['Land Price (GHS)'].replace({',': ''}, regex=True).astype(float)

# Separate features (Size, Distance) and target (Price)
X = data[['Size of Plot (sq. meters)', 'Distance from Airport (km)','Proximity to Main Road (km)','Proximity to City Center (km)',]].values
y = data['Land Price (GHS)'].values
# Normalize the features
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X_norm = (X - X_mean) / X_std
print(f"X_mean:{X_mean}")
print(f"X_std:{X_std}")
# Add a column of 1's for the bias term (theta_0)
m = len(y)  # number of training examples
X_b = np.c_[np.ones((m, 1)), X_norm]  # Add bias term (intercept)

# Define the cost function (Mean Squared Error)
def compute_cost(X_b, y, theta):
    m = len(y)
    predictions = X_b.dot(theta)  # Predictions based on current theta
    cost = (1/(2*m)) * np.sum((predictions - y) ** 2)
    return cost
# Define the gradient descent function
# Stochastic Gradient Descent function
start_time=time.Time()
def stochastic_gradient_descent(X_b, y, theta, alpha, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)
    
    for i in range(iterations):
        # Shuffle the data to avoid cycles
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = X_b[shuffled_indices]
        y_shuffled = y[shuffled_indices]
        
        for j in range(m):
            xi = X_b_shuffled[j:j+1]  # Single training example
            yi = y_shuffled[j:j+1]    # Corresponding target
            prediction = xi.dot(theta)  # Predict using current theta
            error = prediction - yi
            gradients = xi.T.dot(error)  # Compute gradient with the single example
            theta= theta - alpha * gradients  # Update theta
        
        # Store cost after each iteration over the entire dataset
        cost_history[i] = compute_cost(X_b, y, theta)
    
    return theta, cost_history
endtime=time.Time()
# Gradient Descent parameters
#alpha = 0.00359  # Learning rate
alpha = 0.2  # Learning rate
iterations = 1000  # Number of iterations
theta = np.zeros(X_b.shape[1])  # Initialize parameters (theta) with zeros

# Run Gradient Descent
theta_optimal, cost_history = stochastic_gradient_descent(X_b, y, theta, alpha, iterations)

# Print final theta values and final cost
print(f"Optimal theta: {theta_optimal}")
print(f"Final cost: {cost_history[-1]}")
print(f"xnorm: {X_norm}")

# Plot cost function history
plt.plot(range(iterations), cost_history, 'b-', label="Cost Function")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.title("Cost Function over Iterations")
plt.legend()
plt.show()

# Select only the "Size of Plot" as the feature for plotting (ignoring "Distance from Airport")
X_size = X_norm[:, 0]  # Normalized 'Size of Plot' column
theta_2d = theta_optimal[[0, 1]]  # Only use the intercept and theta for "Size of Plot"

# Plot the dataset
plt.scatter(X_size, y, color='blue', label="Actual Prices")

# Calculate the regression line (using optimal theta)
predicted_prices = X_b[:, [0, 1]].dot(theta_2d)  # Only use intercept and first feature (size)
plt.plot(X_size, predicted_prices, color='red', label="Regression Line")

# Adding labels and title
plt.xlabel("Normalized Size of Plot (sq. meters)")
plt.ylabel("Land Price (GHS)")
plt.title("Land Price vs Size of Plot with Regression Line")
plt.legend()
plt.show()
new_data = np.array([[600, 5, 10, 10]])  # Example size and distance

# Normalize the new input data
new_data_norm = (new_data - X_mean) / X_std
new_data_b = np.c_[np.ones((1, 1)), new_data_norm]  # Add bias term
print(f"newnorm:{new_data_norm}")
# Predict using the optimal theta
predicted_price = new_data_b.dot(theta_optimal)
print(f"Predicted land price: {predicted_price[0]}")
# Prediction
predictions = new_data_b.dot(theta_optimal)


X_mean:[365.     11.3     4.225  16.775]
X_std:[134.48977656   4.34568752   1.99671605   6.31956288]


NameError: name 'time' is not defined