# Dataset Cleaning

In [17]:
import pandas as pd
import numpy as np

#========================================================

# Load the raw listings.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that folder exists; consider a path relative to the notebook.
df = pd.read_csv(r"C:\Users\Kuria\OneDrive\Documents\IO Project\house_price_prediction.csv")

# Missing-value count per column, before any cleaning.
print(df.isnull().sum())

# Columns that must hold a usable numeric value for a row to be kept.
critical_cols = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'abc', 'Price']

# Blank / whitespace-only strings are missing values too: turn them into NaN
# first so the dropna below catches them.
df[critical_cols] = df[critical_cols].replace(r'^\s*$', np.nan, regex=True)
df = df.dropna(subset=critical_cols)

# drop_duplicates() returns a new frame; without assigning the result back
# the call has no effect.
df = df.drop_duplicates()

# Coerce every critical column to numeric; unparseable values become NaN.
# 'YearBuilt' is included because the feature-engineering step subtracts it
# from an integer year.
for col in critical_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Rows whose core fields failed the conversion are unusable: drop them.
df = df.dropna(subset=['Area', 'Bedrooms', 'Floors', 'YearBuilt', 'Price'])

# Median-impute what can still be missing. After the dropna above only
# 'Bathrooms' can contain NaN among the imputed columns. Assign the result
# back instead of `df[col].fillna(..., inplace=True)`: the in-place form acts
# on a temporary Series and is not guaranteed to update `df`.
df['Bathrooms'] = df['Bathrooms'].fillna(df['Bathrooms'].median())

# Rows were removed above, so renumber the index 0..n-1. Later cells build
# frames positionally and concatenate them column-wise with `df`, which only
# lines up when the index has no gaps.
df = df.reset_index(drop=True)

print()
print()
df.head()

Id           0
Area         0
Bedrooms     0
Bathrooms    0
Floors       0
YearBuilt    0
Location     0
Condition    0
Garage       0
abc          0
Price        0
dtype: int64




Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,abc,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,18,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,4,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,15,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,3,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,22,636056


# Column Encoding and Feature Engineering

In [18]:
# Reference year used to turn the construction year into an age in years.
current_year = 2025

# Guarded so the cell can be re-run: after the first run 'YearBuilt' has
# already been dropped and 'House_Age' already exists.
if 'YearBuilt' in df.columns:
    df['House_Age'] = current_year - df['YearBuilt']

# Remove the raw year plus the columns not used as model features.
# errors='ignore' keeps a second execution of this cell from raising KeyError
# on columns that are already gone.
df = df.drop(
    columns=['YearBuilt', 'abc', 'Id', 'Bathrooms', 'Condition', 'Garage'],
    errors='ignore',
)



In [19]:
import joblib
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode ("Condition" and "Garage" are no longer in df).
categorical_cols = ["Location"]

# Dense output so the result can be wrapped in a DataFrame directly;
# categories unseen at fit time encode as all-zero rows instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit on the current data and encode it in one step.
encoded = encoder.fit_transform(df[categorical_cols])

# Wrap the encoded array in a DataFrame that carries df's own index.
# Without `index=df.index` the new frame gets a fresh 0..n-1 index, and the
# column-wise concat below (which aligns on index labels) would pair encoded
# rows with the wrong houses and introduce NaN rows whenever earlier cleaning
# removed rows from df.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Replace the original categorical columns with their one-hot columns.
df_encoded = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

print(df_encoded.head())

# Persist the fitted encoder so the same category mapping can be reloaded.
joblib.dump(encoder, "encoder.pkl")



   Area  Bedrooms  Floors   Price  House_Age  Location_Downtown  \
0  1360         5       3  149919         55                1.0   
1  4272         5       3  424998         67                1.0   
2  3592         2       3  266746         87                1.0   
3   966         4       2  244020        123                0.0   
4  4926         1       2  636056         50                1.0   

   Location_Rural  Location_Suburban  Location_Urban  
0             0.0                0.0             0.0  
1             0.0                0.0             0.0  
2             0.0                0.0             0.0  
3             0.0                1.0             0.0  
4             0.0                0.0             0.0  


['encoder.pkl']

In [21]:
# Write the cleaned frame to disk. This saves `df`, which still holds the raw
# 'Location' strings, not the one-hot encoded `df_encoded`.
df.to_csv('cleaned8_house_price_prediction.csv', index=False)

# Preview what was written. Only the last expression of a cell is displayed,
# so the preview has to come after the write.
df.head()


# Training & Testing Dataset

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

# 1. LOAD THE CLEANED CSV
# NOTE(review): the cleaning section writes this file with a relative path,
# while it is read here through an absolute one. They are the same file only
# if the notebook runs from that folder -- confirm.
df = pd.read_csv(r"C:\Users\Kuria\OneDrive\Documents\IO Project\cleaned8_house_price_prediction.csv")

# 2. SELECT FEATURES & TARGET
target = "Price"
X = df.drop(columns=[target])
y = df[target]

# 3. IDENTIFY NUMERIC & CATEGORICAL COLUMNS
numeric_cols = ['Area', 'Bedrooms', 'Floors', 'House_Age']
categorical_cols = ['Location']

print("Numeric:", numeric_cols)
print("Categorical:", categorical_cols)

# 4. PREPROCESSING (One-Hot Encoding + Scaling)
# Numeric features are standardised; 'Location' is one-hot encoded, and
# categories not seen during fit are encoded as all zeros.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# 5. DEFINE SVM MODEL
# RBF kernel chosen as a general-purpose non-linear default.
svm_model = SVR(kernel="rbf", C=100, gamma="scale")

# SVR is sensitive to the scale of the target: with prices in the hundreds of
# thousands, the default epsilon (0.1) and C=100 leave the model predicting
# close to a constant. Standardise the target for fitting and map predictions
# back to price units automatically.
scaled_target_svm = TransformedTargetRegressor(
    regressor=svm_model,
    transformer=StandardScaler(),
)

# 6. MAKE A PIPELINE
# Preprocessing lives inside the pipeline, so it is fitted on the training
# split only and re-applied unchanged to the test split.
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("svm", scaled_target_svm)
])

# 7. SPLIT DATA (fixed seed for a reproducible 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. TRAIN
model.fit(X_train, y_train)

# 9. PREDICT (predictions are returned in original price units)
y_pred = model.predict(X_test)

# 10. EVALUATE
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R² Score:", r2)

