In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/housing-prices-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/yasserh/housing-prices-dataset?dataset_version_number=1...


100%|██████████| 4.63k/4.63k [00:00<00:00, 2.37MB/s]

Extracting files...
Path to dataset files: C:\Users\26har\.cache\kagglehub\datasets\yasserh\housing-prices-dataset\versions\1





In [3]:
# Import shutil library for file operations
import shutil

# Define source and destination paths for dataset
source = "/root/.cache/kagglehub/datasets/msambare/fer2013/versions/1"
destination = "./dataset"

# Move dataset files from source to destination folder (display where moved to for referrence)
shutil.move(path, destination)
print(f"Files moved to {destination}")

Files moved to ./dataset


### Import Required Libraries
We import pandas to handle data manipulation and analysis.

### Load the Dataset
We use `pd.read_csv` to read the `Housing.csv` file located in the `dataset` folder and load it into a DataFrame (`df`).

### Preview the Dataset
The `head()` method is used to display the first 5 rows of the dataset for a quick overview of its structure and contents.


In [4]:
import pandas as pd

df = pd.read_csv("./dataset/Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### View Column Names
The `columns` attribute is used to list all the column names in the dataset. This helps to understand the structure and features available in the dataset.


In [5]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

### Check Dataset Dimensions
The `shape` attribute returns the dimensions of the dataset as a tuple `(rows, columns)`, indicating the number of rows and columns in the DataFrame.

In [7]:
df.shape

(545, 13)

### Check Data Types
The `dtypes` attribute displays the data type of each column in the dataset, helping to understand the type of data stored in each feature.


In [8]:
df.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

### Check for Missing Values
The `isnull()` method returns a DataFrame of the same shape as `df`, where each entry is `True` if the value is missing (`NaN`) and `False` otherwise. This helps identify missing data in the dataset.


In [9]:
df.isnull()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,False,False,False,False,False,False,False,False,False,False,False,False,False
541,False,False,False,False,False,False,False,False,False,False,False,False,False
542,False,False,False,False,False,False,False,False,False,False,False,False,False
543,False,False,False,False,False,False,False,False,False,False,False,False,False


### Summarize Missing Values
The `isnull().sum()` method calculates the total number of missing values (`NaN`) in each column. This provides a quick overview of where data is missing in the dataset.


In [10]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

### Drop Columns with Excessive Missing Values
If a column has more than 50% missing values, it can be dropped using the `drop()` method. 

- `df.drop("column_name", axis=1)`:
  - `"column_name"`: Replace with the name of the column to be dropped.
  - `axis=1`: Specifies that a column (not a row) is being dropped.
  - The resulting DataFrame will exclude the specified column.


In [None]:
# If the missing values are more than 50% of the actual data then we can drop the column like this
# df = df.drop("column_name",axis = 1)

### Handle Columns with Less Than 50% Missing Values
If missing values in a column are less than 50%, we can handle them by:

1. **Filling with the Mean**:
   - Calculate the mean of the column using `df["column_name"].mean()`.
   - Use `fillna()` to replace missing values with the calculated mean.
   - Example:
     - `mean = round(df["column_name"].mean(), 0)`: Calculates and rounds the mean to the nearest integer.
     - `df["column_name"] = df["column_name"].fillna(value=mean)`: Replaces missing values with the mean.

2. **Dropping Rows**:
   - Alternatively, we can drop rows with missing values using `df.dropna(subset=["column_name"], inplace=True)`. 


In [None]:
# If the missing values are less then 50% of the actual data then we can either drop those rows or assign them mean values
# mean = round(df["column_name"].mean(),0)
# df["column_name"] = df["column_name"].fillna(value = mean)

In [17]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### Encode Categorical Variables:

In [21]:
df["mainroad"].unique()

array(['yes', 'no'], dtype=object)

In [22]:
df["guestroom"].unique()

array(['no', 'yes'], dtype=object)

In [23]:
df["basement"].unique()

array(['no', 'yes'], dtype=object)

In [24]:
df["hotwaterheating"].unique()

array(['no', 'yes'], dtype=object)

In [25]:
df["airconditioning"].unique()

array(['yes', 'no'], dtype=object)

In [26]:
df["prefarea"].unique()

array(['yes', 'no'], dtype=object)

In [27]:
df["furnishingstatus"].unique()

array(['furnished', 'semi-furnished', 'unfurnished'], dtype=object)

### Encode Categorical Variables
The `pd.get_dummies()` function is used to convert categorical variables into dummy/indicator variables.

- `df_encoded = pd.get_dummies(df, drop_first=False)`:
  - Creates dummy variables for all categories of each categorical column.
  - The `drop_first=False` parameter ensures that all categories are encoded, including the first one.

### Impact of `drop_first`
- **When `drop_first=True`**:
  - One dummy variable is dropped for each categorical column to avoid the "dummy variable trap," which occurs when the dummy variables are perfectly collinear with each other.
  - Recommended when using regression models to prevent multicollinearity.
  
- **When `drop_first=False`**:
  - All categories are included, which might be useful for tree-based models (e.g., decision trees, random forests) as they don't rely on linearity or collinearity issues.

### Which to Prefer?
- Use **`drop_first=True`** for regression-based models to reduce redundancy.
- Use **`drop_first=False`** for tree-based or non-linear models where multicollinearity is not an issue. 


In [39]:
df = pd.get_dummies(df, drop_first=True)

In [40]:
df.tail()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
540,1820000,3000,2,1,1,2,True,False,True,False,False,False,False,True
541,1767150,2400,3,1,1,0,False,False,False,False,False,False,True,False
542,1750000,3620,2,1,1,0,True,False,False,False,False,False,False,True
543,1750000,2910,3,1,1,0,False,False,False,False,False,False,False,False
544,1750000,3850,3,1,2,0,True,False,False,False,False,False,False,True


### Splitting Features and Target Variable
1. **Separate Features (X)**:
   - `x = df.drop('price', axis=1)`:
     - Drops the `price` column from the DataFrame.
     - The remaining columns are stored in `x` as the features for model training.

2. **Separate Target (y)**:
   - `y = df['price']`:
     - Extracts the `price` column from the DataFrame.
     - This column represents the target variable we aim to predict.

### Purpose
This step is essential to split the data into independent variables (`x`) and the dependent variable (`y`) for machine learning models.


In [42]:
from sklearn.model_selection import train_test_split

x = df.drop('price', axis=1)
y = df['price']

In [44]:
x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [45]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

### Split Data into Training and Testing Sets
1. **Splitting Data**:
   - `train_test_split` from `sklearn.model_selection` is used to split the data into training and testing sets.
   - Parameters:
     - `x` and `y`: Features and target variable.
     - `test_size=0.2`: Reserves 20% of the data for testing.
     - `random_state=42`: Ensures reproducibility by controlling the randomness of the split.

2. **Output Variables**:
   - `X_train` and `y_train`: Training features and target data.
   - `X_test` and `y_test`: Testing features and target data.

3. **Check Shapes**:
   - `X_train.shape` and `y_train.shape`: Dimensions of training features and target.
   - `X_test.shape` and `y_test.shape`: Dimensions of testing features and target.

### Purpose
Splitting the data ensures that the model is trained on one subset of the data (80%) and evaluated on a separate, unseen subset (20%), enabling an unbiased performance assessment.


In [46]:
# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Check the shapes of the splits
print("Training Features Shape:", X_train.shape)
print("Test Features Shape:", X_test.shape)
print("Training Target Shape:", y_train.shape)
print("Test Target Shape:", y_test.shape)

Training Features Shape: (436, 13)
Test Features Shape: (109, 13)
Training Target Shape: (436,)
Test Target Shape: (109,)


### Scale Features Using StandardScaler
1. **Standardization**:
   - Standardization scales the features so that they have a mean of 0 and a standard deviation of 1.
   - This is essential for algorithms sensitive to feature magnitudes, such as logistic regression, SVMs, and neural networks.

2. **Initialize the Scaler**:
   - `scaler = StandardScaler()`: Creates an instance of the `StandardScaler`.

3. **Fit and Transform Training Data**:
   - `X_train_scaled = scaler.fit_transform(X_train)`:
     - `fit_transform()` calculates the mean and standard deviation from the training data and applies the transformation.
     - Ensures that the scaling parameters are derived only from the training data to avoid data leakage.

4. **Transform Test Data**:
   - `X_test_scaled = scaler.transform(X_test)`:
     - `transform()` applies the same scaling parameters learned from the training data to the test data.

### Purpose
Scaling ensures that all features contribute equally to the model, preventing those with larger magnitudes from dominating the results. Always fit the scaler on training data and use it to transform both training and testing datasets to maintain consistency.


In [50]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Train and Evaluate a Linear Regression Model
1. **Import Required Classes**:
   - `LinearRegression`: Used to create and train a linear regression model.
   - `mean_squared_error`: Used to evaluate the model's performance using the Mean Squared Error (MSE).

2. **Train the Model**:
   - `model = LinearRegression()`: Instantiates a linear regression model.
   - `model.fit(X_train_scaled, y_train)`: Fits the model to the scaled training data.

3. **Make Predictions on Training Data**:
   - `y_train_pred = model.predict(X_train_scaled)`: Predicts target values for the training set using the trained model.

4. **Evaluate the Model**:
   - `mean_squared_error(y_train, y_train_pred)`: Calculates the Mean Squared Error (MSE) between actual (`y_train`) and predicted values (`y_train_pred`).
   - RMSE is the square root of MSE, but here the raw MSE is printed.

5. **Output**:
   - Prints the RMSE (a measure of error; lower values indicate better model performance on training data).


In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_train_pred = model.predict(X_train_scaled)

# Evaluate using RMSE
rmse = mean_squared_error(y_train, y_train_pred)
print(f"RMSE on Training Data: {rmse}")

RMSE on Training Data: 968358188440.7242


### Predict and Compare Actual vs. Predicted Prices
1. **Make Predictions on Test Data**:
   - `y_test_pred = model.predict(X_test_scaled)`:
     - Predicts the target values (`price`) for the test set using the trained model.

2. **Create a Comparison DataFrame**:
   - `comparison_df = pd.DataFrame(...)`:
     - Creates a DataFrame to compare the actual prices (`y_test`) with the predicted prices (`y_test_pred`).
     - Columns:
       - `Actual Price`: The true values of the target variable from the test set.
       - `Predicted Price`: The corresponding predicted values.

3. **Calculate the Difference (Optional)**:
   - `comparison_df['Difference']`:
     - Computes the difference between the actual and predicted prices to analyze the prediction error.

4. **Display the Results**:
   - `print(comparison_df)`:
     - Outputs the comparison DataFrame for inspection of the model's performance on test data.


In [63]:
y_test_pred = model.predict(X_test_scaled)

comparison_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_test_pred
})

# Optionally, add the difference (error)
comparison_df['Difference'] = comparison_df['Actual Price'] - comparison_df['Predicted Price']

# Display the comparison
print(comparison_df)

     Actual Price  Predicted Price    Difference
316       4060000     5.164654e+06 -1.104654e+06
77        6650000     7.224722e+06 -5.747223e+05
360       3710000     3.109863e+06  6.001368e+05
90        6440000     4.612075e+06  1.827925e+06
493       2800000     3.294646e+06 -4.946463e+05
..            ...              ...           ...
15        9100000     4.973331e+06  4.126669e+06
357       3773000     4.336651e+06 -5.636509e+05
39        7910000     7.059063e+06  8.509373e+05
54        7350000     6.398941e+06  9.510591e+05
155       5530000     6.363890e+06 -8.338899e+05

[109 rows x 3 columns]
