In [3]:
# TASK 2
# (1) Load and Explore
# Read the house-price CSV into a DataFrame and print it for a first look.
import pandas as pd
import matplotlib.pyplot as plt

house_price_csv = "C:/Users/gupta/Downloads/house_price.csv"
df = pd.read_csv(house_price_csv)
print(df)

    Size  Location  Number of Rooms   Price
0   1360     rural                1  183127
1   4272     rural                9  511492
2   3592  suburban                6  558690
3    966  suburban                3  202414
4   4926     urban                4  861712
..   ...       ...              ...     ...
95  1528  suburban                9  325535
96  1002  suburban                5  271338
97  4993  suburban                8  735535
98  1370     rural                1  152535
99  4988     rural                5  522426

[100 rows x 4 columns]


In [5]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Report which scikit-learn release the kernel imports.
import sklearn

installed_sklearn_version = sklearn.__version__
print(installed_sklearn_version)

1.5.1


In [7]:
# (2) Normalize Numerical Data:

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

# Re-read the raw data so the scaled frames are derived from the file contents.
df = pd.read_csv("C:/Users/gupta/Downloads/house_price.csv")

# Only these columns are rescaled; every other column is carried over unchanged.
features_to_scale = ['Size', 'Number of Rooms']
numeric_block = df[features_to_scale]

# Min-Max scaling, written into its own copy of the original frame.
min_max_scaler = MinMaxScaler()
min_max_scaled_df = df.copy()
min_max_scaled_df[features_to_scale] = min_max_scaler.fit_transform(numeric_block)

# Standardization (z-score), written into a second independent copy.
standard_scaler = StandardScaler()
standardized_df = df.copy()
standardized_df[features_to_scale] = standard_scaler.fit_transform(numeric_block)

# Show the first rows of each variant under its heading.
for heading, scaled_frame in (
    ("Min-Max Scaled Data:", min_max_scaled_df),
    ("\nStandardized Data:", standardized_df),
):
    print(heading)
    print(scaled_frame.head())

Min-Max Scaled Data:
       Size  Location  Number of Rooms   Price
0  0.185243     rural            0.000  183127
1  0.838305     rural            1.000  511492
2  0.685804  suburban            0.625  558690
3  0.096883  suburban            0.250  202414
4  0.984974     urban            0.375  861712

Standardized Data:
       Size  Location  Number of Rooms   Price
0 -1.031135     rural        -1.391324  183127
1  1.261081     rural         1.625097  511492
2  0.725811  suburban         0.493939  558690
3 -1.341277  suburban        -0.637219  202414
4  1.775885     urban        -0.260166  861712


In [9]:
# (3) Encode Categorical Features:
# 1. One-Hot Encoding (for non-ordinal categories like urban, suburban, rural):
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes fit_transform return a dense array that can be
# wrapped in a DataFrame directly.
one_hot_encoder = OneHotEncoder(sparse_output=False)

location_encoded = one_hot_encoder.fit_transform(df[['Location']])

encoded_columns = one_hot_encoder.get_feature_names_out(['Location'])
# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# RangeIndex here would misalign rows (and introduce NaNs) whenever df does not
# carry the default 0..n-1 index, e.g. after filtering or shuffling.
location_encoded_df = pd.DataFrame(
    location_encoded, columns=encoded_columns, index=df.index
)

# Combine with the rest of the data
df_one_hot = pd.concat([df.drop(columns='Location'), location_encoded_df], axis=1)
df_one_hot.head()


Unnamed: 0,Size,Number of Rooms,Price,Location_rural,Location_suburban,Location_urban
0,1360,1,183127,1.0,0.0,0.0
1,4272,9,511492,1.0,0.0,0.0
2,3592,6,558690,0.0,1.0,0.0
3,966,3,202414,0.0,1.0,0.0
4,4926,4,861712,0.0,0.0,1.0


In [11]:
# (3) Encode Categorical Features:
# 2. Label Encoding (if Location were ordinal):
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Location column and get one integer code per row.
label_encoder = LabelEncoder()
location_codes = label_encoder.fit_transform(df['Location'])

# assign() returns a new frame, so the original df stays untouched.
df_label_encoded = df.assign(Location_Label=location_codes)
print(df_label_encoded.loc[:, ['Location', 'Location_Label']].head())

   Location  Location_Label
0     rural               0
1     rural               0
2  suburban               1
3  suburban               1
4     urban               2


In [15]:
# TASK 3
# Analyze Predictors:
# Compute correlation matrix
import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/gupta/Downloads/house_price.csv")

# Inspect data types.
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
print("Data types before handling categorical variables:")
df.info()

# Identify categorical columns
categorical_cols = df.select_dtypes(include='object').columns

# Apply one-hot encoding to categorical columns.
# drop_first=True omits the first level of each categorical column, so that
# level gets no indicator column (and no correlation entry) of its own.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Calculate the correlation matrix for the encoded DataFrame
correlation_matrix = df_encoded.corr()

# Get correlations with the 'Price' variable, strongest positive first.
# The series still contains 'Price' itself (correlation 1.0).
price_correlations = correlation_matrix['Price'].sort_values(ascending=False)

# Display the correlations
print("\nCorrelations with 'Price' variable after one-hot encoding:")
print(price_correlations)

# Identify low-impact predictors (e.g., absolute correlation less than 0.1)
low_impact_threshold = 0.1
low_impact_predictors = price_correlations[
    price_correlations.abs() < low_impact_threshold
].index.tolist()

print(f"\nLow-impact predictors (absolute correlation < {low_impact_threshold}):")
print(low_impact_predictors)

Data types before handling categorical variables:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Size             100 non-null    int64 
 1   Location         100 non-null    object
 2   Number of Rooms  100 non-null    int64 
 3   Price            100 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 3.3+ KB
None

Correlations with 'Price' variable after one-hot encoding:
Price                1.000000
Size                 0.805485
Location_urban       0.371568
Number of Rooms      0.160887
Location_suburban   -0.022048
Name: Price, dtype: float64

Low-impact predictors (absolute correlation < 0.1):
['Location_suburban']


In [17]:
# TASK 4
# Model Training
# Train-Test Split:
import pandas as pd
from sklearn.model_selection import train_test_split

# Start again from the raw file and one-hot encode every object-typed column.
df = pd.read_csv("C:/Users/gupta/Downloads/house_price.csv")
categorical_cols = df.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Target is 'Price'; every remaining column is a feature.
y = df_encoded['Price']
X = df_encoded.drop(columns='Price')

# 80/20 split; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece.
for caption, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, part.shape)

# Write each piece to its own CSV (without the index) for the later cells.
for filename, part in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    part.to_csv(filename, index=False)

Shape of X_train: (80, 4)
Shape of X_test: (20, 4)
Shape of y_train: (80,)
Shape of y_test: (20,)


In [19]:
# Train a Linear Regression Model:
from sklearn.linear_model import LinearRegression
import pandas as pd

# Read the training split back from the CSV files written by the split step.
train_features_file = 'X_train.csv'
train_target_file = 'y_train.csv'
X_train = pd.read_csv(train_features_file)
# squeeze() collapses the single-column target frame into a Series.
y_train = pd.read_csv(train_target_file).squeeze()

# Create the estimator, then fit it on the training data.
model = LinearRegression()
model.fit(X_train, y_train)

print("Linear Regression model has been successfully fitted to the training data.")

Linear Regression model has been successfully fitted to the training data.


In [21]:
# TASK 5
# Evaluation Metrics:
#RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Read the saved train/test splits back from disk; squeeze() collapses each
# single-column target file into a Series.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv').squeeze()

# Re-fit here so the cell works even if no fitted model is left in the kernel.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows: RMSE is the square root of the mean squared error.
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)

print(f"Root Mean Square Error (RMSE): {rmse:.2f}")

Root Mean Square Error (RMSE): 34338.50


In [23]:
# R² (Coefficient of Determination) 
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the feature frames and the target series saved by the split step.
# The model is rebuilt below, so this cell does not rely on one kept in memory.
X_train, X_test = (pd.read_csv(name) for name in ('X_train.csv', 'X_test.csv'))
y_train, y_test = (
    pd.read_csv(name).squeeze() for name in ('y_train.csv', 'y_test.csv')
)

# Fit a fresh linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the test split and compare against the true prices.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"R-squared (Coefficient of Determination): {r2:.2f}")

R-squared (Coefficient of Determination): 0.96
