In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import PowerTransformer, StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
df = pd.read_parquet(r'C:\Users\aryan\Desktop\Capstone Project\Data Preprocessing New\gurgaon_properties_final_df.parquet')

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
df.sample(1)

Unnamed: 0,Sector,Built Up Area,Bedroom,Bathroom,Balcony,Servant Room,Store Room,Study Room,Floor Num,Total Floor,Property Age,Furnishing,Power Backup,Covered_Parking,Open_Parking,Total Parking,Rating,Nearby,Overlooking,Price
3550,Sector 108,1300,3,3,3,0,0,0,7,24,5 to 10 Year Old,Unfurnished,Full,1,0,1,3.7,Education,Club,3.42


In [5]:
df.shape

(9588, 20)

In [6]:
df['Property Age'].value_counts()

Property Age
1 to 5 Year Old     4606
0 to 1 Year Old     3145
5 to 10 Year Old    1388
10+ Year Old         449
Name: count, dtype: int64

In [7]:
df['Furnishing'].value_counts()

Furnishing
Semi Furnished    7413
Unfurnished       1574
Furnished          601
Name: count, dtype: int64

In [8]:
df['Power Backup'].value_counts()

Power Backup
Full       7224
None       1400
Partial     964
Name: count, dtype: int64

In [9]:
df['Furnishing'].replace({'Furnished': 2, 'Semi Furnished': 1, 'Unfurnished': 0}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Furnishing'].replace({'Furnished': 2, 'Semi Furnished': 1, 'Unfurnished': 0}, inplace= True)
  df['Furnishing'].replace({'Furnished': 2, 'Semi Furnished': 1, 'Unfurnished': 0}, inplace= True)


In [10]:
df['Power Backup'].replace({'Full': 2, 'Partial': 1, 'None': 0}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Power Backup'].replace({'Full': 2, 'Partial': 1, 'None': 0}, inplace= True)
  df['Power Backup'].replace({'Full': 2, 'Partial': 1, 'None': 0}, inplace= True)


In [11]:
new_df = pd.get_dummies(df, columns= ['Sector', 'Property Age', 'Nearby', 'Overlooking'], drop_first= True)

In [12]:
X = new_df.drop(columns= 'Price')
y = new_df[['Price']]

In [13]:
import numpy as np

y_transformer = PowerTransformer(method= 'box-cox')
y_transformed = y_transformer.fit_transform(y)

y_log = np.log1p(y)

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_scaled = pd.DataFrame(X_scaled, columns= X.columns)

In [16]:
kfold = KFold(n_splits= 10, shuffle= True, random_state= 42)
scores = cross_val_score(LinearRegression(), X, y_log, cv= kfold, scoring= 'r2')

In [17]:
scores.mean(), scores.std()

(np.float64(0.8353863871769953), np.float64(0.010679393074162699))

In [18]:
lr = LinearRegression()

In [19]:
lr.fit(X_scaled, y_log)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [20]:
pd.set_option('display.max_rows', None)

In [21]:
pd.DataFrame(lr.coef_.reshape(1, 120), columns= X.columns).stack().reset_index().drop(columns= 'level_0').rename(columns= {'level_1': 'feature', 0: 'coef'})

Unnamed: 0,feature,coef
0,Built Up Area,0.132234
1,Bedroom,0.049385
2,Bathroom,0.031054
3,Balcony,0.016622
4,Servant Room,0.005802
5,Store Room,0.005405
6,Study Room,0.002504
7,Floor Num,-0.004996
8,Total Floor,0.055162
9,Furnishing,0.01641


In [22]:
import statsmodels.api as sm

# Reset indices to ensure alignment
X_scaled_reset = X_scaled.reset_index(drop=True)
y_log_reset = y_log.reset_index(drop=True)

# Add constant to X
X_with_const = sm.add_constant(X_scaled_reset)

# Fit model
model = sm.OLS(y_log_reset, X_with_const).fit()

# Print summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.841
Model:                            OLS   Adj. R-squared:                  0.839
Method:                 Least Squares   F-statistic:                     419.5
Date:                Wed, 30 Jul 2025   Prob (F-statistic):               0.00
Time:                        23:44:32   Log-Likelihood:                 5982.8
No. Observations:                9588   AIC:                        -1.173e+04
Df Residuals:                    9468   BIC:                        -1.087e+04
Df Model:                         119                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [23]:
y_log.std()

Price    0.324731
dtype: float32

In [24]:
X['Furnishing'].std()

np.float64(0.4653710865593283)

In [25]:
transformed_value = 0.016410 * (0.324731/(X['Furnishing'].std()))

In [26]:
transformed_value

np.float64(0.01145072365668048)

In [27]:
value_to_reverse = [[transformed_value]]

In [28]:
# y_transformer.inverse_transform(value_to_reverse)
np.expm1(transformed_value)

np.float64(0.011516534145507535)

In [None]:
model = LinearRegression()
model.fit(X_scaled, y_transformed)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
def simulate_price_change(feature_name, old_value, new_value):
    std = X[feature_name].std()
    if std == 0:
        raise ValueError(f"Standard deviation of {feature_name} is 0.")

    delta_std = (new_value - old_value) / std

    feature_index = X.columns.get_loc(feature_name)
    coef = model.coef_.flatten()[feature_index]

    base_input_df = pd.DataFrame([X_scaled.mean()], columns=X.columns)
    base_price_transformed = model.predict(base_input_df)[0][0]
    new_transformed_price = base_price_transformed + coef * delta_std

    # Wrap the values as DataFrame with correct column name(s)
    y_input_df = pd.DataFrame([[base_price_transformed]], columns=['Price'])
    y_new_df = pd.DataFrame([[new_transformed_price]], columns=['Price'])

    original_price_cr = y_transformer.inverse_transform(y_input_df)[0][0]
    new_price_cr = y_transformer.inverse_transform(y_new_df)[0][0]

    print(f"If '{feature_name}' changes from {old_value} to {new_value}:")
    print(f"Price changes from ₹{original_price_cr:.2f} Cr to ₹{new_price_cr:.2f} Cr")
    print(f"Difference: ₹{(new_price_cr - original_price_cr):.2f} Cr")


In [None]:
simulate_price_change('Built Up Area', old_value=500, new_value=900)

If 'Built Up Area' changes from 500 to 9000:
Price changes from ₹2.39 Cr to ₹11.22 Cr
Difference: ₹8.83 Cr


**Export**

In [50]:
import pandas as pd
import numpy as np
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor

# Load dataset
df = pd.read_parquet(r"C:\Users\aryan\Desktop\Capstone Project\Data Preprocessing New\gurgaon_properties_final_df.parquet")

# Encode ordinal features
df['Furnishing'] = df['Furnishing'].replace({'Furnished': 2, 'Semi Furnished': 1, 'Unfurnished': 0})

df['Power Backup'] = df['Power Backup'].replace({'Full': 2, 'Partial': 1, 'None': 0})


# One-hot encode categorical features
df = pd.get_dummies(df, columns=['Sector', 'Property Age', 'Nearby', 'Overlooking'], drop_first=True)

# Define features and target
X = df.drop(columns='Price')
y = df[['Price']]  # DataFrame format for PowerTransformer

# Build pipeline: Scaling → Regression
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("regressor", LinearRegression())
])

# Wrap model to transform target (Box-Cox)
model = TransformedTargetRegressor(
    regressor=pipeline,
    func=None,  # keep as-is
    inverse_func=None,  # will use .inverse_transform
    transformer=PowerTransformer(method='box-cox')
)

# Fit model
model.fit(X, y)

# Save model
joblib.dump(model, r"C:\Users\aryan\Desktop\Capstone Project\Joblib\linear_pipeline_model.pkl")


  df['Furnishing'] = df['Furnishing'].replace({'Furnished': 2, 'Semi Furnished': 1, 'Unfurnished': 0})
  df['Power Backup'] = df['Power Backup'].replace({'Full': 2, 'Partial': 1, 'None': 0})


['C:\\Users\\aryan\\Desktop\\Capstone Project\\Joblib\\linear_pipeline_model.pkl']