In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
df = pd.read_csv('Pune house data.csv')
df

Unnamed: 0,area_type,availability,size,society,total_sqft,bath,balcony,price,site_location
0,Super built-up Area,19-Dec,2 BHK,Coomee,1056,2.0,1.0,39.07,Alandi Road
1,Plot Area,Ready To Move,4 Bedroom,Theanmp,2600,5.0,3.0,120.00,Ambegaon Budruk
2,Built-up Area,Ready To Move,3 BHK,,1440,2.0,3.0,62.00,Anandnagar
3,Super built-up Area,Ready To Move,3 BHK,Soiewre,1521,3.0,1.0,95.00,Aundh
4,Super built-up Area,Ready To Move,2 BHK,,1200,2.0,1.0,51.00,Aundh Road
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00,Pashan
13316,Super built-up Area,Ready To Move,4 BHK,,3600,5.0,,400.00,Paud Road
13317,Built-up Area,Ready To Move,2 BHK,Mahla T,1141,2.0,1.0,60.00,Pirangut
13318,Super built-up Area,18-Jun,4 BHK,SollyCl,4689,4.0,1.0,488.00,Prabhat Road


In [4]:
df.shape


(13320, 9)

In [5]:
columns_to_drop = ['area_type', 'availability', 'society', 'balcony']
df = df.drop(columns=columns_to_drop)

# Display the updated DataFrame
print(df.head())

        size total_sqft  bath   price    site_location
0      2 BHK       1056   2.0   39.07      Alandi Road
1  4 Bedroom       2600   5.0  120.00  Ambegaon Budruk
2      3 BHK       1440   2.0   62.00       Anandnagar
3      3 BHK       1521   3.0   95.00            Aundh
4      2 BHK       1200   2.0   51.00       Aundh Road


In [6]:
df.isnull().sum()

size             16
total_sqft        0
bath             73
price             0
site_location     1
dtype: int64

In [7]:
duplicates=df[df.duplicated()]
duplicates

Unnamed: 0,size,total_sqft,bath,price,site_location
2227,2 BHK,1050,2.0,43.0,Dehu Road
2989,2 BHK,1180,2.0,42.0,Budhwar Peth
3887,3 BHK,1400,2.0,40.43,Kondhwa Khurd
3914,3 BHK,1355,3.0,83.87,Sadashiv Peth
4016,2 BHK,1075,2.0,60.0,Shaniwar Peth
4171,3 BHK,1645,3.0,95.0,Khadki
4267,4 BHK,4104,4.0,360.0,Khadki
5319,3 BHK,1650,3.0,110.0,Karve Road
5461,3 BHK,1400,2.0,40.44,Swargate
5533,3 BHK,1255,3.0,77.68,Nana Peth


In [8]:
df=df.drop_duplicates()
df

Unnamed: 0,size,total_sqft,bath,price,site_location
0,2 BHK,1056,2.0,39.07,Alandi Road
1,4 Bedroom,2600,5.0,120.00,Ambegaon Budruk
2,3 BHK,1440,2.0,62.00,Anandnagar
3,3 BHK,1521,3.0,95.00,Aundh
4,2 BHK,1200,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,5 Bedroom,3453,4.0,231.00,Pashan
13316,4 BHK,3600,5.0,400.00,Paud Road
13317,2 BHK,1141,2.0,60.00,Pirangut
13318,4 BHK,4689,4.0,488.00,Prabhat Road


In [9]:
df.rename(columns={'site_location': 'location','total_sqft': 'Area','bath': 'Bathroom','price': 'Price','size':'BHK'}, inplace=True)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2 BHK,1056,2.0,39.07,Alandi Road
1,4 Bedroom,2600,5.0,120.00,Ambegaon Budruk
2,3 BHK,1440,2.0,62.00,Anandnagar
3,3 BHK,1521,3.0,95.00,Aundh
4,2 BHK,1200,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,5 Bedroom,3453,4.0,231.00,Pashan
13316,4 BHK,3600,5.0,400.00,Paud Road
13317,2 BHK,1141,2.0,60.00,Pirangut
13318,4 BHK,4689,4.0,488.00,Prabhat Road


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13263 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BHK       13247 non-null  object 
 1   Area      13263 non-null  object 
 2   Bathroom  13190 non-null  float64
 3   Price     13263 non-null  float64
 4   location  13262 non-null  object 
dtypes: float64(2), object(3)
memory usage: 621.7+ KB


In [11]:
df['BHK']

0            2 BHK
1        4 Bedroom
2            3 BHK
3            3 BHK
4            2 BHK
           ...    
13315    5 Bedroom
13316        4 BHK
13317        2 BHK
13318        4 BHK
13319        1 BHK
Name: BHK, Length: 13263, dtype: object

In [12]:
value_to_keep = df['BHK'].iloc[0]

# Assign this value to all rows in the 'BHK' column
df['BHK'] = value_to_keep
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2 BHK,1056,2.0,39.07,Alandi Road
1,2 BHK,2600,5.0,120.00,Ambegaon Budruk
2,2 BHK,1440,2.0,62.00,Anandnagar
3,2 BHK,1521,3.0,95.00,Aundh
4,2 BHK,1200,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,2 BHK,3453,4.0,231.00,Pashan
13316,2 BHK,3600,5.0,400.00,Paud Road
13317,2 BHK,1141,2.0,60.00,Pirangut
13318,2 BHK,4689,4.0,488.00,Prabhat Road


In [13]:
df['BHK']=df['BHK'].str.split().str.get(0).astype(int)

In [14]:
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056,2.0,39.07,Alandi Road
1,2,2600,5.0,120.00,Ambegaon Budruk
2,2,1440,2.0,62.00,Anandnagar
3,2,1521,3.0,95.00,Aundh
4,2,1200,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,2,3453,4.0,231.00,Pashan
13316,2,3600,5.0,400.00,Paud Road
13317,2,1141,2.0,60.00,Pirangut
13318,2,4689,4.0,488.00,Prabhat Road


In [15]:
def handle_area_range(area_str):
    """
    This function handles area values given as ranges.
    It splits the range string, converts the values to float,
    and returns the average. If the string is not a range,
    it attempts to convert it directly to float.
    """
    if '-' in area_str:
        lower, upper = area_str.split('-')
        return (float(lower.strip()) + float(upper.strip())) / 2
    else:
        try:
            return float(area_str)
        except ValueError:
            return float('nan') # Handle cases where conversion still fails

# Apply the function to the 'Area' column after replacing 'Sq. Meter'
df['Area'] = df['Area'].str.replace('Sq. Meter', '').str.strip()
df['Area'] = df['Area'].apply(handle_area_range)

In [16]:
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056.0,2.0,39.07,Alandi Road
1,2,2600.0,5.0,120.00,Ambegaon Budruk
2,2,1440.0,2.0,62.00,Anandnagar
3,2,1521.0,3.0,95.00,Aundh
4,2,1200.0,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,Pashan
13316,2,3600.0,5.0,400.00,Paud Road
13317,2,1141.0,2.0,60.00,Pirangut
13318,2,4689.0,4.0,488.00,Prabhat Road


In [17]:
df.isnull().sum()

BHK          0
Area        29
Bathroom    73
Price        0
location     1
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13263 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BHK       13263 non-null  int32  
 1   Area      13234 non-null  float64
 2   Bathroom  13190 non-null  float64
 3   Price     13263 non-null  float64
 4   location  13262 non-null  object 
dtypes: float64(3), int32(1), object(1)
memory usage: 569.9+ KB


In [19]:
df.describe()

Unnamed: 0,BHK,Area,Bathroom,Price
count,13263.0,13234.0,13190.0,13263.0
mean,2.0,1559.410578,2.694011,112.747361
std,0.0,1240.535213,1.343246,149.219924
min,2.0,1.0,1.0,8.0
25%,2.0,1100.0,2.0,50.0
50%,2.0,1277.0,2.0,72.0
75%,2.0,1680.0,3.0,120.0
max,2.0,52272.0,40.0,3600.0


In [20]:
df["Area"].fillna(1559.410578, inplace=True)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056.0,2.0,39.07,Alandi Road
1,2,2600.0,5.0,120.00,Ambegaon Budruk
2,2,1440.0,2.0,62.00,Anandnagar
3,2,1521.0,3.0,95.00,Aundh
4,2,1200.0,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,Pashan
13316,2,3600.0,5.0,400.00,Paud Road
13317,2,1141.0,2.0,60.00,Pirangut
13318,2,4689.0,4.0,488.00,Prabhat Road


In [21]:
mode = df["Bathroom"].mode()
mode

0    2.0
Name: Bathroom, dtype: float64

In [22]:
df["Bathroom"].fillna(2.0, inplace=True)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056.0,2.0,39.07,Alandi Road
1,2,2600.0,5.0,120.00,Ambegaon Budruk
2,2,1440.0,2.0,62.00,Anandnagar
3,2,1521.0,3.0,95.00,Aundh
4,2,1200.0,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,Pashan
13316,2,3600.0,5.0,400.00,Paud Road
13317,2,1141.0,2.0,60.00,Pirangut
13318,2,4689.0,4.0,488.00,Prabhat Road


In [23]:
mode_location = df["location"].mode()
mode_location

0                    Alandi Road
1                     Anandnagar
2                          Aundh
3                   Balaji Nagar
4                          Baner
5                     Baner road
6                Bhandarkar Road
7                   Bhavani Peth
8                         Bopodi
9               Bund Garden Road
10                 Chandan Nagar
11               Deccan Gymkhana
12                 Dhayari Phata
13                  Fatima Nagar
14                   Ganesh Peth
15                   Ganeshkhind
16                 Ghorpade Peth
17                 Gokhale Nagar
18                      Gultekdi
19    Hadapsar Industrial Estate
20          Jangali Maharaj Road
21                    Kasba Peth
22                       Khadaki
23                       Kharadi
24                       Kondhwa
25                Kondhwa Budruk
26                       Kothrud
27              Law College Road
28                   Lulla Nagar
29                Mangalwar peth
30        

In [24]:
df["location"].fillna("Alandi Road", inplace=True)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056.0,2.0,39.07,Alandi Road
1,2,2600.0,5.0,120.00,Ambegaon Budruk
2,2,1440.0,2.0,62.00,Anandnagar
3,2,1521.0,3.0,95.00,Aundh
4,2,1200.0,2.0,51.00,Aundh Road
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,Pashan
13316,2,3600.0,5.0,400.00,Paud Road
13317,2,1141.0,2.0,60.00,Pirangut
13318,2,4689.0,4.0,488.00,Prabhat Road


In [25]:
df.isnull().sum()

BHK         0
Area        0
Bathroom    0
Price       0
location    0
dtype: int64

In [26]:
location_mapping = {
    'Alandi Road': 1, 'Ambegaon Budruk': 2, 'Anandnagar': 3, 'Aundh': 4,
    'Aundh Road': 5, 'Balaji Nagar': 6, 'Baner': 7, 'Baner road': 8,
    'Bhandarkar Road': 9, 'Bhavani Peth': 10, 'Bibvewadi': 11, 'Bopodi': 12,
    'Borivali': 13, 'Borivali East': 14, 'Borivali West': 15, 'Bhosari': 16,
    'Budhwar Peth': 17, 'Bund Garden Road': 18, 'Camp': 19, 'Chandan Nagar': 20,
    'Chandkheda': 21, 'Chandkheda East': 22, 'Chandkheda West': 23,
    'Dapodi': 24, 'Deccan Gymkhana': 25, 'Dehu Road': 26, 'Dhankawadi': 27,
    'Dhayari Phata': 28, 'Dhole Patil Road': 29, 'Erandwane': 30, 'Fatima Nagar': 31,
    'Fergusson College Road': 32, 'Ganesh Peth': 33, 'Ganeshkhind': 34, 'Ghansopara': 35,
    'Ghorpade Peth': 36, 'other': 37, 'Gokhale Nagar': 38, 'Gultekdi': 39, 'Hadapsar': 40,
    'Hadapsar Industrial Estate': 41, 'Hingne Khurd': 42, 'Jangali Maharaj Road': 43,
    'Kalyani Nagar': 44, 'Karve Nagar': 45, 'Karve Road': 46, 'Kasba Peth': 47,
    'Katraj': 48, 'Khadaki': 49, 'Kharadi': 50, 'Kondhwa': 51, 'Kondhwa Budruk': 52,
    'Koregaon Park': 53, 'Kothrud': 54, 'Law College Road': 55, 'Laxmi Road': 56,
    'Lulla Nagar': 57, 'Mahatma Gandhi Road': 58, 'Mangalwar peth': 59,
    'Manik Bagh': 60, 'Market yard': 61, 'Model colony': 62, 'Mukund Nagar': 63,
    'Mulund': 64, 'Mulund East': 65, 'Mulund West': 66, 'Nagar Road': 67,
    'Mundhawa': 68, 'Nana Peth': 69, 'Narayan Peth': 70, 'Narayangaon': 71,
    'Navi Peth': 72, 'Padmavati': 73, 'Parvati Darshan': 74, 'Pashan': 75,
    'Paud Road': 76, 'Pirangut': 77, 'Prabhat Road': 78, 'Pune Railway Station': 79,
    'Rasta Peth': 80, 'Raviwar Peth': 81, 'Sadashiv Peth': 82, 'Sahakar Nagar': 83,
    'Salunke Vihar': 84, 'Sasson Road': 85, 'Satara Road': 86,
    'Senapati Bapat Road': 87, 'Shaniwar Peth': 88, 'Shivaji Nagar': 89,
    'Shukrawar Peth': 90, 'Sinhagad Road': 91, 'Somwar Peth': 92, 'Swargate': 93,
    'Tilak Road': 94, 'Uruli Devachi': 95, 'Vadgaon Budruk': 96, 'Vadgaon Kasba': 97,
    'Wadgaon Sheri': 98, 'Viman Nagar': 99, 'Vishrant Wadi': 100, 'Wagholi': 101,
    'Wakadewadi': 102, 'Wanowrie': 103, 'Warje': 104, 'Yerawada': 105, 'Ghorpadi': 106
}

# Apply location encoding to the dataframe
df['location_encoding'] = df['location'].map(location_mapping)
df[['location', 'location_encoding']]


Unnamed: 0,location,location_encoding
0,Alandi Road,1.0
1,Ambegaon Budruk,2.0
2,Anandnagar,3.0
3,Aundh,4.0
4,Aundh Road,5.0
...,...,...
13315,Pashan,75.0
13316,Paud Road,76.0
13317,Pirangut,77.0
13318,Prabhat Road,78.0


In [27]:
df

Unnamed: 0,BHK,Area,Bathroom,Price,location,location_encoding
0,2,1056.0,2.0,39.07,Alandi Road,1.0
1,2,2600.0,5.0,120.00,Ambegaon Budruk,2.0
2,2,1440.0,2.0,62.00,Anandnagar,3.0
3,2,1521.0,3.0,95.00,Aundh,4.0
4,2,1200.0,2.0,51.00,Aundh Road,5.0
...,...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,Pashan,75.0
13316,2,3600.0,5.0,400.00,Paud Road,76.0
13317,2,1141.0,2.0,60.00,Pirangut,77.0
13318,2,4689.0,4.0,488.00,Prabhat Road,78.0


In [28]:
df = df.drop(['location'], axis=1)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location_encoding
0,2,1056.0,2.0,39.07,1.0
1,2,2600.0,5.0,120.00,2.0
2,2,1440.0,2.0,62.00,3.0
3,2,1521.0,3.0,95.00,4.0
4,2,1200.0,2.0,51.00,5.0
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,75.0
13316,2,3600.0,5.0,400.00,76.0
13317,2,1141.0,2.0,60.00,77.0
13318,2,4689.0,4.0,488.00,78.0


In [29]:
df.rename(columns={'location_encoding': 'location'}, inplace=True)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056.0,2.0,39.07,1.0
1,2,2600.0,5.0,120.00,2.0
2,2,1440.0,2.0,62.00,3.0
3,2,1521.0,3.0,95.00,4.0
4,2,1200.0,2.0,51.00,5.0
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,75.0
13316,2,3600.0,5.0,400.00,76.0
13317,2,1141.0,2.0,60.00,77.0
13318,2,4689.0,4.0,488.00,78.0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13263 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BHK       13263 non-null  int32  
 1   Area      13263 non-null  float64
 2   Bathroom  13263 non-null  float64
 3   Price     13263 non-null  float64
 4   location  12853 non-null  float64
dtypes: float64(4), int32(1)
memory usage: 569.9 KB


In [31]:
df.isnull().sum()

BHK           0
Area          0
Bathroom      0
Price         0
location    410
dtype: int64

In [32]:
df["location"].fillna(2.0, inplace=True)
df

Unnamed: 0,BHK,Area,Bathroom,Price,location
0,2,1056.0,2.0,39.07,1.0
1,2,2600.0,5.0,120.00,2.0
2,2,1440.0,2.0,62.00,3.0
3,2,1521.0,3.0,95.00,4.0
4,2,1200.0,2.0,51.00,5.0
...,...,...,...,...,...
13315,2,3453.0,4.0,231.00,75.0
13316,2,3600.0,5.0,400.00,76.0
13317,2,1141.0,2.0,60.00,77.0
13318,2,4689.0,4.0,488.00,78.0


In [33]:
df.isnull().sum()

BHK         0
Area        0
Bathroom    0
Price       0
location    0
dtype: int64

In [34]:
df.to_csv("Pune_Cleaned_data.csv")

In [35]:
output_data=df["Price"]
output_data

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13315    231.00
13316    400.00
13317     60.00
13318    488.00
13319     17.00
Name: Price, Length: 13263, dtype: float64

In [36]:
input_data=df.drop('Price', axis=1)
input_data

Unnamed: 0,BHK,Area,Bathroom,location
0,2,1056.0,2.0,1.0
1,2,2600.0,5.0,2.0
2,2,1440.0,2.0,3.0
3,2,1521.0,3.0,4.0
4,2,1200.0,2.0,5.0
...,...,...,...,...
13315,2,3453.0,4.0,75.0
13316,2,3600.0,5.0,76.0
13317,2,1141.0,2.0,77.0
13318,2,4689.0,4.0,78.0


In [37]:
X_train,x_test,Y_train,y_test=train_test_split(input_data,output_data,test_size=0.20,random_state=42)

In [38]:
X_train.shape

(10610, 4)

In [39]:
x_test.shape

(2653, 4)

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [41]:
models = {
    "                   Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso()
    }

for name, model in models.items():
    model.fit(X_train, Y_train)
    print(name + " trained.")

                   Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.


In [84]:
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(x_test, y_test)))

                   Linear Regression R^2 Score: 0.41978
 Linear Regression (L2 Regularization) R^2 Score: 0.41979
 Linear Regression (L1 Regularization) R^2 Score: 0.42037


In [88]:
import joblib

In [90]:
joblib.dump(model,"pune_predictor")

['pune_predictor']