In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Location of the rent dataset, written as a raw string so the backslashes
# are not interpreted as escape sequences.
# NOTE(review): this is an absolute, user-specific Windows path, so the
# notebook will not run on another machine without editing it.
csv_file = r'C:\Users\jerom\Machine Learning\Lab Ex 1\House_Rent_Dataset.csv'
# Load the CSV into the working DataFrame used by every later cell.
data = pd.read_csv(csv_file)

# Preprocessing

In [3]:
# Display the frame exactly as loaded, before any cleaning.
data

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2022-05-18,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner
4742,2022-05-15,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner
4743,2022-07-10,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent
4744,2022-07-06,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent


#### Dropping Irrelevant Features

In [4]:
# Remove the columns that are not kept as model features. All three are
# dropped in a single call; DataFrame.drop returns a new frame, so `data`
# is rebound to the reduced one.
dropped_columns = ['Posted On', 'Area Locality', 'Point of Contact']
data = data.drop(columns=dropped_columns)

# Report every column that was removed (the previous message named only one).
print(f"\nAfter dropping columns {dropped_columns}:")
print(data)


After dropping 'Posted On' column:
      BHK   Rent  Size            Floor    Area Type       City  \
0       2  10000  1100  Ground out of 2   Super Area    Kolkata   
1       2  20000   800       1 out of 3   Super Area    Kolkata   
2       2  17000  1000       1 out of 3   Super Area    Kolkata   
3       2  10000   800       1 out of 2   Super Area    Kolkata   
4       2   7500   850       1 out of 2  Carpet Area    Kolkata   
...   ...    ...   ...              ...          ...        ...   
4741    2  15000  1000       3 out of 5  Carpet Area  Hyderabad   
4742    3  29000  2000       1 out of 4   Super Area  Hyderabad   
4743    3  35000  1750       3 out of 5  Carpet Area  Hyderabad   
4744    3  45000  1500     23 out of 34  Carpet Area  Hyderabad   
4745    2  15000  1000       4 out of 5  Carpet Area  Hyderabad   

     Furnishing Status  Tenant Preferred  Bathroom  
0          Unfurnished  Bachelors/Family         2  
1       Semi-Furnished  Bachelors/Family         1  


#### Fixing Floors Format

In [5]:
# Hand-patch the 'Floor' entry of the rows labelled 105 and 161 so that both
# read "<floor> out of <total>".
# NOTE(review): the previous values of these two rows are not shown in the
# notebook; presumably they were malformed. Confirm against the raw CSV.
data.loc[105, 'Floor'] = '5 out of 8'
data.loc[161, 'Floor'] = '1 out of 2'

In [6]:
# Inspect the patched row by *label*: the patch was written with a
# label-based accessor, so a positional lookup would only show the same row
# while the index is still the default 0..n-1 range.
data.loc[105]

BHK                                 1
Rent                             6000
Size                              600
Floor                      5 out of 8
Area Type                 Carpet Area
City                          Kolkata
Furnishing Status         Unfurnished
Tenant Preferred     Bachelors/Family
Bathroom                            1
Name: 105, dtype: object

In [7]:
# Inspect the patched row by *label*: the patch was written with a
# label-based accessor, so a positional lookup would only show the same row
# while the index is still the default 0..n-1 range.
data.loc[161]

BHK                                 2
Rent                            10000
Size                              450
Floor                      1 out of 2
Area Type                 Carpet Area
City                          Kolkata
Furnishing Status      Semi-Furnished
Tenant Preferred     Bachelors/Family
Bathroom                            2
Name: 161, dtype: object

In [8]:
def normalize_floor(floor_value):
    """Rewrite one raw 'Floor' string into "<floor> out of <total>" form.

    Rules, first match wins:
      * "Upper Basement" is replaced by "-1", "Lower Basement" by "-2".
      * "Ground out of N" becomes "1 out of N".
      * A bare "Ground" becomes "1 out of 1".
      * A bare integer "N" (no total given) becomes "N out of N".
      * Any other value is returned unchanged.

    Parameters
    ----------
    floor_value : str
        A single value of the 'Floor' column.

    Returns
    -------
    str
        The normalised floor description.
    """
    if "Upper Basement" in floor_value:
        return floor_value.replace("Upper Basement", "-1")
    if "Lower Basement" in floor_value:
        return floor_value.replace("Lower Basement", "-2")
    if "Ground out" in floor_value:
        # NOTE(review): ground floor is encoded as 1, the same number used
        # for a first floor; kept as-is so downstream results do not change.
        return floor_value.replace("Ground", "1")
    if floor_value == "Ground":
        return "1 out of 1"
    if floor_value.isdecimal():
        # Only the floor number is known: treat it as the top floor. This
        # covers every bare number, not just the "1" and "3" seen so far.
        return f"{floor_value} out of {floor_value}"
    return floor_value


# Element-wise mapping keeps each result on its own row regardless of the
# index labels (pairing enumerate() positions with label-based writes is only
# correct for a default 0..n-1 index).
data['Floor'] = data['Floor'].map(normalize_floor)
data

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom
0,2,10000,1100,1 out of 2,Super Area,Kolkata,Unfurnished,Bachelors/Family,2
1,2,20000,800,1 out of 3,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
2,2,17000,1000,1 out of 3,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
3,2,10000,800,1 out of 2,Super Area,Kolkata,Unfurnished,Bachelors/Family,1
4,2,7500,850,1 out of 2,Carpet Area,Kolkata,Unfurnished,Bachelors,1
...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,3 out of 5,Carpet Area,Hyderabad,Semi-Furnished,Bachelors/Family,2
4742,3,29000,2000,1 out of 4,Super Area,Hyderabad,Semi-Furnished,Bachelors/Family,3
4743,3,35000,1750,3 out of 5,Carpet Area,Hyderabad,Semi-Furnished,Bachelors/Family,3
4744,3,45000,1500,23 out of 34,Carpet Area,Hyderabad,Semi-Furnished,Family,2


In [9]:
# Sanity check: after normalisation every 'Floor' value should be an
# optionally negative integer, the words " out of ", then an integer.
pattern = r'^-?\d+ out of \d+$'

# Boolean mask of the rows whose 'Floor' text matches the expected shape.
matches_pattern = data['Floor'].str.match(pattern)

# Keep the offending rows and list their distinct 'Floor' values;
# an empty array means the column is fully normalised.
records_not_following_pattern = data.loc[~matches_pattern]
unique_values_not_following_pattern = records_not_following_pattern['Floor'].unique()
print(unique_values_not_following_pattern)

[]


In [10]:
# Frequency of each distinct normalised 'Floor' string.
data['Floor'].value_counts()

Floor
1 out of 2      730
1 out of 3      502
1 out of 1      332
1 out of 4      315
2 out of 3      312
               ... 
5 out of 21       1
32 out of 59      1
20 out of 32      1
10 out of 37      1
23 out of 34      1
Name: count, Length: 460, dtype: int64

In [11]:

# Define a function to perform the conversion
def convert_floor(row):
    """Turn a "<floor> out of <total>" string into the ratio floor / total.

    Parameters
    ----------
    row : str
        One 'Floor' value such as "3 out of 5". Despite the name, this is a
        single cell value, not a whole DataFrame row.

    Returns
    -------
    float
        floor / total, rounded to 4 decimal places.
    """
    # Everything before the separator is the unit's floor, after it the total.
    parts = row.split(" out of ")
    numerator, denominator = int(parts[0]), int(parts[1])

    # True division always yields a float; keep four decimals.
    return round(numerator / denominator, 4)

# Replace each "<floor> out of <total>" string in 'Floor' with its numeric
# ratio, one element at a time.
data['Floor'] = data['Floor'].map(convert_floor)

# Show the frame with the converted column.
print(data)


      BHK   Rent  Size   Floor    Area Type       City Furnishing Status  \
0       2  10000  1100  0.5000   Super Area    Kolkata       Unfurnished   
1       2  20000   800  0.3333   Super Area    Kolkata    Semi-Furnished   
2       2  17000  1000  0.3333   Super Area    Kolkata    Semi-Furnished   
3       2  10000   800  0.5000   Super Area    Kolkata       Unfurnished   
4       2   7500   850  0.5000  Carpet Area    Kolkata       Unfurnished   
...   ...    ...   ...     ...          ...        ...               ...   
4741    2  15000  1000  0.6000  Carpet Area  Hyderabad    Semi-Furnished   
4742    3  29000  2000  0.2500   Super Area  Hyderabad    Semi-Furnished   
4743    3  35000  1750  0.6000  Carpet Area  Hyderabad    Semi-Furnished   
4744    3  45000  1500  0.6765  Carpet Area  Hyderabad    Semi-Furnished   
4745    2  15000  1000  0.8000  Carpet Area  Hyderabad       Unfurnished   

      Tenant Preferred  Bathroom  
0     Bachelors/Family         2  
1     Bachelors/F

In [12]:
# Summary statistics of the floor ratio. Basement floors were encoded as
# negative numbers, so values below 0 are possible.
data['Floor'].describe()

count    4746.000000
mean        0.555319
std         0.270375
min        -2.000000
25%         0.333300
50%         0.500000
75%         0.750000
max         1.000000
Name: Floor, dtype: float64

In [13]:
# Display the frame now that 'Floor' holds the numeric ratio.
data

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom
0,2,10000,1100,0.5000,Super Area,Kolkata,Unfurnished,Bachelors/Family,2
1,2,20000,800,0.3333,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
2,2,17000,1000,0.3333,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
3,2,10000,800,0.5000,Super Area,Kolkata,Unfurnished,Bachelors/Family,1
4,2,7500,850,0.5000,Carpet Area,Kolkata,Unfurnished,Bachelors,1
...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,0.6000,Carpet Area,Hyderabad,Semi-Furnished,Bachelors/Family,2
4742,3,29000,2000,0.2500,Super Area,Hyderabad,Semi-Furnished,Bachelors/Family,3
4743,3,35000,1750,0.6000,Carpet Area,Hyderabad,Semi-Furnished,Bachelors/Family,3
4744,3,45000,1500,0.6765,Carpet Area,Hyderabad,Semi-Furnished,Family,2


#### Label Encoding of Categorical Features

In [14]:
# Integer codes for each categorical column. Codes follow alphabetical order
# except for 'Tenant Preferred', where the combined 'Bachelors/Family'
# category is placed last.
# NOTE(review): these integers impose an ordering on nominal categories such
# as 'City'; consider one-hot encoding if that ordering is not intended.
CATEGORY_CODES = {
    'Area Type': {'Built Area': 1, 'Carpet Area': 2, 'Super Area': 3},
    'City': {'Bangalore': 1, 'Chennai': 2, 'Delhi': 3, 'Hyderabad': 4, 'Kolkata': 5, 'Mumbai': 6},
    'Furnishing Status': {'Furnished': 1, 'Semi-Furnished': 2, 'Unfurnished': 3},
    'Tenant Preferred': {'Bachelors': 1, 'Family': 2, 'Bachelors/Family': 3},
}


def encode_categories(frame, category_codes):
    """Return a copy of ``frame`` with categorical columns replaced by codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column named in ``category_codes``.
    category_codes : dict[str, dict]
        Maps a column name to its ``{category: integer code}`` dictionary.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    ValueError
        If a column holds a value that has no code. This also catches
        re-running the cell on a column that is already encoded.
    """
    encoded = frame.copy()
    for column, codes in category_codes.items():
        mapped = frame[column].map(codes)
        # Series.map yields NaN for values absent from the dict; fail loudly
        # instead of letting NaN flow silently into the model.
        missing = frame.loc[mapped.isna(), column].unique()
        if len(missing) > 0:
            raise ValueError(
                f"Column {column!r} contains values without a code: {list(missing)}"
            )
        encoded[column] = mapped
    return encoded


data = encode_categories(data, CATEGORY_CODES)

# Displaying the resulting DataFrame
print(data)

      BHK   Rent  Size   Floor  Area Type  City  Furnishing Status  \
0       2  10000  1100  0.5000          3     5                  3   
1       2  20000   800  0.3333          3     5                  2   
2       2  17000  1000  0.3333          3     5                  2   
3       2  10000   800  0.5000          3     5                  3   
4       2   7500   850  0.5000          2     5                  3   
...   ...    ...   ...     ...        ...   ...                ...   
4741    2  15000  1000  0.6000          2     4                  2   
4742    3  29000  2000  0.2500          3     4                  2   
4743    3  35000  1750  0.6000          2     4                  2   
4744    3  45000  1500  0.6765          2     4                  2   
4745    2  15000  1000  0.8000          2     4                  3   

      Tenant Preferred  Bathroom  
0                    3         2  
1                    3         1  
2                    3         1  
3                  

In [15]:
# Create a single LabelEncoder instance; it is reused for the columns in the next cell.
lbl_encode = LabelEncoder()

In [16]:
# Each call refits the same encoder on one column and returns the encoded array.
# NOTE(review): none of the returned arrays is assigned to anything, so `data`
# is left unchanged by this cell; only the last call's result (for
# 'Tenant Preferred') is echoed as the cell output. If the encoder output is
# meant to be used, it must be assigned back to the column.
lbl_encode.fit_transform(data['Area Type'])
lbl_encode.fit_transform(data['City'])
lbl_encode.fit_transform(data['Furnishing Status'])
lbl_encode.fit_transform(data['Tenant Preferred'])

array([2, 2, 2, ..., 2, 1, 0], dtype=int64)

In [17]:
# Re-display the frame after the label-encoder cell.
data

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom
0,2,10000,1100,0.5000,3,5,3,3,2
1,2,20000,800,0.3333,3,5,2,3,1
2,2,17000,1000,0.3333,3,5,2,3,1
3,2,10000,800,0.5000,3,5,3,3,1
4,2,7500,850,0.5000,2,5,3,1,1
...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,0.6000,2,4,2,3,2
4742,3,29000,2000,0.2500,3,4,2,3,3
4743,3,35000,1750,0.6000,2,4,2,3,3
4744,3,45000,1500,0.6765,2,4,2,2,2


In [18]:
# Class balance of the encoded 'Area Type' column
# (codes from the manual mapping: 1 = Built Area, 2 = Carpet Area, 3 = Super Area).
data['Area Type'].value_counts()

Area Type
3    2446
2    2298
1       2
Name: count, dtype: int64

In [19]:
# Count missing values per column. Series.map leaves NaN for any category
# that is absent from its mapping dict, so this also verifies the encoding.
data.isnull().sum()

BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
dtype: int64

### Training and Test Split

In [20]:
# Target: the 'Rent' column. Features: every other column.
y = data['Rent']
X = data.drop(columns='Rent')

In [21]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible. train_test_split is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Standardizing the Values

In [22]:
# Standardise the features. The scaler is fitted on the training split only
# and then reused for the test split, so no test-set statistics leak into
# training. StandardScaler is imported in the notebook's import cell.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Modelling

In [23]:
# Fit a linear regression on the standardised training features.
# linear_model is imported in the notebook's import cell.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# Last expression: echo the learned coefficients, one per feature column.
model.coef_

array([ -657.74138863, 18005.44141467,  1383.61890492, -8557.84131002,
       13092.42803293, -4307.54833508,   668.12784261, 17726.56707394])

### Evaluation

#### Quantitative Evaluation

In [24]:
# Predict the target for the held-out (already standardised) test rows.
y_preds = model.predict(X_test)

In [25]:
# Score the held-out predictions once, then report. The metric functions are
# imported in the notebook's import cell.
mse = mean_squared_error(y_test, y_preds)
r2 = r2_score(y_test, y_preds)

# The coefficients
print("Coefficients: \n", model.coef_)
# The mean squared error (expressed in squared units of the target)
print(f"Mean squared error: {mse:.2f}")
# The coefficient of determination: 1 is perfect prediction
print(f"Coefficient of determination: {r2:.2f}")

Coefficients: 
 [ -657.74138863 18005.44141467  1383.61890492 -8557.84131002
 13092.42803293 -4307.54833508   668.12784261 17726.56707394]
Mean squared error: 2223930250.47
Coefficient of determination: 0.44
