In [50]:
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None   # default='warn'
data = pd.read_excel('cleaned_data.xlsx')

KeyboardInterrupt: 

In [None]:
data['Price_per_unit_area'] = data['Price_per_unit_area'].astype(str).str.replace(',', '').astype(int)
data['Price'] = data['Price'].astype(str).str.replace(',', '').astype(int)
data['Size'].astype(str)
data['Size'] = data['Size'].str.removesuffix(' sq ft').str.replace(',', '').astype(int)

In [None]:
data2 = data.copy()

In [51]:
data = data2.copy()

- There are a few more binary attributes which we remove from consideration.
- We also do not consider attributes which have a high variance and we cannot generalize them
- `City_id` and `listing_domain_score` is irrelevant for learning rules too.

We drop attributes who have a number of distinct values but no higher level concept for the attribute
We drop these attributes.

In [52]:
atrbs = ['Property_Name', 'Property_id', 'Project_URL', 'builder_id', 'Builder_name', 'Locality_ID', 'Longitude', 'Latitude', 'Sub_urban_ID', 'description', 'City_id', 'Listing_Category', 'is_plot', 'is_RERA_registered', 'is_Apartment', 'is_ready_to_move', 'is_commercial_Listing', 'is_PentaHouse', 'is_studio', 'listing_domain_score', 'No_of_BHK', 'Locality_Name', 'Sub_urban_name', 'Posted_On', 'Price_per_unit_area']

for att in atrbs:
    data.drop(att, axis=1, inplace=True)

We generalise attributes to higher level attributes

> Generalising Price Values (Ascending Concept Heirarchy)

In [53]:
_gold = ((data['Price'] <= 8000000000) & (data['Price'] >= 50000000))
_silver = ((data['Price'] < 50000000) & (data['Price'] >= 2000000))
_bronze = ((data['Price'] < 2000000) & (data['Price'] >= 100000))

data.loc[_gold, ['Price']] = 'Gold'
data.loc[_silver, ['Price']] = 'Silver'
data.loc[_bronze, ['Price']] = 'Bronze'

> Generalising Size Values

In [54]:
_large = ((data['Size'] <= 90000) & (data['Size'] >= 50000))
_medium = ((data['Size'] < 50000) & (data['Size'] >= 5000))
_small = ((data['Size'] < 5000) & (data['Size'] >= 100))

data.loc[_large, ['Size']] = 'Large'
data.loc[_medium, ['Size']] = 'Medium'
data.loc[_small, ['Size']] = 'Small'

> Removing Duplicates 

In [55]:
data = data.drop_duplicates()
data

Unnamed: 0,Property_type,Property_status,Property_building_status,City_name,Price,Size,is_furnished
0,Apartment,Under Construction,ACTIVE,Ahmedabad,Silver,Small,Unfurnished
2,Apartment,Ready to move,ACTIVE,Ahmedabad,Silver,Small,Unfurnished
25,Independent House,Under Construction,ACTIVE,Ahmedabad,Silver,Small,Unfurnished
30,Villa,Under Construction,ACTIVE,Ahmedabad,Silver,Small,Unfurnished
33,Apartment,Under Construction,ACTIVE,Ahmedabad,Silver,Medium,Unfurnished
...,...,...,...,...,...,...,...
141777,Apartment,Ready to move,ACTIVE,Mumbai,Silver,Medium,Unfurnished
142027,Independent House,Ready to move,ACTIVE,Mumbai,Bronze,Small,Unfurnished
142253,Independent House,Ready to move,ACTIVE,Mumbai,Bronze,Small,Furnished
142841,Independent House,Under Construction,ACTIVE,Mumbai,Silver,Small,Furnished


In [56]:
vals = ['Property_status', 'Property_building_status', 'is_furnished']
for att in vals:
    data.drop(att, axis=1, inplace=True)
data = data.drop_duplicates()
data

> Generalising City Values

In [58]:
techCity_cond = ((data['City_name'] == 'Mumbai' ) | (data['City_name'] == 'Bangalore') | (data['City_name'] == 'Ahmedabad') | (data['City_name'] == 'Hyderabad'))
otherCity_cond = ((data['City_name'] == 'Delhi') | (data['City_name'] == 'Kolkata') | (data['City_name'] == 'Chennai') | (data['City_name'] == 'Lucknow'))

data.loc[techCity_cond, ['City_name']] = 'Tech City'
data.loc[otherCity_cond, ['City_name']] = 'Other City'

In [59]:
data = data.drop_duplicates()
data 

Unnamed: 0,Property_type,City_name,Price,Size
0,Apartment,Tech City,Silver,Small
25,Independent House,Tech City,Silver,Small
30,Villa,Tech City,Silver,Small
33,Apartment,Tech City,Silver,Medium
61,Apartment,Tech City,Bronze,Small
116,Villa,Tech City,Silver,Medium
164,Independent Floor,Tech City,Silver,Small
230,Residential Plot,Tech City,Bronze,Small
232,Residential Plot,Tech City,Silver,Small
235,Residential Plot,Tech City,Silver,Medium


In [60]:
data[data['Property_type'] =='Villa']

Unnamed: 0,Property_type,City_name,Price,Size
30,Villa,Tech City,Silver,Small
116,Villa,Tech City,Silver,Medium
341,Villa,Tech City,Gold,Small
931,Villa,Tech City,Gold,Medium
25440,Villa,Tech City,Bronze,Small
27631,Villa,Other City,Silver,Small
27694,Villa,Other City,Bronze,Small
27732,Villa,Other City,Gold,Medium
30604,Villa,Other City,Silver,Medium
30641,Villa,Other City,Gold,Small


the several tuples can be reduced into one by taking the distinct values of that attribute as a set.

In [61]:
for i in range(len(data)):
    for j in range(i+1, len(data)):
        if data.iloc[i]['Property_type'] == data.iloc[j]['Property_type']:
            if data.iloc[i]['City_name'] == data.iloc[j]['City_name']:
                if data.iloc[i]['Size'] == data.iloc[j]['Size']:
                    if data.iloc[i]['Price'].find(data.iloc[j]['Price']) != -1 or data.iloc[j]['Price'].find(data.iloc[i]['Price']) != -1:
                        if data.iloc[i]['Price'].find(data.iloc[j]['Price']) != -1:
                            data.iloc[i]['Price'] = data.iloc[j]['Price']
                        else:
                            data.iloc[j]['Price'] = data.iloc[i]['Price']
                    else:
                        data.iloc[i]['Price'] = data.iloc[i]['Price'] + ', ' + data.iloc[j]['Price']
                        data.iloc[j]['Price'] = data.iloc[i]['Price']

In [62]:
for i in range(len(data.index)):
    for j in range(i+1, len(data.index)):
        if data.iloc[i]['Price'].find(data.iloc[j]['Price']) != -1:
            data.iloc[j]['Price'] = data.iloc[i]['Price']

In [63]:
data = data.drop_duplicates()
data

21


Unnamed: 0,Property_type,City_name,Price,Size
0,Apartment,Tech City,"Silver, Bronze, Gold",Small
25,Independent House,Tech City,"Silver, Gold, Bronze",Small
30,Villa,Tech City,"Silver, Gold, Bronze",Small
33,Apartment,Tech City,"Silver, Gold, Bronze",Medium
116,Villa,Tech City,"Silver, Gold, Bronze",Medium
164,Independent Floor,Tech City,"Silver, Bronze, Gold",Small
230,Residential Plot,Tech City,"Bronze, Silver, Gold",Small
235,Residential Plot,Tech City,"Silver, Gold, Bronze",Medium
475,Residential Plot,Tech City,"Silver, Bronze, Gold",Large
529,Independent House,Tech City,"Silver, Gold, Bronze",Medium


In [64]:
data[data['Property_type'] == 'Villa']

Unnamed: 0,Property_type,City_name,Price,Size
30,Villa,Tech City,"Silver, Gold, Bronze",Small
116,Villa,Tech City,"Silver, Gold, Bronze",Medium
27631,Villa,Other City,"Silver, Bronze, Gold",Small
27732,Villa,Other City,"Gold, Silver",Medium


In [66]:
for i in range(len(data)):
    if data.iloc[i]['Price'].find('Luxury') != -1 and data.iloc[i]['Price'].find('Budget') != -1 and data.iloc[i]['Price'].find('Premium') != -1:
        data.iloc[i]['Price'] = 'Any'

data[data['Property_type'] == 'Villa']

Unnamed: 0,Property_type,City_name,Price,Size
30,Villa,Tech City,"Silver, Gold, Bronze",Small
116,Villa,Tech City,"Silver, Gold, Bronze",Medium
27631,Villa,Other City,"Silver, Bronze, Gold",Small
27732,Villa,Other City,"Gold, Silver",Medium
