# Imports

In [2]:
import pandas as pd
import numpy as np
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Column name -> dtype passed to pd.read_csv, declared per dtype group.
# pandas looks dtypes up by column name, so the grouping order is irrelevant.
dtype_mapping = {
    column: dtype
    for dtype, columns in [
        # Free-text / identifier columns (nullable string).
        (pd.StringDtype(), [
            'localityName', 'landMarks', 'locality',
            'nameOfSociety', 'projectName',
            'possessionStatus', 'developerName',
            'flooringType', 'floorNumber',
            'electricityStatus', 'waterStatus',
            'transactionType', 'facing', 'ownershipType', 'carParking',
            'bedrooms', 'bathrooms', 'numberOfBalconied',
            'additionalRooms',
            'propertyAmenities', 'facilitiesDesc', 'uuid',
        ]),
        # Whole-number columns that may contain missing values (nullable Int64).
        (pd.Int64Dtype(), [
            'price',
            'carpetArea', 'coveredArea', 'carpetAreaSqft',
            'unitCountonFloor', 'totalFloorNumber',
            'bookingAmountExact', 'maintenanceCharges',
            'noOfLifts',
            'carParking_Open', 'carParking_Covered',
        ]),
        # Coordinates.
        ('float64', ['longitude', 'latitude']),
        # Columns stored as pandas categoricals.
        ('category', [
            'furnished', 'propertyType',
            'maintenanceChargesFrequency',
            'ageofcons', 'isVerified', 'listingTypeDesc',
        ]),
        # Flag columns (nullable boolean).
        ('boolean', [
            'premiumProperty',
            'flooringType_Vitrified', 'flooringType_CeramicTiles',
            'flooringType_Marble', 'flooringType_NormalTilesKotahStone',
            'flooringType_Granite', 'flooringType_Wooden',
            'flooringType_Mosaic', 'flooringType_Marbonite',
            'additionalRoom_PujaRoom', 'additionalRoom_Study',
            'additionalRoom_Store', 'additionalRoom_ServantRoom',
        ]),
    ]
    for column in columns
}


In [10]:
# Load the cleaned listings, forcing the dtypes declared in dtype_mapping and
# using the first CSV column as the row index.
#
# Missing values need no post-processing after the read: the nullable
# string / Int64 / boolean columns already represent them as pd.NA, while the
# float64 and category columns keep NaN as their missing marker, so a
# follow-up fillna(pd.NA) would change nothing.
df = pd.read_csv(
    'cleaned_data.csv',
    dtype=dtype_mapping,
    index_col=0,
)

In [11]:
# Print a summary of the loaded frame: number of rows, plus the non-null
# count and dtype of every column (used to spot sparsely populated columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33194 entries, 0 to 33193
Data columns (total 54 columns):
 #   Column                              Non-Null Count  Dtype   
---  ------                              --------------  -----   
 0   localityName                        32526 non-null  string  
 1   landMarks                           17285 non-null  string  
 2   locality                            32593 non-null  string  
 3   price                               32790 non-null  Int64   
 4   nameOfSociety                       26658 non-null  string  
 5   projectName                         26643 non-null  string  
 6   carpetArea                          26090 non-null  Int64   
 7   coveredArea                         32051 non-null  Int64   
 8   carpetAreaSqft                      26088 non-null  Int64   
 9   possessionStatus                    26081 non-null  string  
 10  developerName                       22986 non-null  string  
 11  flooringType                     

#### Investigating locality columns

In [18]:
# Dump every distinct (locality, localityName) pair whose two values disagree
# to temp2.csv for manual inspection. Pairs with a missing value on either
# side are left out; the original row labels are written as the first column.
(
    df[['locality', 'localityName']]
    .loc[lambda pairs: pairs['locality'] != pairs['localityName']]
    .dropna(how='any')
    .drop_duplicates()
    .sort_values(by=['locality', 'localityName'])
    .to_csv('temp2.csv', index=True)
)

#### Investigating projectName and nameOfSociety columns

In [31]:
# Dump every distinct (nameOfSociety, projectName) pair whose two values
# differ to temp3.csv, ordered by society name. Rows missing either value are
# dropped first; the original row labels are written as the first column.
(
    df[['nameOfSociety', 'projectName']]
    .dropna(how='any')
    .drop_duplicates()
    .loc[lambda pairs: pairs['nameOfSociety'] != pairs['projectName']]
    .sort_values(by='nameOfSociety')
    .to_csv('temp3.csv', index=True)
)

#### Investigating area columns

In [108]:
# Number of non-null entries in the numberOfBalconied column.
# Series.count() gives this directly; building a frequency table with
# value_counts() only to sum it back up yields the same number.
df['numberOfBalconied'].count()

np.int64(20103)

In [48]:
# Back-of-the-envelope share computed from hand-rounded figures.
# NOTE(review): presumably ~26,100 non-null rows of one column (carpetArea has
# 26,090 non-null) over ~33,000 total rows (the frame has 33,194) — TODO
# confirm which column is meant, and consider deriving the ratio from df
# instead of hard-coding the numbers.
26100/33000

0.7909090909090909