# Flats Uncleaned Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
df = pd.read_csv('Data/surat_uncleaned.csv')

In [None]:
df

In [None]:
df.dtypes

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(keep='first')

In [None]:
df.isnull().sum()

In [None]:
for col in df.select_dtypes(include = 'object'):
    df[col] = df[col].str.strip()

In [None]:
df['transaction'].unique()

In [None]:
df['transaction'] = df['transaction'].fillna(df['transaction'].mode()[0])

In [None]:
df['transaction'].unique()

In [None]:
value_counts = df['transaction'].value_counts()

In [None]:
valid_values = value_counts[value_counts > 1].index

df = df[df['transaction'].isin(valid_values)]

df['transaction'] = df['transaction'].replace('Main Road', df['transaction'].mode()[0])

In [None]:
df['transaction'].value_counts()

In [None]:
unrelated_values = ['Unfurnished', 'Semi-Furnished', 'Furnished', '2', '3', 'No', '1', 'Yes', '5', '4', '6', 'New Property']
df['transaction'] = df['transaction'].replace(unrelated_values, np.nan)

In [None]:
df['transaction'].isnull().sum()

In [None]:
df['transaction'] = df['transaction'].fillna(df['transaction'].mode()[0])

In [None]:
df['transaction'].value_counts()

In [None]:
df.isnull().sum()

In [None]:
df['status'].unique()

In [None]:
valid_status = ['Ready to Move', 'New Property', 'Freehold', 'Resale', 'Power Of Attorney', 'Const. Age New Construction']
possession_pattern = r"^Poss\. by [A-Za-z]{3} '\d{2}$"
def clean_status(value):
    if pd.isnull(value):
        return np.nan
    if value in valid_status:
        return value
    if re.match(possession_pattern, value):
        return value
    return np.nan  # Unrelated values will be marked as missing

# Apply the cleaning function
df['status'] = df['status'].apply(clean_status)

In [None]:
df['status'] = df['status'].fillna(df['status'].mode()[0])

In [None]:
df.isnull().sum()

In [None]:
df['floor'].unique()

In [None]:
ground_pattern = r"^Ground out of ([1-9]|[1-9][0-9]|[1-6][0-9]{2}|700)$"
basement_pattern = r"^(Lower|Upper) Basement out of \d+$"

floor_unrelated_values = ['Unfurnished', 'New Property', 'Resale', 'Freehold', 'Furnished', 'Congo', 'Ground', 'Semi-Furnished', 'Other', 'The Polaris Avenue', 'No', 'Yes']

def clean_floor(value):
    if pd.isnull(value):
        return np.nan
    if value in floor_unrelated_values:
        return np.nan
    if re.match(ground_pattern, str(value)):
        return np.nan
    if re.match(basement_pattern, str(value)):
        return np.nan
    return value


df['floor'] = df['floor'].apply(clean_floor) 


In [None]:
df['floor'] = df['floor'].fillna(df['floor'].mode()[0])

In [None]:
df['floor'].unique()

In [None]:
df.isnull().sum()

In [None]:
df['furnishing'].unique()

In [None]:
furnishing_values_keep = ['Unfurnished', 'Semi-Furnished', 'Furnished']
if 'furnishing' != furnishing_values_keep:
    df['furnishing'] = df['furnishing'].where(df['furnishing'].isin(furnishing_values_keep), np.nan)
df['furnishing'] = df['furnishing'].fillna('Unknown')

In [None]:
df['furnishing'].unique()

In [None]:
df.isnull().sum()

In [None]:
df['facing'].unique()

In [None]:
face_patt = r"^\d+(\s+[a-zA-Z]+)?$"

not_face = []

for fc in df['facing']:
    if re.match(face_patt, str(fc).strip()):
        not_face.append(fc)

print(set(not_face))

df['facing'] = df['facing'].replace(not_face, np.nan)
df['facing'] = df['facing'].fillna(df['facing'].mode()[0])

In [None]:

print(df['facing'].unique())

In [None]:
df.isnull().sum()

In [None]:
df = df.drop(columns = ['description'])

In [None]:
df.isnull().sum()

In [None]:
df['price_per_sqft'] = df['price_per_sqft'].fillna(df['price_per_sqft'].mode()[0])
df['price_per_sqft'].unique()

In [None]:
df['price_per_sqft'].isnull().sum()

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df['price'].unique()

In [None]:
price_pattern = [r"^₹\d+(\.\d+)? Lac$", r"^₹\d+(\.\d+)? Cr$"]
not_same =[]

for p in df['price']:
    if not any(re.match(pattern, str(p)) for pattern in price_pattern):
        not_same.append(p)
print(set(not_same))

In [None]:
df['price'] = df['price'].replace(not_same, np.nan)
df['price'] = df['price'].fillna(df['price'].mode()[0])

In [None]:
'Call for Price' in df['price']

In [None]:
df

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates(keep = 'first')

In [None]:
df['areaWithType'].unique()

In [None]:
areaWithType_unrelated_values = ['Transaction', 'Status']
df['areaWithType'] = df['areaWithType'].replace(areaWithType_unrelated_values, np.nan)
df['areaWithType'] = df['areaWithType'].fillna(df['areaWithType'].mode()[0])

In [None]:
df['square_feet'].unique()

In [None]:
square_feet_patt = r"^\d+\s*sqft$"
not_include = []
for s in df['square_feet']:
    if not re.match(square_feet_patt, str(s)):
        not_include.append(s)
    print(set(s))

In [None]:
df['square_feet'] = df['square_feet'].replace(not_include, np.nan)
df['square_feet'] = df['square_feet'].fillna(df['square_feet'].mode()[0])

In [None]:
df['status'].unique()

In [None]:
status_non_related_values = ['Power Of Attorney', 'Ready to Move', 'New Property', 'Freehold', 'Resale', 'Const. Age New Construction']
df['status'] = df['status'].replace(status_non_related_values, np.nan)
df['status'] = df['status'].fillna(df['status'].mode()[0])

In [None]:
df

In [None]:
df['transaction'].unique()

In [None]:
floor_patt = r"^₹[\d,]+(\.\d+)?\sCr$"
not_floor = []

for f in df['floor']:
    if not re.match(floor_patt, str(f)):
        not_floor.append(f)

print(set(not_floor))


df['floor'] = df['floor'].replace(not_floor, np.nan)
df['floor'] = df['floor'].fillna(df['floor'].mode()[0])

{'7 out of 17', '9 out of 17', '12 out of 14', '2 out of 10', '3 out of 5', '3 out of 19', '9 out of 14', '17 out of 18', '18 out of 21', '4 out of 14', '5 out of 15', '19 out of 19', '8 out of 12', '13 out of 13', '15 out of 20', '10 out of 18', '8 out of 9', '6 out of 18', '9 out of 19', '3 out of 13', '8 out of 20', '5 out of 9', '7 out of 10', '13 out of 14', '5 out of 14', '6 out of 13', '4 out of 12', '6 out of 12', '3 out of 7', '6 out of 14', '11 out of 12', '2 out of 9', '3 out of 4', '1 out of 10', '2 out of 17', '5 out of 8', '10 out of 20', '10 out of 12', '4 out of 9', '12 out of 13', '4 out of 21', '8 out of 8', '9 out of 12', '2 out of 6', '1 out of 9', '2 out of 15', '5 out of 18', '2 out of 19', '1 out of 18', '1 out of 3', '3 out of 12', '1 out of 2', '1 out of 4', '4 out of 4', '6 out of 7', '3 out of 15', '15 out of 18', '13 out of 15', '2 out of 11', '1 out of 14', '9 out of 25', '3 out of 6', '11 out of 11', '1 out of 15', '3 out of 10', '18 out of 19', '5 out of 

  df['floor'] = df['floor'].replace(not_floor, np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['floor'] = df['floor'].replace(not_floor, np.nan)


KeyError: 0

In [1022]:
facing_non_valid = ['Yes', '1 Covered,', '45 X 14 ft Sqft']
df['facing'] = df['facing'].replace(facing_non_valid, np.nan)
df['facing'] = df['facing'].fillna(df['facing'].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['facing'] = df['facing'].replace(facing_non_valid, np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['facing'] = df['facing'].fillna(df['facing'].mode()[0])
