In [118]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [97]:
# Load the listings CSV; the relative path is resolved against the current
# working directory.
data = pd.read_csv("NY-House-Dataset.csv")

In [98]:
# Preview the first rows to see which columns exist and what the values look like.
data.head()

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,ADDRESS,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME,LONG_NAME,FORMATTED_ADDRESS,LATITUDE,LONGITUDE
0,Brokered by Douglas Elliman -111 Fifth Ave,Condo for sale,315000,2,2.0,1400.0,2 E 55th St Unit 803,"New York, NY 10022","2 E 55th St Unit 803New York, NY 10022",New York County,New York,Manhattan,East 55th Street,Regis Residence,"Regis Residence, 2 E 55th St #803, New York, N...",40.761255,-73.974483
1,Brokered by Serhant,Condo for sale,195000000,7,10.0,17545.0,Central Park Tower Penthouse-217 W 57th New Yo...,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,United States,New York,New York County,New York,West 57th Street,"217 W 57th St, New York, NY 10019, USA",40.766393,-73.980991
2,Brokered by Sowae Corp,House for sale,260000,4,2.0,2015.0,620 Sinclair Ave,"Staten Island, NY 10312","620 Sinclair AveStaten Island, NY 10312",United States,New York,Richmond County,Staten Island,Sinclair Avenue,"620 Sinclair Ave, Staten Island, NY 10312, USA",40.541805,-74.196109
3,Brokered by COMPASS,Condo for sale,69000,3,1.0,445.0,2 E 55th St Unit 908W33,"Manhattan, NY 10022","2 E 55th St Unit 908W33Manhattan, NY 10022",United States,New York,New York County,New York,East 55th Street,"2 E 55th St, New York, NY 10022, USA",40.761398,-73.974613
4,Brokered by Sotheby's International Realty - E...,Townhouse for sale,55000000,7,2.373861,14175.0,5 E 64th St,"New York, NY 10065","5 E 64th StNew York, NY 10065",United States,New York,New York County,New York,East 64th Street,"5 E 64th St, New York, NY 10065, USA",40.767224,-73.969856


In [99]:
# Summarize the row count plus each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4801 entries, 0 to 4800
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BROKERTITLE                  4801 non-null   object 
 1   TYPE                         4801 non-null   object 
 2   PRICE                        4801 non-null   int64  
 3   BEDS                         4801 non-null   int64  
 4   BATH                         4801 non-null   float64
 5   PROPERTYSQFT                 4801 non-null   float64
 6   ADDRESS                      4801 non-null   object 
 7   STATE                        4801 non-null   object 
 8   MAIN_ADDRESS                 4801 non-null   object 
 9   ADMINISTRATIVE_AREA_LEVEL_2  4801 non-null   object 
 10  LOCALITY                     4801 non-null   object 
 11  SUBLOCALITY                  4801 non-null   object 
 12  STREET_NAME                  4801 non-null   object 
 13  LONG_NAME         

In [100]:
# Report the number of distinct values in each text column to gauge its
# cardinality. The (label, column) table drives a single loop instead of one
# hand-written print call per column.
cardinality_report = [
    ("brokertitle", 'BROKERTITLE'),
    ("type", 'TYPE'),
    ("address", 'ADDRESS'),
    ("state", 'STATE'),
    ("main address", 'MAIN_ADDRESS'),
    ("administrative", 'ADMINISTRATIVE_AREA_LEVEL_2'),
    ("locality", 'LOCALITY'),
    ("sublocality", 'SUBLOCALITY'),
    ("street name", 'STREET_NAME'),
    ("long name", 'LONG_NAME'),
    ("formatted address", 'FORMATTED_ADDRESS'),
]
for label, column in cardinality_report:
    # Left-pad every label to the width of the longest one (17 characters)
    # so the colons line up in the printed report.
    print(f"{label:<17}:", data[column].nunique())

borkertitle      : 1036
type             : 13
address          : 4582
state            : 308
main address     : 4583
administrative   : 29
locality         : 11
sublocality      : 21
street name      : 174
long name        : 2731
formatted address: 4550


In [101]:
# Build a working copy of the listings without the broker, free-text address
# and coordinate columns; `data` itself is left unchanged.
newdata = data.drop(
    labels=[
        'BROKERTITLE',
        'ADDRESS',
        'MAIN_ADDRESS',
        'LONG_NAME',
        'FORMATTED_ADDRESS',
        'LATITUDE',
        'LONGITUDE',
    ],
    axis='columns',
)
newdata.head()

Unnamed: 0,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME
0,Condo for sale,315000,2,2.0,1400.0,"New York, NY 10022",New York County,New York,Manhattan,East 55th Street
1,Condo for sale,195000000,7,10.0,17545.0,"New York, NY 10019",United States,New York,New York County,New York
2,House for sale,260000,4,2.0,2015.0,"Staten Island, NY 10312",United States,New York,Richmond County,Staten Island
3,Condo for sale,69000,3,1.0,445.0,"Manhattan, NY 10022",United States,New York,New York County,New York
4,Townhouse for sale,55000000,7,2.373861,14175.0,"New York, NY 10065",United States,New York,New York County,New York


In [102]:
class OrdinalEncoderAndStandardScalerTransformer(BaseEstimator, TransformerMixin):
    """Ordinal-encode a single categorical column, then standardize the codes.

    The encoder and scaler are learned once in ``fit`` and only applied in
    ``transform``, so data transformed later (for example a held-out set) is
    mapped with the same category codes, mean and variance as the data the
    transformer was fitted on.

    Parameters
    ----------
    mean : float, optional
        Mean of the ordinal codes. Overwritten by ``fit``.
    var : float, optional
        Variance of the ordinal codes. Overwritten by ``fit``.
    encoding_dict : dict, optional
        Mapping of category value -> ordinal code. Overwritten by ``fit``.

    Attributes set by ``fit``
    -------------------------
    ordinal_encoder : OrdinalEncoder
        Fitted encoder for the column.
    scaler : StandardScaler
        Fitted scaler for the ordinal codes.
    """

    def __init__(self, mean=None, var=None, encoding_dict=None):
        self.mean = mean
        self.var = var
        self.encoding_dict = encoding_dict

    def fit(self, x, y=None):
        """Learn the category codes and the scaling statistics from ``x``.

        Parameters
        ----------
        x : pandas.Series
            The categorical column to learn from.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # scikit-learn transformers expect 2-D input: one column, many rows.
        values = x.to_numpy().reshape(-1, 1)

        self.ordinal_encoder = OrdinalEncoder()
        self.scaler = StandardScaler()
        codes = self.ordinal_encoder.fit_transform(values)
        self.scaler.fit(codes)

        # Expose what was learned through the public attributes.
        categories = self.ordinal_encoder.categories_[0]
        self.encoding_dict = dict(zip(categories, range(len(categories))))
        self.mean = self.scaler.mean_[0]
        self.var = self.scaler.var_[0]
        return self

    def transform(self, x, y=None):
        """Apply the fitted encoding and scaling to ``x``.

        Parameters
        ----------
        x : pandas.Series
            Column to transform. With the default ``OrdinalEncoder`` settings
            used here, a category that was not seen during ``fit`` raises
            ``ValueError``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.Series
            Standardized ordinal codes carrying the name and index of ``x``.
        """
        values = x.to_numpy().reshape(-1, 1)
        codes = self.ordinal_encoder.transform(values)
        # ravel() always yields a 1-D array, including for a one-row input.
        scaled = self.scaler.transform(codes).ravel()
        # Keep the caller's index so assigning the result back into a
        # DataFrame aligns row-for-row even when the index is not 0..n-1.
        return pd.Series(scaled, index=x.index, name=x.name)
def convert_to_numerical(column):
    """Overwrite ``newdata[column]`` with a numeric version of ``data[column]``.

    The raw values are read from the module-level ``data`` frame, passed
    through a one-step pipeline holding an
    ``OrdinalEncoderAndStandardScalerTransformer``, and the result is stored
    in the module-level ``newdata`` frame under the same column name.

    Parameters
    ----------
    column : str
        Name of the column to convert; it must exist in ``data``.

    Returns
    -------
    None
        ``newdata`` is modified in place.
    """
    encode_and_scale = Pipeline(
        [('transform', OrdinalEncoderAndStandardScalerTransformer())]
    )
    newdata[column] = encode_and_scale.fit_transform(data[column])

In [103]:
# Text columns that are kept as features; each is converted to a numeric
# column in `newdata` by convert_to_numerical.
convert_to_num = [
    'TYPE',
    'STATE',
    'ADMINISTRATIVE_AREA_LEVEL_2',
    'LOCALITY',
    'SUBLOCALITY',
    'STREET_NAME',
]
for column_name in convert_to_num:
    convert_to_numerical(column_name)

In [104]:
# Preview the working frame after the text columns were converted to numbers.
newdata.head()

Unnamed: 0,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME
0,-0.659918,315000,2,2.0,1400.0,0.66097,-0.209342,-0.265118,-0.132014,-0.9412
1,-0.659918,195000000,7,10.0,17545.0,0.639433,0.735697,-0.265118,0.258376,0.382342
2,0.492263,260000,4,2.0,2015.0,1.67323,0.735697,-0.265118,1.039156,1.163448
3,-0.659918,69000,3,1.0,445.0,0.165609,0.735697,-0.265118,0.258376,0.382342
4,1.644443,55000000,7,2.373861,14175.0,0.876344,0.735697,-0.265118,0.258376,0.382342


In [105]:
# PRICE is the prediction target; every remaining column is a feature.
y = newdata['PRICE']
x = newdata.drop(labels='PRICE', axis='columns')

In [106]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [113]:
model = DecisionTreeClassifier()

In [114]:
model.fit(x_train, y_train)

In [115]:
pred = model.predict(x_test)

In [119]:
print(accuracy_score(y_test, pred))

0.08532778355879292
