## Categorical Analysis

In [1]:
import pandas as pd

df = pd.read_csv('./data/rent.csv')

features = ['bedrooms','bathrooms','latitude','longitude','price']
X,y = df[features], df['price']

X.head(3)

Unnamed: 0,bedrooms,bathrooms,latitude,longitude,price
0,3,1.5,40.7145,-73.9425,3000
1,2,1.0,40.7947,-73.9667,5465
2,1,1.0,40.7388,-74.0018,2850


In [2]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
rf.fit(X,y)
score = rf.oob_score_
print(score)

0.928318384568533


In [3]:
print(df['interest_level'].value_counts())

low       15
medium     4
high       2
Name: interest_level, dtype: int64


In [4]:
df['interest_level'] = df['interest_level'].map({'low': 1, 'medium': 2, 'high': 3})
df['interest_level'].value_counts()

1    15
2     4
3     2
Name: interest_level, dtype: int64

In [5]:
X_new, y_new = df[['interest_level'] + features], df['price']

In [6]:
rf.fit(X_new,y_new)
scores = rf.oob_score_
print(scores)

0.922439700241626


In [7]:
df['description'] = df['description'].fillna('')
df['description'] = df['description'].str.lower()
df['features'] = df['features'].fillna('')
df['features'] = df['features'].str.lower()

In [8]:
# has apartment been renovated?
df['renov'] = df['description'].str.contains("renov")

for w in ['doorman', 'parking', 'garage', 'laundry', 
          'Elevator', 'fitness center', 'dishwasher']:
    df[w] = df['features'].str.contains(w)
df[['doorman', 'parking', 'garage', 'laundry']].head(5)

Unnamed: 0,doorman,parking,garage,laundry
0,False,False,False,False
1,True,False,False,False
2,False,False,False,True
3,False,False,False,False
4,False,False,False,False


In [17]:
df["num_desc_words"] = df["description"].apply(lambda x: len(x.split()))
df["features"] = df["features"].apply(lambda x: len(x.split(",")))

In [18]:
df['num_photos'] = df['photos'].apply(lambda x: len(x.split(',')))