Using term frequency / inverse document frequency (tf-idf) on ```name``` feature to improve model performance 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import folium
from folium.plugins import HeatMap
from wordcloud import WordCloud, ImageColorGenerator
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
# getting the dataset set (.csv)
path = 'data/AB_NYC_2019.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
## drop columns that are of minimal use

X = df.copy()
X.drop(columns=['id', "name", 'host_id', 'host_name', 'last_review'], inplace=True, axis=1)
X['reviews_per_month'] = X['reviews_per_month'].replace(np.nan, 0)

In [4]:
# seperating the data to target variable

y = df['price']
X.drop(labels='price', axis=1, inplace=True);

In [5]:
# handling the categorical variables

X = pd.get_dummies(X, drop_first=True)
X.shape

(48895, 233)

In [13]:
df['name'].fillna(value='-missing-', inplace=True)

In [15]:
## import tf-idf methods

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [16]:
dataset = [
    "I enjoy reading about Machine Learning and Machine Learning is my PhD subject",
    "I would enjoy a walk in the park",
    "I was reading in the library"
]

In [25]:
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(dataset)

dataframe = pd.DataFrame(tfidf[0].T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=['tfidf'])
dataframe.sort_values('tfidf', ascending=False).shape

(17, 1)

In [26]:
dataframe.shape

(17, 1)

In [47]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [48]:
# remove punctuations 
df['preprocessed_text'] = df['name'].str.replace('[^\w\s]', '')

# remove stop words (words that do not carry much meaning)
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# convert all text to lower case
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# stemming to reduce repeat words
stemmer = PorterStemmer()
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

df['preprocessed_text'].head(10)

  df['preprocessed_text'] = df['name'].str.replace('[^\w\s]', '')


0                    clean quiet apt home park
1                         skylit midtown castl
2                 the villag of harlemnew york
3                   cozi entir floor brownston
4    entir apt spaciou studioloft central park
5         larg cozi 1 br apart in midtown east
6                                blissartsspac
7                  larg furnish room near bway
8             cozi clean guest room famili apt
9             cute cozi lower east side 1 bdrm
Name: preprocessed_text, dtype: object

In [49]:
df['name'].head(10)

0                  Clean & quiet apt home by the park
1                               Skylit Midtown Castle
2                 THE VILLAGE OF HARLEM....NEW YORK !
3                     Cozy Entire Floor of Brownstone
4    Entire Apt: Spacious Studio/Loft by central park
5           Large Cozy 1 BR Apartment In Midtown East
6                                     BlissArtsSpace!
7                    Large Furnished Room Near B'way 
8                  Cozy Clean Guest Room - Family Apt
9                  Cute & Cozy Lower East Side 1 bdrm
Name: name, dtype: object

In [64]:
## using calculate term frequency and inverse document frequency (tf-idf)

tfidf_df = TfidfVectorizer(max_features=500, lowercase=True, analyzer='word', stop_words='english', ngram_range=(1,1))

df_tfidf = tfidf_df.fit_transform(df['preprocessed_text'])

In [65]:
# returns a sparse matrix
df_tfidf

<48895x500 sparse matrix of type '<class 'numpy.float64'>'
	with 202268 stored elements in Compressed Sparse Row format>

In [67]:
# converting sparse matrix to dense matrix and pandas dataframe

df_tfidf = pd.DataFrame(df_tfidf.todense())

In [71]:
# num of rows match 
X.shape, df_tfidf.shape

((48895, 233), (48895, 500))

In [72]:
# concatenating the dataframe
merged_df = pd.concat([X, df_tfidf], axis=1)

In [73]:
# shape of the merged dataframes
merged_df.shape

(48895, 733)

In [75]:
# we have a large number of rows so let's create a mini dataset

mini_df = merged_df.iloc[:10000, :]
mini_y = y[:10000]
print(mini_df.shape, mini_y.shape)

(10000, 733) (10000,)


In [77]:
# splitting the dataset

X_train, X_test, y_train, y_test = train_test_split(mini_df, mini_y, test_size=0.1)

# checking the dimensions
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(9000, 733) (1000, 733) (9000,) (1000,)


In [83]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [80]:
# Initialize the model
rf = RandomForestRegressor()

# train the model
rf.fit(X_train, y_train)

RandomForestRegressor()

In [82]:
## doing good on the training set as expected
rf.score(X_train, y_train)

0.867311969114265

In [86]:
## Not good performance from the cross validation scores
cross_val_score(rf, X_train, y_train, n_jobs=-1)

array([-0.09099835,  0.01618483, -0.00669148, -0.15040398, -0.06197971])

In [91]:

print("Average cross validation score across all 5 folds:")
np.mean([-0.09099835,  0.01618483, -0.00669148, -0.15040398, -0.06197971])

Average cross validation score across all 5 folds:


-0.058777738

In [94]:
# Scoring on the test test is quite aweful

rf.score(X_test, y_test)

-0.30215338652714663

In [95]:
## Let's try gradient boosting to see if we can sense any life in tree based models

from sklearn.ensemble import GradientBoostingRegressor

In [97]:
# initialize the model

gb_model = GradientBoostingRegressor()

In [98]:
# cross validation score for gradient boosting

cv_score_gb = cross_val_score(gb_model, X_train, y_train)

In [100]:
## cross validation score for gradient boosting 

print('Cross validation score for gradient boosting:', np.mean(cv_score_gb))

Cross validation score for gradient boosting: -0.1344664927477363
