## Import Libraries

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Reading Translated reviews file reviews_translated.csv data and loading it in Dataframe

In [2]:
reviews_file = 'data/reviews_translated.csv'
transalated_reviews_df = pd.read_csv(reviews_file)

display(transalated_reviews_df)

Unnamed: 0.1,Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,translated_comments
0,0,44077,203703,2011-03-20,393348,Christian,We enjoyed our stay very much. The room was co...,we enjoyed our stay very much. the room was co...,we enjoyed our stay very much. the room was co...
1,1,44077,211369,2011-03-28,444004,Solidea,We have been here 4 nights. Stay in a home is ...,we have been here 4 nights. stay in a home is ...,we have been here 4 nights. stay in a home is ...
2,2,44077,234215,2011-04-21,465058,Michael And Isabelle,Teresa and Hughie were great hosts. They were ...,teresa and hughie were great hosts. they were ...,teresa and hughie were great hosts. they were ...
3,3,44077,261843,2011-05-13,490005,Weston,"No surprises, was as described. Very gracious...","no surprises, was as described. very gracious ...","no surprises, was as described. very gracious ..."
4,4,44077,268148,2011-05-17,520460,Barbara,"Teresa was a lovely hostess, and we had a deli...","teresa was a lovely hostess, and we had a deli...","teresa was a lovely hostess, and we had a deli..."
...,...,...,...,...,...,...,...,...,...
243178,243178,706148275480196839,710688960996064975,2022-09-07,271971647,Chiara,"Ottima posizione, gentilezza e cortesia!","ottima posizione, gentilezza e cortesia!","excellent location, kindness and courtesy!"
243179,243179,706287276585342998,709980346451047198,2022-09-06,89845537,Kathy,Jenny was able to get us in last minute and ex...,jenny was able to get us in last minute and ex...,jenny was able to get us in last minute and ex...
243180,243180,706495821581154410,713622038204769661,2022-09-11,27973369,Phillip,Very spacious; owners communicative. Only issu...,very spacious; owners communicative. only issu...,very spacious; owners communicative. only issu...
243181,243181,707685389742134998,712895789915246258,2022-09-10,302958930,Jacob,What a great host couple and great spot. Super...,what a great host couple and great spot. super...,what a great host couple and great spot. super...


## Removing Unnamed: 0 column from dataframe

In [3]:
transalated_reviews_df.drop('Unnamed: 0', axis=1, inplace=True)
display(transalated_reviews_df)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,translated_comments
0,44077,203703,2011-03-20,393348,Christian,We enjoyed our stay very much. The room was co...,we enjoyed our stay very much. the room was co...,we enjoyed our stay very much. the room was co...
1,44077,211369,2011-03-28,444004,Solidea,We have been here 4 nights. Stay in a home is ...,we have been here 4 nights. stay in a home is ...,we have been here 4 nights. stay in a home is ...
2,44077,234215,2011-04-21,465058,Michael And Isabelle,Teresa and Hughie were great hosts. They were ...,teresa and hughie were great hosts. they were ...,teresa and hughie were great hosts. they were ...
3,44077,261843,2011-05-13,490005,Weston,"No surprises, was as described. Very gracious...","no surprises, was as described. very gracious ...","no surprises, was as described. very gracious ..."
4,44077,268148,2011-05-17,520460,Barbara,"Teresa was a lovely hostess, and we had a deli...","teresa was a lovely hostess, and we had a deli...","teresa was a lovely hostess, and we had a deli..."
...,...,...,...,...,...,...,...,...
243178,706148275480196839,710688960996064975,2022-09-07,271971647,Chiara,"Ottima posizione, gentilezza e cortesia!","ottima posizione, gentilezza e cortesia!","excellent location, kindness and courtesy!"
243179,706287276585342998,709980346451047198,2022-09-06,89845537,Kathy,Jenny was able to get us in last minute and ex...,jenny was able to get us in last minute and ex...,jenny was able to get us in last minute and ex...
243180,706495821581154410,713622038204769661,2022-09-11,27973369,Phillip,Very spacious; owners communicative. Only issu...,very spacious; owners communicative. only issu...,very spacious; owners communicative. only issu...
243181,707685389742134998,712895789915246258,2022-09-10,302958930,Jacob,What a great host couple and great spot. Super...,what a great host couple and great spot. super...,what a great host couple and great spot. super...


## Confirming if there any null values in any column

In [4]:
transalated_reviews_df.isnull().sum()

listing_id               0
id                       0
date                     0
reviewer_id              0
reviewer_name            0
comments                18
clean_comments         151
translated_comments    549
dtype: int64

## Replacing null values with 'none' string

In [5]:
transalated_reviews_df.fillna('none', inplace=True)

## Confirming if there any null values in any column

In [6]:
transalated_reviews_df.isnull().sum()

listing_id             0
id                     0
date                   0
reviewer_id            0
reviewer_name          0
comments               0
clean_comments         0
translated_comments    0
dtype: int64

## Removing all Punctuations and Stopwords from the reviews

In [7]:
english_stopwords = set(stopwords.words("english"))

In [8]:
def remove_punctuation_stopwords(review) : 
    words = review.split()
    new_sentence = " "
    
    for word in words : 
        new_word = word.translate(str.maketrans('', '', string.punctuation))
        if new_word not in english_stopwords:
            new_sentence += new_word + " "
        
   
    return new_sentence.strip().lower()

In [9]:
transalated_reviews_df['tf_idf_reviews'] = transalated_reviews_df['translated_comments'].astype(str).apply(remove_punctuation_stopwords)


In [10]:
display(transalated_reviews_df)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,translated_comments,tf_idf_reviews
0,44077,203703,2011-03-20,393348,Christian,We enjoyed our stay very much. The room was co...,we enjoyed our stay very much. the room was co...,we enjoyed our stay very much. the room was co...,enjoyed stay much room comfortable neat clean ...
1,44077,211369,2011-03-28,444004,Solidea,We have been here 4 nights. Stay in a home is ...,we have been here 4 nights. stay in a home is ...,we have been here 4 nights. stay in a home is ...,4 nights stay home best way improve englishver...
2,44077,234215,2011-04-21,465058,Michael And Isabelle,Teresa and Hughie were great hosts. They were ...,teresa and hughie were great hosts. they were ...,teresa and hughie were great hosts. they were ...,teresa hughie great hosts welcoming us childre...
3,44077,261843,2011-05-13,490005,Weston,"No surprises, was as described. Very gracious...","no surprises, was as described. very gracious ...","no surprises, was as described. very gracious ...",surprises described gracious host nice place s...
4,44077,268148,2011-05-17,520460,Barbara,"Teresa was a lovely hostess, and we had a deli...","teresa was a lovely hostess, and we had a deli...","teresa was a lovely hostess, and we had a deli...",teresa lovely hostess delightful stay cottage ...
...,...,...,...,...,...,...,...,...,...
243178,706148275480196839,710688960996064975,2022-09-07,271971647,Chiara,"Ottima posizione, gentilezza e cortesia!","ottima posizione, gentilezza e cortesia!","excellent location, kindness and courtesy!",excellent location kindness courtesy
243179,706287276585342998,709980346451047198,2022-09-06,89845537,Kathy,Jenny was able to get us in last minute and ex...,jenny was able to get us in last minute and ex...,jenny was able to get us in last minute and ex...,jenny able get us last minute extend stay last...
243180,706495821581154410,713622038204769661,2022-09-11,27973369,Phillip,Very spacious; owners communicative. Only issu...,very spacious; owners communicative. only issu...,very spacious; owners communicative. only issu...,spacious owners communicative issue wifi didnt...
243181,707685389742134998,712895789915246258,2022-09-10,302958930,Jacob,What a great host couple and great spot. Super...,what a great host couple and great spot. super...,what a great host couple and great spot. super...,great host couple great spot super clean brand...


## Applying TF-IDF technique to get important features

In [11]:
tf_idf_reviews_list = list(transalated_reviews_df['tf_idf_reviews'])

In [12]:
tf_idf_vectorizer = TfidfVectorizer(analyzer='word', use_idf=True, max_features=500, ngram_range=(1, 2))
tf_idf_reviews = tf_idf_vectorizer.fit_transform(tf_idf_reviews_list)
tf_idf_reviews_array = tf_idf_reviews.toarray()

In [13]:
tf_idf_column_names = [feature_name + '_tfidf_ftr' for feature_name in tf_idf_vectorizer.get_feature_names()]
tf_idf_review_df = pd.DataFrame(data = tf_idf_reviews_array, columns = tf_idf_column_names)
tf_idf_review_df['listing_id'] = transalated_reviews_df['listing_id']
display(tf_idf_review_df)

Unnamed: 0,10_tfidf_ftr,10 minutes_tfidf_ftr,100_tfidf_ftr,15_tfidf_ftr,20_tfidf_ftr,30_tfidf_ftr,able_tfidf_ftr,absolutely_tfidf_ftr,access_tfidf_ftr,accessible_tfidf_ftr,...,wonderful host_tfidf_ftr,work_tfidf_ftr,worked_tfidf_ftr,would_tfidf_ftr,would definitely_tfidf_ftr,would highly_tfidf_ftr,would recommend_tfidf_ftr,would stay_tfidf_ftr,youre_tfidf_ftr,listing_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.130134,0.000000,0.0,0.216544,0.0,0.0,44077
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,44077
2,0.0,0.0,0.0,0.0,0.0,0.0,0.186763,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,44077
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.195261,0.000000,0.0,0.324916,0.0,0.0,44077
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.256238,0.000000,0.0,0.000000,0.0,0.0,44077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243178,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,706148275480196839
243179,0.0,0.0,0.0,0.0,0.0,0.0,0.286590,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,706287276585342998
243180,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,706495821581154410
243181,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.101077,0.149678,0.0,0.000000,0.0,0.0,707685389742134998


## Grouping by Listing_ID and finding mean of TF-IDF features

In [14]:
tf_idf_features_df = tf_idf_review_df.groupby('listing_id').mean()
display(tf_idf_features_df)

Unnamed: 0_level_0,10_tfidf_ftr,10 minutes_tfidf_ftr,100_tfidf_ftr,15_tfidf_ftr,20_tfidf_ftr,30_tfidf_ftr,able_tfidf_ftr,absolutely_tfidf_ftr,access_tfidf_ftr,accessible_tfidf_ftr,...,wonderful_tfidf_ftr,wonderful host_tfidf_ftr,work_tfidf_ftr,worked_tfidf_ftr,would_tfidf_ftr,would definitely_tfidf_ftr,would highly_tfidf_ftr,would recommend_tfidf_ftr,would stay_tfidf_ftr,youre_tfidf_ftr
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44077,0.004663,0.002446,0.000000,0.014725,0.006068,0.003582,0.002419,0.008140,0.006169,0.006325,...,0.034061,0.004163,0.000000,0.004379,0.036848,0.008609,0.004841,0.008816,0.006540,0.002551
85156,0.006780,0.001983,0.000907,0.011244,0.002703,0.003059,0.001299,0.014373,0.000577,0.008387,...,0.033942,0.002535,0.000000,0.004024,0.027349,0.009746,0.007798,0.003896,0.006721,0.002471
159889,0.006429,0.002499,0.003418,0.007292,0.010938,0.006483,0.006822,0.005211,0.011620,0.003004,...,0.017880,0.002735,0.003237,0.001203,0.024377,0.007249,0.000524,0.012905,0.003262,0.007829
162809,0.004214,0.000000,0.004433,0.001089,0.003573,0.005012,0.004115,0.006906,0.005705,0.006170,...,0.013119,0.004768,0.000672,0.001287,0.022313,0.004064,0.006147,0.006321,0.005881,0.006164
165828,0.007357,0.002225,0.002213,0.010164,0.011786,0.000000,0.017447,0.000000,0.020526,0.015292,...,0.022630,0.004384,0.003114,0.001701,0.044431,0.002678,0.011711,0.029830,0.010904,0.007039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707685389742134998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.101077,0.149678,0.000000,0.000000,0.000000,0.000000
707825078259308780,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
708679904448712003,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
709451504510289772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.269414,0.000000,0.000000,0.000000,0.000000,0.000000


## Applying Sentiment Analysis to get Sentiment related features for reviews

In [15]:
reviews_sentiment_df = pd.DataFrame()
reviews_sentiment_df['listing_id'] = transalated_reviews_df['listing_id']

In [16]:
sentiment_analyzer = SentimentIntensityAnalyzer()
reviews_sentiment_df['reviews_sentiment'] = transalated_reviews_df['tf_idf_reviews'].apply(lambda review: sentiment_analyzer.polarity_scores(review))

In [17]:
reviews_sentiment_df['reviews_sentiment_postive'] = reviews_sentiment_df['reviews_sentiment'].apply(lambda review: review['pos'])
reviews_sentiment_df['reviews_sentiment_negative'] = reviews_sentiment_df['reviews_sentiment'].apply(lambda review: review['neg'])
reviews_sentiment_df['reviews_sentiment_neutral'] = reviews_sentiment_df['reviews_sentiment'].apply(lambda review: review['neu'])

In [18]:
reviews_sentiment_df.drop('reviews_sentiment', axis=1, inplace=True)
reviews_sentiment_df

Unnamed: 0,listing_id,reviews_sentiment_postive,reviews_sentiment_negative,reviews_sentiment_neutral
0,44077,0.470,0.108,0.422
1,44077,0.629,0.000,0.371
2,44077,0.446,0.000,0.554
3,44077,0.618,0.000,0.382
4,44077,0.606,0.000,0.394
...,...,...,...,...
243178,706148275480196839,0.902,0.000,0.098
243179,706287276585342998,0.367,0.000,0.633
243180,706495821581154410,0.078,0.137,0.785
243181,707685389742134998,0.728,0.000,0.272


## Grouping by Listing_ID and finding mean of Sentiment features

In [19]:
reviews_sentiment_features_df = reviews_sentiment_df.groupby('listing_id').mean()
display(reviews_sentiment_features_df)

Unnamed: 0_level_0,reviews_sentiment_postive,reviews_sentiment_negative,reviews_sentiment_neutral
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
44077,0.467950,0.015911,0.516147
85156,0.487905,0.015724,0.496352
159889,0.466158,0.017792,0.516045
162809,0.502457,0.018242,0.479325
165828,0.404859,0.022406,0.572719
...,...,...,...
707685389742134998,0.728000,0.000000,0.272000
707825078259308780,0.430000,0.000000,0.570000
708679904448712003,0.479000,0.000000,0.521000
709451504510289772,0.280000,0.000000,0.720000


## Merging Review Sentiment Features and TF-IDF Features

In [20]:
reviews_features_df = pd.merge(tf_idf_features_df, reviews_sentiment_features_df, on='listing_id', how='left')
display(reviews_features_df)

Unnamed: 0_level_0,10_tfidf_ftr,10 minutes_tfidf_ftr,100_tfidf_ftr,15_tfidf_ftr,20_tfidf_ftr,30_tfidf_ftr,able_tfidf_ftr,absolutely_tfidf_ftr,access_tfidf_ftr,accessible_tfidf_ftr,...,worked_tfidf_ftr,would_tfidf_ftr,would definitely_tfidf_ftr,would highly_tfidf_ftr,would recommend_tfidf_ftr,would stay_tfidf_ftr,youre_tfidf_ftr,reviews_sentiment_postive,reviews_sentiment_negative,reviews_sentiment_neutral
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44077,0.004663,0.002446,0.000000,0.014725,0.006068,0.003582,0.002419,0.008140,0.006169,0.006325,...,0.004379,0.036848,0.008609,0.004841,0.008816,0.006540,0.002551,0.467950,0.015911,0.516147
85156,0.006780,0.001983,0.000907,0.011244,0.002703,0.003059,0.001299,0.014373,0.000577,0.008387,...,0.004024,0.027349,0.009746,0.007798,0.003896,0.006721,0.002471,0.487905,0.015724,0.496352
159889,0.006429,0.002499,0.003418,0.007292,0.010938,0.006483,0.006822,0.005211,0.011620,0.003004,...,0.001203,0.024377,0.007249,0.000524,0.012905,0.003262,0.007829,0.466158,0.017792,0.516045
162809,0.004214,0.000000,0.004433,0.001089,0.003573,0.005012,0.004115,0.006906,0.005705,0.006170,...,0.001287,0.022313,0.004064,0.006147,0.006321,0.005881,0.006164,0.502457,0.018242,0.479325
165828,0.007357,0.002225,0.002213,0.010164,0.011786,0.000000,0.017447,0.000000,0.020526,0.015292,...,0.001701,0.044431,0.002678,0.011711,0.029830,0.010904,0.007039,0.404859,0.022406,0.572719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707685389742134998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.101077,0.149678,0.000000,0.000000,0.000000,0.000000,0.728000,0.000000,0.272000
707825078259308780,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.430000,0.000000,0.570000
708679904448712003,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.479000,0.000000,0.521000
709451504510289772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.269414,0.000000,0.000000,0.000000,0.000000,0.000000,0.280000,0.000000,0.720000


## Confirming Datatypes of the dataframe are all numeric

In [21]:
datatypes = pd.DataFrame(reviews_features_df.dtypes)
pd.set_option('display.max_rows', None)
display(datatypes)


Unnamed: 0,0
10_tfidf_ftr,float64
10 minutes_tfidf_ftr,float64
100_tfidf_ftr,float64
15_tfidf_ftr,float64
20_tfidf_ftr,float64
30_tfidf_ftr,float64
able_tfidf_ftr,float64
absolutely_tfidf_ftr,float64
access_tfidf_ftr,float64
accessible_tfidf_ftr,float64


## Confirming if there any null values in any column


In [22]:
reviews_features_df.isnull().sum()

10_tfidf_ftr                      0
10 minutes_tfidf_ftr              0
100_tfidf_ftr                     0
15_tfidf_ftr                      0
20_tfidf_ftr                      0
30_tfidf_ftr                      0
able_tfidf_ftr                    0
absolutely_tfidf_ftr              0
access_tfidf_ftr                  0
accessible_tfidf_ftr              0
accommodating_tfidf_ftr           0
accommodation_tfidf_ftr           0
across_tfidf_ftr                  0
advice_tfidf_ftr                  0
airbnb_tfidf_ftr                  0
airport_tfidf_ftr                 0
also_tfidf_ftr                    0
although_tfidf_ftr                0
always_tfidf_ftr                  0
amazing_tfidf_ftr                 0
amenities_tfidf_ftr               0
anyone_tfidf_ftr                  0
anything_tfidf_ftr                0
apartment_tfidf_ftr               0
apartment clean_tfidf_ftr         0
apartment great_tfidf_ftr         0
apartment well_tfidf_ftr          0
appreciated_tfidf_ftr       

## Storing final features set in csv

In [23]:
reviews_features_df.to_csv('data/review_features.csv')
