# Reading Reviews File

In [1]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

In [2]:
# Load the raw reviews and discard the identifier/metadata columns listed here;
# every other column from the file is kept.
unused_columns = ['id', 'date', 'reviewer_id', 'reviewer_name']
df = pd.read_csv('reviews.csv').drop(columns=unused_columns)

# Translating non-English comments to English

In [3]:
import langdetect
def is_english(text):
    """Return True when ``langdetect`` classifies ``text`` as English.

    Parameters
    ----------
    text : object
        The value to classify, normally a review comment string.

    Returns
    -------
    bool
        True only when detection succeeds and the detected language code is
        ``'en'``. Non-string values (such as missing comments), blank
        strings, and text the detector fails on all yield False.
    """
    # Nothing to detect in a missing or blank comment: answer directly
    # instead of relying on the detector raising for these inputs.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return langdetect.detect(text) == 'en'
    except Exception:
        # Best-effort classification: any detector failure counts as
        # "not English". Catching Exception (rather than a bare except)
        # still lets KeyboardInterrupt/SystemExit stop a long-running scan.
        return False
    

# Classify each comment exactly once (detection is the expensive step), then
# keep the index labels of the rows that were not classified as English.
english_mask = df['comments'].apply(is_english)
non_english_rows = df.index[~english_mask.astype(bool)]

In [4]:
from googletrans import Translator
translator = Translator()
for index in non_english_rows:
    comment = df.loc[index, 'comments']
    # Missing comments end up in non_english_rows because is_english returns
    # False whenever detection fails, but there is no text to translate.
    # Leave them as they are so they can be filled with a placeholder later.
    if not isinstance(comment, str):
        continue
    df.loc[index, 'comments'] = translator.translate(comment, dest='en').text
df['translated'] = df['comments']
# Preview only: drop() returns a new frame and the result is not assigned,
# so df itself still contains the 'comments' column after this cell.
df.drop(columns = ['comments'])

Unnamed: 0,listing_id,translated
0,44077,We enjoyed our stay very much. The room was co...
1,44077,We have been here 4 nights. Stay in a home is ...
2,44077,Teresa and Hughie were great hosts. They were ...
3,44077,"No surprises, was as described. Very gracious..."
4,44077,"Teresa was a lovely hostess, and we had a deli..."
...,...,...
243178,706148275480196839,"Excellent location, kindness and courtesy!"
243179,706287276585342998,Jenny was able to get us in last minute and ex...
243180,706495821581154410,Very spacious; owners communicative. Only issu...
243181,707685389742134998,What a great host couple and great spot. Super...


# Clean Translated Column

In [5]:
#Clean Columns using Regular Expression
def clean_column(paragraph):
    """Normalize one review to lowercase ASCII letters and spaces.

    Parameters
    ----------
    paragraph : str
        The raw review text.

    Returns
    -------
    str
        The text with each HTML-like tag replaced by a single space, every
        remaining character that is not an ASCII letter or a space replaced
        by a space, and the result lowercased. Runs of spaces are not
        collapsed and the text is not stripped.
    """
    # Replace whole tags first so their names/attributes do not survive as words.
    paragraph = re.sub(r'<[^>]*>', ' ', paragraph)
    # After this pass only ASCII letters and spaces remain, so no separate
    # punctuation-stripping pass is needed.
    paragraph = re.sub(r'[^a-zA-Z ]', ' ', paragraph)
    return paragraph.lower()

In [6]:
# Give reviews without text an explicit placeholder. Assigning the filled
# column back avoids inplace=True and touches only 'translated'.
df['translated'] = df['translated'].fillna('No Review Given')

In [11]:
# 'translated' now holds the review text, so the source column is redundant.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
df = df.drop(columns=['comments'], errors='ignore')

In [13]:
# Clean every review in one pass, addressing the column by name. A positional
# index would silently target a different column if the column layout changed
# (for example, if 'comments' had not been dropped yet).
df['translated'] = df['translated'].map(clean_column)

In [14]:
# Display the cleaned frame; the rendered output elides the middle rows.
df

Unnamed: 0,listing_id,translated
0,44077,we enjoyed our stay very much the room was co...
1,44077,we have been here nights stay in a home is ...
2,44077,teresa and hughie were great hosts they were ...
3,44077,no surprises was as described very gracious...
4,44077,teresa was a lovely hostess and we had a deli...
...,...,...
243178,706148275480196839,excellent location kindness and courtesy
243179,706287276585342998,jenny was able to get us in last minute and ex...
243180,706495821581154410,very spacious owners communicative only issu...
243181,707685389742134998,what a great host couple and great spot super...


# Write the results to a CSV file

In [15]:
# Pin the width explicitly: plain `int` maps to the platform's default NumPy
# integer, which is 32-bit on some setups (e.g. Windows with NumPy < 2) and
# cannot hold the 18-digit listing ids present in this data.
df['listing_id'] = df['listing_id'].astype('int64')

In [16]:
# Save the cleaned reviews; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='Reviews_Final.csv', index=False)