# 1. Data Preprocessing and Cleaning

## Load Dataset

In [None]:
from google.colab import drive
import os
import pandas as pd

# Mount Google Drive so the project folder is reachable from the Colab VM
drive.mount('/content/drive', force_remount=True)

# Define project paths; the data folder is derived from the project root so the
# location is spelled out in exactly one place.
project_dir = '/content/drive/MyDrive/4awesome/'
data_dir = os.path.join(project_dir, 'Data')

# Load the raw reviews table
reviews = pd.read_csv(os.path.join(data_dir, 'reviews.csv'))

Mounted at /content/drive


## Google Maps Restaurant Reviews Dataset
* business_name: Name of the restaurant
* author_name: Name of the reviewer
* text: Text of the review
* photo: File path of the photo attached to the review
* rating: Star rating given in the review (integer from 1 to 5)
* rating_category: Target variable to predict; the category associated with the review's photo (e.g. taste, menu, indoor_atmosphere, outdoor_atmosphere)


## Inspect Dataset

In [None]:
# Column names, dtypes and non-null counts of the raw reviews table
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    1100 non-null   object
 1   author_name      1100 non-null   object
 2   text             1100 non-null   object
 3   photo            1100 non-null   object
 4   rating           1100 non-null   int64 
 5   rating_category  1100 non-null   object
dtypes: int64(1), object(5)
memory usage: 51.7+ KB


In [None]:
# Summary statistics of the numeric columns (only `rating` is numeric here)
reviews.describe()

Unnamed: 0,rating
count,1100.0
mean,3.912727
std,1.218459
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [None]:
# Preview the first five rows
reviews.head()

Unnamed: 0,business_name,author_name,text,photo,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu


In [None]:
# (number of rows, number of columns) of the raw table
reviews.shape

(1100, 6)

## Data Preprocessing and Cleaning

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import string

# Fetch the NLTK corpora used later in this notebook: the English stop-word list
# and the WordNet data that WordNetLemmatizer relies on.
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Handling duplicates and missing values

In [None]:
# Remove exact duplicate rows first, then any row whose review text is missing
reviews = reviews.drop_duplicates().dropna(subset=["text"])
print("After cleaning shape:", reviews.shape)  # Dataset has no rows with duplicate/missing values

After cleaning shape: (1100, 6)


### Preprocessing text column (Lemmatization)

In [None]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalize one review for downstream text features.

    Steps, in order: lowercase; remove URLs, e-mail addresses, phone-number-like
    digit runs and @mentions; split on whitespace; drop English stop words;
    lemmatize each remaining token with WordNet.

    Args:
        text: Raw review text. Missing values (None/NaN) are accepted.

    Returns:
        The surviving tokens joined by single spaces, or "" for missing input.

    Note:
        Tokens come from plain whitespace splitting, so punctuation stays attached
        to words. A stop word followed by punctuation (e.g. "it.") is therefore
        not recognized as a stop word and is kept.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()

    # Remove URLs; "http\S+" already covers "https..." links
    text = re.sub(r"http\S+|www\S+", "", text)
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Remove e-mail addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Remove phone numbers: 3-3-4 digits with optional "-" / "." separators, or 10 bare digits
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', text)
    text = re.sub(r'\b\d{10}\b', '', text)
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Tokenize, filter stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(tok) for tok in text.split() if tok not in stop_words]
    return " ".join(tokens)


reviews["cleaned_text"] = reviews["text"].apply(clean_text)

reviews.head()


Unnamed: 0,business_name,author_name,text,photo,rating,rating_category,cleaned_text
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste,went marmaris wife holiday. chose restaurant p...
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu,holiday marmaris ate fit food. really good foo...
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere,price affordable. menu photo cost 108 liras. w...
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere,turkey's cheapest artisan restaurant food deli...
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu,know look term price performance point; taste;...


In [None]:
### to show examples of cleaned_text

# Lift the column-width limit only for this display. option_context restores the
# previous setting on exit (even if display() raises), instead of assuming it was 50.
with pd.option_context('display.max_colwidth', None):
    display(reviews["cleaned_text"])

Unnamed: 0,cleaned_text
0,went marmaris wife holiday. chose restaurant place dinner based review wanted juicy food. first went serious queue. proceed taking food want form open buffet. vegetable dish meat dish plentiful. also dessert wanted it. get want pay cashier. go card work cash. lot food variety. food price unbelievably cheap. paid 84 tl meal here. included buttermilk bread. unfortunately can't say clean place..
1,holiday marmaris ate fit food. really good food cheap nice. eating much bread want big plus satisfied without bread. place recommend go marmaris. july 1 small increase even price hike cheap. leave photo latest price breakfast below. serious queue. proceed taking food want form open buffet. vegetable dish meat dish plentiful. also dessert wanted it. get want pay cashier. go card work cash. lot food variety. food price unbelievably cheap. paid 84 tl meal here. included buttermilk bread. unfortunately can't say clean place..
2,price affordable. menu photo cost 108 liras. wait 10-15 minute food. staff annoying. well taste good. boiled meat delicious.
3,turkey's cheapest artisan restaurant food delicious!
4,know look term price performance point; taste; yigit restaurant writes big plus come work region.
...,...
1095,many type pizza; surprised one want taste. found successful term taste. atmosphere service good.
1096,tried smoked ribeye pizza; dough thin tasty.
1097,crowded expensive place.
1098,bad. crowded; lighting outside; could look menu phone flashlights.


## Add columns for text length and cleaned_text length

In [None]:
def count_words(value):
    """Return the number of whitespace-separated tokens in ``value``."""
    return len(value.split())


# Word counts of the raw and the cleaned review text
reviews["text_length"] = reviews["text"].map(count_words)
reviews["cleaned_text_length"] = reviews["cleaned_text"].map(count_words)
reviews.head()

Unnamed: 0,business_name,author_name,text,photo,rating,rating_category,cleaned_text,text_length,cleaned_text_length
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste,went marmaris wife holiday. chose restaurant p...,130,61
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu,holiday marmaris ate fit food. really good foo...,179,83
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere,price affordable. menu photo cost 108 liras. w...,31,19
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere,turkey's cheapest artisan restaurant food deli...,9,6
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu,know look term price performance point; taste;...,31,15


## Keep only English reviews

In [None]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/981.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=779e964b030f03bdb8b58be8ea199b2a417a0dc67154b5eaad17e36b306e7145
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f0

In [None]:
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic by default; a fixed seed makes the filter reproducible.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the langdetect language code for ``text``, or "unknown" if it cannot be detected."""
    if not isinstance(text, str):
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        # langdetect raises when it finds no usable features in the text;
        # treat such rows as non-English instead of aborting the whole apply.
        return "unknown"


print("before cleaning shape:", reviews.shape)
reviews["lang"] = reviews["text"].apply(detect_language)
# keep English; copy so that re-running this cell does not assign into a slice
reviews = reviews[reviews["lang"] == "en"].copy()
print("After cleaning shape:", reviews.shape)

before cleaning shape: (1100, 9)
After cleaning shape: (1068, 10)


## Dropping unnecessary columns

In [None]:
# Dropping author_name and photo columns
cleaned_reviews = reviews.drop(["author_name", "photo", "lang"], axis=1)
cleaned_reviews.head()

Unnamed: 0,business_name,text,rating,rating_category,cleaned_text,text_length,cleaned_text_length
0,Haci'nin Yeri - Yigit Lokantasi,We went to Marmaris with my wife for a holiday...,5,taste,went marmaris wife holiday. chose restaurant p...,130,61
1,Haci'nin Yeri - Yigit Lokantasi,During my holiday in Marmaris we ate here to f...,4,menu,holiday marmaris ate fit food. really good foo...,179,83
2,Haci'nin Yeri - Yigit Lokantasi,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere,price affordable. menu photo cost 108 liras. w...,31,19
3,Haci'nin Yeri - Yigit Lokantasi,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere,turkey's cheapest artisan restaurant food deli...,9,6
4,Haci'nin Yeri - Yigit Lokantasi,I don't know what you will look for in terms o...,3,menu,know look term price performance point; taste;...,31,15


In [None]:
# Write the cleaned table next to the raw data, reusing data_dir from the load cell
# rather than repeating the absolute Drive path.
cleaned_reviews.to_csv(os.path.join(data_dir, "cleaned_reviews.csv"), index=False)