In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the restaurant reviews CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/restaurant-reviews/Restaurant reviews.csv')

# Keeps only the first 7,500 rows (index 0–7499) and drops every row after them.

In [2]:
# Keep only the first 7500 rows (positions 0-7499); the copy gives an
# independent frame, just as drop() returns a new object.
data = data.iloc[:7500].copy()

In [3]:
# Show the last five rows to check where the truncated frame now ends.
data.tail()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
7495,Faasos,Foodholic,Service 4/5\nVFM 4/5\nFood 4/5\n\nI was a grea...,4,"78 Reviews , 214 Followers",7/5/2018 13:51,1,
7496,Faasos,Clem,I wish Faasos would reconsider their prices. 1...,2,"27 Reviews , 3 Followers",7/4/2018 11:55,0,
7497,Faasos,Santhi Kumar,polite,4,2 Reviews,7/3/2018 23:14,0,
7498,Faasos,Sourav Chakraborty,everything was nice.. :),5,"1 Review , 25 Followers",7/3/2018 1:39,0,
7499,Faasos,Rakesh Manchikatla,worst,1,1 Review,7/2/2018 23:42,0,


# Discards every row that has a missing value in the 'Review' or 'Rating' column.

In [4]:
# Drop every row whose 'Review' or 'Rating' value is missing.
required_columns = ['Review', 'Rating']
data = data.dropna(subset=required_columns)

# Prints all unique values in 'Rating'.

In [5]:
# List the distinct raw values found in the 'Rating' column.
rating_values = data['Rating'].unique()
print(rating_values)

['5' '4' '1' '3' '2' '3.5' '4.5' '2.5' '1.5']


# Shows what datatype 'Rating' currently is.

In [6]:
# Summarize the 'Rating' Series: entry count, non-null count, and dtype.
data['Rating'].info()

<class 'pandas.core.series.Series'>
Index: 7496 entries, 0 to 7499
Series name: Rating
Non-Null Count  Dtype 
--------------  ----- 
7496 non-null   object
dtypes: object(1)
memory usage: 117.1+ KB


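# Converts 'Rating' from object (string) to float, then to integer. The integer cast truncates half-star ratings (e.g. 3.5 becomes 3, 4.5 becomes 4), which leaves the five classes 1–5.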

In [7]:
# Cast only the 'Rating' column to float; the other columns are left as they are.
data = data.astype({'Rating': float})

In [8]:
# Cast only the 'Rating' column to int; fractional parts are truncated by the cast.
data = data.astype({'Rating': int})

In [9]:
# Re-check the 'Rating' Series summary to confirm the dtype after the casts.
data['Rating'].info()

<class 'pandas.core.series.Series'>
Index: 7496 entries, 0 to 7499
Series name: Rating
Non-Null Count  Dtype
--------------  -----
7496 non-null   int64
dtypes: int64(1)
memory usage: 117.1 KB


In [10]:
# List the distinct 'Rating' values that remain after the integer cast.
rating_values = data['Rating'].unique()
print(rating_values)

[5 4 1 3 2]


# Assigns the columns "Review" and "Rating" to x and y variables.

In [11]:
# x holds the review text (model input); y holds the integer rating (target).
x, y = data["Review"], data["Rating"]

In [12]:
# Preview the review-text Series (pandas abbreviates long Series when printing).
print(x)

0       The ambience was good, food was quite good . h...
1       Ambience is too good for a pleasant evening. S...
2       A must try.. great food great ambience. Thnx f...
3       Soumen das and Arun was a great guy. Only beca...
4       Food is good.we ordered Kodi drumsticks and ba...
                              ...                        
7495    Service 4/5\nVFM 4/5\nFood 4/5\n\nI was a grea...
7496    I wish Faasos would reconsider their prices. 1...
7497                                               polite
7498                             everything was nice.. :)
7499                                                worst
Name: Review, Length: 7496, dtype: object


In [13]:
# Preview the rating Series (pandas abbreviates long Series when printing).
print(y)

0       5
1       5
2       5
3       5
4       5
       ..
7495    4
7496    2
7497    4
7498    5
7499    1
Name: Rating, Length: 7496, dtype: int64


# Turns the numeric ratings into categorical sentiment labels.

In [14]:
# Map each integer rating to its sentiment label. Any rating that is not
# 1-4 falls back to 'excellent', matching a catch-all final branch.
rating_labels = {
    1: 'very bad',
    2: 'bad',
    3: 'neutral',
    4: 'good',
}
category = [rating_labels.get(rating, 'excellent') for rating in y]

# Splits the data into training and testing sets.

In [16]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    category,
    test_size=0.2,
    random_state=42,
)

# Feature extraction to make the data compatible with our learning algorithm.

In [27]:
# Bag-of-words features: learn the vocabulary from the training reviews only,
# then reuse that same vocabulary to encode the test reviews.
vector = CountVectorizer(min_df = 1, lowercase = True)

X_train_vectorize = vector.fit_transform(X_train)
X_test_vectorize = vector.transform(X_test)

# The labels (Y_train / Y_test) are deliberately NOT vectorized: the classifier
# is fitted on the label strings directly, and calling fit_transform on them
# would overwrite the vocabulary learned from the reviews, leaving `vector`
# unable to encode new review text correctly.

# Logistic Regression is used because it copes well with the dataset's large size, and because it learns the relationship between the review text and its rating category.

In [28]:
# max_iter is raised to 1000 to give the solver more iterations to converge.
classifier_settings = {"max_iter": 1000}
model = LogisticRegression(**classifier_settings)

# Fit on the vectorized training reviews against their category labels.
model.fit(X=X_train_vectorize, y=Y_train)

# View the model's accuracy on the training set and on the testing set.

In [29]:
# Accuracy on the same reviews the model was fitted on.
train_predict = model.predict(X_train_vectorize)
train_accuracy = accuracy_score(y_true=Y_train, y_pred=train_predict)
print(train_accuracy)

0.9523015343562375


In [30]:
# Accuracy on the held-out reviews the model did not see during fitting.
test_predict = model.predict(X_test_vectorize)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=test_predict)
print(test_accuracy)

0.626


In [21]:
# Predict categories for the held-out reviews. This is the same call that
# produces test_predict; NOTE(review): Y_pred is not used in the cells shown here.
Y_pred = model.predict(X_test_vectorize)

# Prints a table comparing the actual training labels with the model's predictions on the training set.

In [22]:
# Side-by-side table of the true training labels and the model's predictions
# for those same training reviews.
comparison_columns = {'Actual': Y_train, 'Predicted': train_predict}
results = pd.DataFrame(comparison_columns)

print(results)

         Actual  Predicted
0     excellent  excellent
1     excellent  excellent
2           bad        bad
3     excellent  excellent
4          good       good
...         ...        ...
5991  excellent  excellent
5992  excellent  excellent
5993   very bad   very bad
5994    neutral    neutral
5995       good       good

[5996 rows x 2 columns]
