In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

%matplotlib inline

# EXTRACTING GENERAL INFORMATION

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load train.csv into a DataFrame.
# NOTE(review): "train.csv" is a relative path, so it is resolved against the
# current working directory rather than the Drive mount; confirm the file is there.
Review_data = pd.read_csv("train.csv")

In [None]:
# (rows, columns) of the loaded DataFrame.
Review_data.shape

In [None]:
# Preview the first five rows.
Review_data.head(n=5)

In [None]:
# Column names, non-null counts and dtypes.
Review_data.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
Review_data.describe().T

# DATA CLEANING

In [None]:
# Missing values per column, as an absolute count and as a percentage of all rows.
null_counts = Review_data.isnull().sum()
count = null_counts.sort_values(ascending=False)
percentage = (null_counts / len(Review_data) * 100).sort_values(ascending=False)
missing_data = pd.concat(
    [count, percentage],
    axis=1,
    keys=['Count', 'Percentage'],
)
print(' Count and Percentage of missing values for the columns : ')
missing_data

In [None]:
# Share of each Is_Response class in percent, rounded to two decimals.
# Computed once and reused for both the printed table and the bar chart.
response_pct = round(Review_data.Is_Response.value_counts(normalize=True) * 100, 2)

# The label previously read "Default", which does not match the column shown.
print(' Percentage for Is_Response \n')
print(response_pct)

response_pct.plot(kind='bar')
plt.title('Percentage Distribution by review type')
plt.xlabel('Is_Response')
plt.ylabel('Percentage of reviews')
plt.show()

In [None]:
# Drop the columns that are not used as model input.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError for columns that were
# already removed.
Review_data = Review_data.drop(
    columns=['User_ID', 'Browser_Used', 'Device_Used'],
    errors='ignore',
)

In [None]:
import re
import string

def text_clean_1(text):
  """First cleaning pass for a review string.

  Lower-cases the text, removes any ``[...]`` bracketed spans, strips every
  character in ``string.punctuation`` and deletes tokens that contain a digit.

  Args:
    text: the raw review text (``str``).

  Returns:
    The cleaned string. Whitespace is left untouched, so removed tokens can
    leave consecutive spaces behind.
  """
  text = text.lower()
  # Raw strings keep the regex escapes from being interpreted as (invalid)
  # Python string escapes, which triggers warnings on current interpreters.
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  return text


# Alias kept so that existing ``.apply(cleaned_1)`` calls continue to work.
cleaned_1 = text_clean_1

In [None]:
# First cleaning pass: keep the cleaned text in its own column next to the raw one.
Review_data['Clean Description'] = Review_data['Description'].apply(cleaned_1)
Review_data.head(n=10)

In [None]:
def text_clean_2(text):
  """Second cleaning pass: strip quote marks, dots/ellipses and newlines.

  Args:
    text: review text (``str``), normally the output of ``text_clean_1``.

  Returns:
    The text with straight and typographic quotes, periods, the ellipsis
    character and newline characters removed.
  """
  # The previous pattern '[''""...]' was split by Python into the adjacent
  # literals '[' and '""...]', so the character class only ever matched '"'
  # and '.'. The class is now spelled out explicitly (typographic characters
  # via escapes) so single quotes, curly quotes and the ellipsis character
  # are removed as well.
  text = re.sub('[\'"\u2018\u2019\u201c\u201d\u2026.]', '', text)
  text = text.replace('\n', '')
  return text


# Alias kept so that existing ``.apply(cleaned_2)`` calls continue to work.
cleaned_2 = text_clean_2

In [None]:
# Second cleaning pass, applied to the output of the first pass.
Review_data['New_Clean_Description'] = Review_data['Clean Description'].apply(cleaned_2)
Review_data.head(n=10)

# MODEL TRAINING

In [None]:
from sklearn.model_selection import train_test_split

# Features are the fully cleaned review text; the target is the response label.
X = Review_data['New_Clean_Description']
Y = Review_data['Is_Response']

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=101,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Building blocks of the text classifier: TF-IDF features feeding a
# logistic-regression model.
tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver='lbfgs')

In [None]:
# Chain vectorisation and classification so that text can be passed straight
# to fit/predict.
model = Pipeline(
    steps=[
        ('vectorizer', tvec),
        ('classifier', clf2),
    ]
)
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out reviews.
pred = model.predict(x_test)
# scikit-learn expects (y_true, y_pred): rows are the actual labels and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
confusion_matrix(y_test, pred)

# MODEL EVALUATION METRICS

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of a repr full of "\n".
print(classification_report(y_test, pred))

In [None]:
from sklearn.metrics import accuracy_score, precision_score
# Metrics take (y_true, y_pred). Accuracy is symmetric, but precision is not:
# with the arguments swapped, the reported value was not the model's precision.
print("Accuracy: ", accuracy_score(y_test, pred))
print("Precision: ", precision_score(y_test, pred, average="weighted"))

# TRYING ON UNSEEN DATA

In [None]:
review_1 = ["I'm not satisfied with the management"]    # Review from User 1
# Apply the same two cleaning passes that were applied to the training text,
# so the vectorizer sees input in the form it was fitted on (e.g. contractions
# with the apostrophe removed).
review_1_clean = [cleaned_2(cleaned_1(text)) for text in review_1]
res = model.predict(review_1_clean)
res

In [None]:
review_2 = ["It was fantastic"]     # Review from User 2
# Apply the same two cleaning passes that were applied to the training text,
# so the vectorizer sees input in the form it was fitted on.
review_2_clean = [cleaned_2(cleaned_1(text)) for text in review_2]
res = model.predict(review_2_clean)
res

# G.U.I. APPLICATION

In [None]:
#@title Hotel Review {run : "auto"}
Give_Review =  "the stay was qui" #@param {type:"string"}
# Clean the form input with the same two passes used on the training text
# before handing it to the fitted pipeline.
res = model.predict([cleaned_2(cleaned_1(Give_Review))])
res