In [373]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [374]:
df = pd.read_csv("train.csv", encoding='ISO-8859-1', index_col='Id')
df

Unnamed: 0_level_0,Hotel_name,Review_Title,Review_Text,Rating
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0
...,...,...,...,...
2346,Hyatt Regency Chennai,,Most impressive service by staff in all areas....,80.0
2347,New Woodlands,Homely villa,New woodlands chennai which gave me a homely e...,71.0
2348,Samudra Residency,Nice accommodation and facilities,Awesome I liked the neatness and maintenance. ...,100.0
2349,The Residency Chennai,The Residency Good Centrally located Hotel,The overall experience was good. However the w...,80.0


In [375]:
df.isna().sum(axis=0)

Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

Nan только в Review_Title, и их примерно 10%. Скорее всего Nan получилось потому что тот кто сотавлял эту графу для своего отеля оставил его пустым. Чтож сделаем тоже самое

In [376]:
df['Review_Title'] = df['Review_Title'].fillna("")
df.isna().sum(axis=0)

Hotel_name      0
Review_Title    0
Review_Text     0
Rating          0
dtype: int64

Приведём вообще все к нижнему регистру

In [377]:
for column in df.columns:
  if column != "Rating":
    df[column] = df[column].apply(lambda x: x.lower())

In [378]:
df

Unnamed: 0_level_0,Hotel_name,Review_Title,Review_Text,Rating
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,park hyatt,refuge in chennai,excellent room and exercise facility. all arou...,80.0
1,hilton chennai,hilton chennai,very comfortable and felt safe. \r\nstaff were...,100.0
2,the royal regency,no worth the rating shown in websites. pricing...,not worth the rating shown. service is not goo...,71.0
3,rivera,good stay,"first of all nice & courteous staff, only one ...",86.0
4,park hyatt,needs improvement,overall ambience of the hotel is very good. in...,86.0
...,...,...,...,...
2346,hyatt regency chennai,,most impressive service by staff in all areas....,80.0
2347,new woodlands,homely villa,new woodlands chennai which gave me a homely e...,71.0
2348,samudra residency,nice accommodation and facilities,awesome i liked the neatness and maintenance. ...,100.0
2349,the residency chennai,the residency good centrally located hotel,the overall experience was good. however the w...,80.0


Попробуем уменьшить словарь

In [379]:
import re

def less_words(s, pattern_string):
  pattern = re.compile(pattern_string)
  return pattern.sub(' ', s)

In [380]:
def kill_smth(s, smth):
  for pr in smth:
    s = less_words(s, f'(^{pr} )|( {pr} )|( {pr}$)|(^{pr}$)')
  return s

In [381]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

stemmer = SnowballStemmer(language='english')
bad = set(stopwords.words('english'))

def kill_all(s):
  s = re.sub('[^a-z\s]', '', s) # Оставляем буквы и пробелы
  s = kill_smth(s, bad) # Убиваем всё плохое
  s = ' '.join([stemmer.stem(word) for word in s.split()]) # Убираем окончания
  return s

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [382]:
def count_str(count_words):
  def count_words_func(s, c=count_words):
    c |= set(list(s.split()))
    return s
  return count_words_func

In [383]:
count_words = set()
cat_columns = df.columns[:-1]
for column in cat_columns:
  df[column].apply(count_str(count_words))
len(count_words)

7486

In [384]:
for column in cat_columns:
  df[column] = df[column].apply(lambda x: x.lower())

In [385]:
for column in cat_columns:
  df[column] = df[column].apply(kill_all)

In [386]:
df

Unnamed: 0_level_0,Hotel_name,Review_Title,Review_Text,Rating
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,park hyatt,refug chennai,excel room exercis facil around atmospher calm...,80.0
1,hilton chennai,hilton chennai,comfort felt safe staff help respect breakfast...,100.0
2,royal regenc,worth rate shown websit price ok,worth rate shown servic good room well maintai...,71.0
3,rivera,good stay,first nice courteous staff one con stay time c...,86.0
4,park hyatt,need improv,overal ambienc hotel good room facil need impr...,86.0
...,...,...,...,...
2346,hyatt regenc chennai,,impress servic staff area good restaur fit cen...,80.0
2347,new woodland,home villa,new woodland chennai gave home experi luxuri t...,71.0
2348,samudra resid,nice accommod facil,awesom like neat mainten facil reason price ov...,100.0
2349,resid chennai,resid good central locat hotel,overal experi good howev wifi get disconnect o...,80.0


In [387]:
count_words = set()
for column in cat_columns:
  df[column].apply(count_str(count_words))
len(count_words)

3767

In [388]:
3767 / 7486

0.5032059845044082

Убрали примерно половину. Предлагаю на этом остановиться и начать обучаться

Применим word2vec

In [389]:
import io
import numpy as np

from tqdm import tqdm
from itertools import islice


def load_vectors(fname, limit):
    fin = io.open(fname, 'r', encoding = 'utf-8', newline = '\n', errors = 'ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(islice(fin, limit), total = limit):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

In [390]:
# Чтобы каждый раз не срабатывало
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -O crawl-300d-2M.vec.zip
# !unzip crawl-300d-2M.vec.zip
# vecs = load_vectors('crawl-300d-2M.vec', 100000)

In [391]:
def make_vector(s):
  vector = np.zeros(300)
  x = s.split()
  for w in x:
    vector += vecs.get(w, np.zeros(300))
  vector /= len(x)
  return vector * 100 # Чтобы не терять потом в точности

In [392]:
df_train = pd.DataFrame(df["Rating"])
for column in cat_columns:
  ser = df[column].apply(make_vector)
  table = np.zeros(300)
  for elem in ser:
    table = np.vstack((table, elem))
  new_columns = np.array([column + str(i) for i in range(table.shape[1])])
  df_train = pd.concat((pd.DataFrame(columns=new_columns, data=table[1:, :]), df_train), axis=1)
df_train

Unnamed: 0,Review_Text0,Review_Text1,Review_Text2,Review_Text3,Review_Text4,Review_Text5,Review_Text6,Review_Text7,Review_Text8,Review_Text9,...,Hotel_name291,Hotel_name292,Hotel_name293,Hotel_name294,Hotel_name295,Hotel_name296,Hotel_name297,Hotel_name298,Hotel_name299,Rating
0,-2.523043,-4.808696,-2.669565,3.490870,-8.296957,-1.488261,-6.276957,1.327826,11.140435,-0.710435,...,-4.235,4.505000,7.0750,-4.180,19.850000,-4.810000,-18.250,-1.425000,-7.0650,80.0
1,6.561765,-2.072941,-3.437059,-0.265882,-13.128235,-1.404118,-11.465294,-1.945294,2.105882,1.211765,...,25.995,4.045000,9.2250,6.225,6.070000,-15.370000,-6.210,-10.355000,-6.2550,100.0
2,-3.357143,3.380357,-0.737143,4.601429,-10.535714,-3.116786,-8.508571,5.278571,13.288571,-7.075357,...,-13.435,-21.765000,-3.4600,-22.480,3.250000,2.765000,0.295,3.040000,9.7000,71.0
3,-1.820476,-10.354762,-3.262381,4.927619,-8.604286,-0.442381,-2.608095,7.840952,-0.433333,-6.020952,...,0.000,0.000000,0.0000,0.000,0.000000,0.000000,0.000,0.000000,0.0000,86.0
4,1.038696,2.517826,-1.080000,-0.612609,-1.603478,-2.588261,-1.599130,6.686087,9.157826,-4.762174,...,-4.235,4.505000,7.0750,-4.180,19.850000,-4.810000,-18.250,-1.425000,-7.0650,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,-0.537222,2.010278,4.643056,2.795833,-8.768611,-0.582500,-1.358889,2.757500,7.488056,-3.872222,...,17.330,2.696667,6.1500,4.150,4.046667,-10.246667,-4.140,-6.903333,-4.1700,80.0
2347,-4.697778,-7.258889,-3.885833,-0.528056,-7.750556,1.158611,-5.636667,1.246389,11.302778,-2.721111,...,-21.555,-1.770000,14.2250,10.805,12.630000,-2.075000,2.190,1.240000,-16.0150,71.0
2348,-2.736316,-1.814737,-3.527895,2.110000,-13.991579,-3.124211,1.736316,2.931579,5.988947,-8.852105,...,0.000,0.000000,0.0000,0.000,0.000000,0.000000,0.000,0.000000,0.0000,100.0
2349,1.748182,1.126364,-4.137273,-4.776364,1.865455,-2.760000,-8.265455,10.003636,3.875455,0.384545,...,25.995,4.045000,9.2250,6.225,6.070000,-15.370000,-6.210,-10.355000,-6.2550,80.0


In [393]:
from sklearn.model_selection import train_test_split

X = df_train.drop('Rating', axis=1)
y = df_train['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=181)
X

Unnamed: 0,Review_Text0,Review_Text1,Review_Text2,Review_Text3,Review_Text4,Review_Text5,Review_Text6,Review_Text7,Review_Text8,Review_Text9,...,Hotel_name290,Hotel_name291,Hotel_name292,Hotel_name293,Hotel_name294,Hotel_name295,Hotel_name296,Hotel_name297,Hotel_name298,Hotel_name299
0,-2.523043,-4.808696,-2.669565,3.490870,-8.296957,-1.488261,-6.276957,1.327826,11.140435,-0.710435,...,-3.440,-4.235,4.505000,7.0750,-4.180,19.850000,-4.810000,-18.250,-1.425000,-7.0650
1,6.561765,-2.072941,-3.437059,-0.265882,-13.128235,-1.404118,-11.465294,-1.945294,2.105882,1.211765,...,-4.740,25.995,4.045000,9.2250,6.225,6.070000,-15.370000,-6.210,-10.355000,-6.2550
2,-3.357143,3.380357,-0.737143,4.601429,-10.535714,-3.116786,-8.508571,5.278571,13.288571,-7.075357,...,18.735,-13.435,-21.765000,-3.4600,-22.480,3.250000,2.765000,0.295,3.040000,9.7000
3,-1.820476,-10.354762,-3.262381,4.927619,-8.604286,-0.442381,-2.608095,7.840952,-0.433333,-6.020952,...,0.000,0.000,0.000000,0.0000,0.000,0.000000,0.000000,0.000,0.000000,0.0000
4,1.038696,2.517826,-1.080000,-0.612609,-1.603478,-2.588261,-1.599130,6.686087,9.157826,-4.762174,...,-3.440,-4.235,4.505000,7.0750,-4.180,19.850000,-4.810000,-18.250,-1.425000,-7.0650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,-0.537222,2.010278,4.643056,2.795833,-8.768611,-0.582500,-1.358889,2.757500,7.488056,-3.872222,...,-3.160,17.330,2.696667,6.1500,4.150,4.046667,-10.246667,-4.140,-6.903333,-4.1700
2347,-4.697778,-7.258889,-3.885833,-0.528056,-7.750556,1.158611,-5.636667,1.246389,11.302778,-2.721111,...,18.060,-21.555,-1.770000,14.2250,10.805,12.630000,-2.075000,2.190,1.240000,-16.0150
2348,-2.736316,-1.814737,-3.527895,2.110000,-13.991579,-3.124211,1.736316,2.931579,5.988947,-8.852105,...,0.000,0.000,0.000000,0.0000,0.000,0.000000,0.000000,0.000,0.000000,0.0000
2349,1.748182,1.126364,-4.137273,-4.776364,1.865455,-2.760000,-8.265455,10.003636,3.875455,0.384545,...,-4.740,25.995,4.045000,9.2250,6.225,6.070000,-15.370000,-6.210,-10.355000,-6.2550


In [394]:
for column in X_train.columns:
  X_train[column] = X_train[column].fillna(0)
  X_test[column] = X_test[column].fillna(0)
# Я не понимаю где они и ка, но Лин. Регрессия ругаеться

Теперь мы получили обычный датасет с числами обучим его через CatBoost

In [395]:
#!pip install catboost
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

clf = CatBoostRegressor()
clf.fit(X_train, y_train)
print(r2_score(y_test, clf.predict(X_test)))

Learning rate set to 0.045237
0:	learn: 20.8968475	total: 381ms	remaining: 6m 20s
1:	learn: 20.7066039	total: 609ms	remaining: 5m 3s
2:	learn: 20.5251833	total: 839ms	remaining: 4m 38s
3:	learn: 20.3285077	total: 1.05s	remaining: 4m 22s
4:	learn: 20.1560960	total: 1.27s	remaining: 4m 12s
5:	learn: 19.9976440	total: 1.48s	remaining: 4m 5s
6:	learn: 19.8264555	total: 1.69s	remaining: 4m
7:	learn: 19.6787777	total: 1.92s	remaining: 3m 57s
8:	learn: 19.5144219	total: 2.13s	remaining: 3m 54s
9:	learn: 19.3809488	total: 2.35s	remaining: 3m 52s
10:	learn: 19.2362892	total: 2.58s	remaining: 3m 51s
11:	learn: 19.0906867	total: 2.79s	remaining: 3m 49s
12:	learn: 18.9697646	total: 3.02s	remaining: 3m 49s
13:	learn: 18.8374850	total: 3.23s	remaining: 3m 47s
14:	learn: 18.7144361	total: 3.45s	remaining: 3m 46s
15:	learn: 18.5916728	total: 3.67s	remaining: 3m 45s
16:	learn: 18.4849766	total: 3.89s	remaining: 3m 45s
17:	learn: 18.3582026	total: 4.11s	remaining: 3m 44s
18:	learn: 18.2438629	total: 4.3

Теперь обычную регрессию

In [396]:
from sklearn.linear_model import LinearRegression

lg = LinearRegression()
lg.fit(X_train, y_train)
print(r2_score(y_test, lg.predict(X_test)))

-4.1382299025748276e+20


Оу шит, всё очень плохо

Поробуем без отелей, так как их названия наверное больше мешают чем помогают

In [397]:
df_train_with_out_names = pd.DataFrame(df["Rating"])
for column in cat_columns:
  if column != "Hotel_name":
    ser = df[column].apply(make_vector)
    table = np.zeros(300)
    for elem in ser:
      table = np.vstack((table, elem))
    new_columns = np.array([column + str(i) for i in range(table.shape[1])])
    df_train_with_out_names = pd.concat((pd.DataFrame(columns=new_columns, data=table[1:, :]), df_train_with_out_names), axis=1)
df_train_with_out_names

Unnamed: 0,Review_Text0,Review_Text1,Review_Text2,Review_Text3,Review_Text4,Review_Text5,Review_Text6,Review_Text7,Review_Text8,Review_Text9,...,Review_Title291,Review_Title292,Review_Title293,Review_Title294,Review_Title295,Review_Title296,Review_Title297,Review_Title298,Review_Title299,Rating
0,-2.523043,-4.808696,-2.669565,3.490870,-8.296957,-1.488261,-6.276957,1.327826,11.140435,-0.710435,...,25.995000,4.045000,9.225000,6.225000,6.0700,-15.370000,-6.210000,-10.355000,-6.255000,80.0
1,6.561765,-2.072941,-3.437059,-0.265882,-13.128235,-1.404118,-11.465294,-1.945294,2.105882,1.211765,...,25.995000,4.045000,9.225000,6.225000,6.0700,-15.370000,-6.210000,-10.355000,-6.255000,100.0
2,-3.357143,3.380357,-0.737143,4.601429,-10.535714,-3.116786,-8.508571,5.278571,13.288571,-7.075357,...,0.301667,-7.431667,0.886667,-3.901667,-0.5200,-5.791667,6.360000,-3.233333,2.823333,71.0
3,-1.820476,-10.354762,-3.262381,4.927619,-8.604286,-0.442381,-2.608095,7.840952,-0.433333,-6.020952,...,2.620000,-8.480000,-6.860000,9.405000,-15.2750,1.670000,8.585000,-14.585000,2.530000,86.0
4,1.038696,2.517826,-1.080000,-0.612609,-1.603478,-2.588261,-1.599130,6.686087,9.157826,-4.762174,...,-27.065000,1.520000,-3.860000,-8.420000,10.0150,-1.780000,-12.775000,-15.475000,-6.170000,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,-0.537222,2.010278,4.643056,2.795833,-8.768611,-0.582500,-1.358889,2.757500,7.488056,-3.872222,...,,,,,,,,,,80.0
2347,-4.697778,-7.258889,-3.885833,-0.528056,-7.750556,1.158611,-5.636667,1.246389,11.302778,-2.721111,...,22.820000,-22.930000,-1.355000,8.245000,42.4700,-8.005000,9.445000,8.735000,0.260000,71.0
2348,-2.736316,-1.814737,-3.527895,2.110000,-13.991579,-3.124211,1.736316,2.931579,5.988947,-8.852105,...,2.316667,3.603333,2.286667,5.583333,-13.6900,-2.260000,-1.913333,-4.946667,-5.740000,100.0
2349,1.748182,1.126364,-4.137273,-4.776364,1.865455,-2.760000,-8.265455,10.003636,3.875455,0.384545,...,3.488000,-14.288000,8.232000,0.032000,-1.7140,1.290000,-0.886000,1.590000,0.780000,80.0


In [398]:
from sklearn.model_selection import train_test_split

X = df_train_with_out_names.drop('Rating', axis=1)
y = df_train_with_out_names['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=181)
X

Unnamed: 0,Review_Text0,Review_Text1,Review_Text2,Review_Text3,Review_Text4,Review_Text5,Review_Text6,Review_Text7,Review_Text8,Review_Text9,...,Review_Title290,Review_Title291,Review_Title292,Review_Title293,Review_Title294,Review_Title295,Review_Title296,Review_Title297,Review_Title298,Review_Title299
0,-2.523043,-4.808696,-2.669565,3.490870,-8.296957,-1.488261,-6.276957,1.327826,11.140435,-0.710435,...,-4.740000,25.995000,4.045000,9.225000,6.225000,6.0700,-15.370000,-6.210000,-10.355000,-6.255000
1,6.561765,-2.072941,-3.437059,-0.265882,-13.128235,-1.404118,-11.465294,-1.945294,2.105882,1.211765,...,-4.740000,25.995000,4.045000,9.225000,6.225000,6.0700,-15.370000,-6.210000,-10.355000,-6.255000
2,-3.357143,3.380357,-0.737143,4.601429,-10.535714,-3.116786,-8.508571,5.278571,13.288571,-7.075357,...,-9.350000,0.301667,-7.431667,0.886667,-3.901667,-0.5200,-5.791667,6.360000,-3.233333,2.823333
3,-1.820476,-10.354762,-3.262381,4.927619,-8.604286,-0.442381,-2.608095,7.840952,-0.433333,-6.020952,...,4.830000,2.620000,-8.480000,-6.860000,9.405000,-15.2750,1.670000,8.585000,-14.585000,2.530000
4,1.038696,2.517826,-1.080000,-0.612609,-1.603478,-2.588261,-1.599130,6.686087,9.157826,-4.762174,...,-0.535000,-27.065000,1.520000,-3.860000,-8.420000,10.0150,-1.780000,-12.775000,-15.475000,-6.170000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,-0.537222,2.010278,4.643056,2.795833,-8.768611,-0.582500,-1.358889,2.757500,7.488056,-3.872222,...,,,,,,,,,,
2347,-4.697778,-7.258889,-3.885833,-0.528056,-7.750556,1.158611,-5.636667,1.246389,11.302778,-2.721111,...,-0.735000,22.820000,-22.930000,-1.355000,8.245000,42.4700,-8.005000,9.445000,8.735000,0.260000
2348,-2.736316,-1.814737,-3.527895,2.110000,-13.991579,-3.124211,1.736316,2.931579,5.988947,-8.852105,...,-3.796667,2.316667,3.603333,2.286667,5.583333,-13.6900,-2.260000,-1.913333,-4.946667,-5.740000
2349,1.748182,1.126364,-4.137273,-4.776364,1.865455,-2.760000,-8.265455,10.003636,3.875455,0.384545,...,-8.098000,3.488000,-14.288000,8.232000,0.032000,-1.7140,1.290000,-0.886000,1.590000,0.780000


In [399]:
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

clf_with_out_names = CatBoostRegressor()
clf_with_out_names.fit(X_train, y_train)
print(r2_score(y_test, clf_with_out_names.predict(X_test)))

Learning rate set to 0.045237
0:	learn: 20.8660226	total: 569ms	remaining: 9m 28s
1:	learn: 20.6890969	total: 858ms	remaining: 7m 8s
2:	learn: 20.4776349	total: 1.17s	remaining: 6m 28s
3:	learn: 20.2981839	total: 1.75s	remaining: 7m 15s
4:	learn: 20.1075777	total: 2.19s	remaining: 7m 15s
5:	learn: 19.9376175	total: 2.49s	remaining: 6m 52s
6:	learn: 19.7666869	total: 2.77s	remaining: 6m 33s
7:	learn: 19.6310119	total: 3.02s	remaining: 6m 14s
8:	learn: 19.4719227	total: 3.33s	remaining: 6m 6s
9:	learn: 19.3326202	total: 3.59s	remaining: 5m 55s
10:	learn: 19.1942578	total: 3.85s	remaining: 5m 45s
11:	learn: 19.0717153	total: 4.11s	remaining: 5m 38s
12:	learn: 18.9138374	total: 4.38s	remaining: 5m 32s
13:	learn: 18.7948309	total: 4.63s	remaining: 5m 25s
14:	learn: 18.6790548	total: 4.89s	remaining: 5m 21s
15:	learn: 18.5467504	total: 5.16s	remaining: 5m 17s
16:	learn: 18.4202669	total: 5.45s	remaining: 5m 15s
17:	learn: 18.3020815	total: 5.71s	remaining: 5m 11s
18:	learn: 18.1910358	total:

In [401]:
for column in X_train.columns:
  X_train[column] = X_train[column].fillna(0)
  X_test[column] = X_test[column].fillna(0)

In [402]:
lg_with_out_names = LinearRegression()
lg_with_out_names.fit(X_train, y_train)
print(r2_score(y_test, lg_with_out_names.predict(X_test)))

-0.1142060908434932


Ну получше конечно, но хуже чем константа)

Лучший был Catbbost с именами

Получим результат для теста

In [404]:
df = pd.read_csv("test.csv", encoding='ISO-8859-1', index_col='Id')
df

Unnamed: 0_level_0,Hotel_name,Review_Title,Review_Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2351,ITC Grand Chola,Mr Neeraj,On the night of my arrival from NY I had a min...
2352,Hotel Pandian,,Not so great. But it is still acceptable. Bit...
2353,Oyo Rooms Guindy Olympia Tech Park,Nice stay for corporate people,Been a good place to stay for people who visit...
2354,OYO Apartments Saidapet,Average hotel,Not worth of the money we paid.worst ac.no wat...
2355,Ramada Chennai Egmore,A good mid range corporate hotel,"A well located hotel, with decent sized rooms ..."
...,...,...,...
4698,Lemon Tree Chennai,Average stay,"Compared to other lemon tree stay, this was bi..."
4699,Oyo Rooms T Nagar Off Pondy Bazaar,location is not good. rude behavior. staff no ...,unpleasant stay. not easy task to reach. Morni...
4700,VGP Golden Beach Resort,,Quality of service is too bad. We arrived 12.3...
4701,The Park Chennai,Over rated and overpriced Hotel,I am not sure why someone wants to spend that ...


In [405]:
df['Review_Title'] = df['Review_Title'].fillna("")
for column in df.columns:
  if column != "Rating":
    df[column] = df[column].apply(lambda x: x.lower())
for column in cat_columns:
  df[column] = df[column].apply(kill_all)
df

Unnamed: 0_level_0,Hotel_name,Review_Title,Review_Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2351,itc grand chola,mr neeraj,night arriv ny minor problem room mr neeraj ex...
2352,hotel pandian,,great still accept bit high money wise
2353,oyo room guindi olympia tech park,nice stay corpor peopl,good place stay peopl visit olympia tech park ...
2354,oyo apart saidapet,averag hotel,worth money paidworst acno waterno clean room
2355,ramada chennai egmor,good mid rang corpor hotel,well locat hotel decent size room bit date dec...
...,...,...,...
4698,lemon tree chennai,averag stay,compar lemon tree stay bit disappoint compar r...
4699,oyo room nagar pondi bazaar,locat good rude behavior staff help,unpleas stay easi task reach morn bf good coff...
4700,vgp golden beach resort,,qualiti servic bad arriv pm actual check time ...
4701,park chennai,rate overpr hotel,sure someon want spend kind money disappoint


In [406]:
df_test = None
for column in cat_columns:
  ser = df[column].apply(make_vector)
  table = np.zeros(300)
  for elem in ser:
    table = np.vstack((table, elem))
  new_columns = np.array([column + str(i) for i in range(table.shape[1])])
  if (df_test is None):
    df_test = pd.DataFrame(columns=new_columns, data=table[1:, :])
  else:
    df_test = pd.concat((pd.DataFrame(columns=new_columns, data=table[1:, :]), df_test), axis=1)
df_test

Unnamed: 0,Review_Text0,Review_Text1,Review_Text2,Review_Text3,Review_Text4,Review_Text5,Review_Text6,Review_Text7,Review_Text8,Review_Text9,...,Hotel_name290,Hotel_name291,Hotel_name292,Hotel_name293,Hotel_name294,Hotel_name295,Hotel_name296,Hotel_name297,Hotel_name298,Hotel_name299
0,-1.893929,-5.764286,-2.848214,1.462857,-6.002857,2.708571,-3.171071,-2.551429,-3.741429,-0.977143,...,-4.850000,-3.573333,-10.093333,-6.316667,-2.370000,-2.283333,2.016667,6.230000,-3.900000,6.586667
1,-2.405714,-1.162857,-5.282857,-12.121429,-13.107143,-7.002857,-21.137143,17.011429,8.607143,-7.455714,...,-10.030000,19.885000,-12.165000,3.015000,-10.605000,23.945000,2.060000,11.255000,2.910000,6.950000
2,0.311471,-11.540000,-1.586765,1.725588,-9.910000,0.488529,-1.474706,5.800882,8.950882,0.475000,...,-2.390000,-10.470000,-2.031667,4.091667,2.805000,15.958333,-2.635000,-7.136667,4.590000,-6.260000
3,4.168571,1.297143,-3.740000,0.750000,-15.172857,-1.108571,-1.948571,14.368571,17.231429,-3.197143,...,-4.176667,-2.290000,2.330000,2.566667,-2.466667,2.023333,8.686667,0.950000,-19.276667,9.680000
4,0.657742,-5.105161,-0.637742,-4.252903,-9.109677,-3.531935,-5.973871,0.807419,5.355806,-2.013871,...,-3.160000,17.330000,2.696667,6.150000,4.150000,4.046667,-10.246667,-4.140000,-6.903333,-4.170000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2347,-1.110000,-3.378889,-0.758889,8.493333,-15.328889,-4.816667,-3.071111,6.308889,10.051111,-0.907778,...,-0.340000,19.900000,20.113333,9.250000,11.323333,27.346667,2.080000,-6.840000,8.000000,14.763333
2348,-1.145333,-9.578000,-11.784000,-1.658000,-2.284000,-1.305333,-9.046000,-0.218667,14.929333,0.356000,...,-7.572000,-2.674000,-10.052000,-0.544000,2.034000,12.358000,2.338000,0.286000,-1.460000,-7.652000
2349,0.275333,-16.092000,-8.530000,-5.988667,5.724000,-4.068667,6.351333,-4.772000,0.799333,-6.534667,...,-6.837500,14.657500,-14.015000,1.640000,-1.865000,18.220000,4.035000,-13.932500,-4.910000,26.147500
2350,-2.361429,-17.808571,3.395714,10.097143,-13.394286,-8.151429,-6.961429,18.958571,12.438571,-2.695714,...,-8.180000,21.760000,8.550000,16.300000,2.045000,25.920000,-20.180000,-24.460000,-11.780000,-13.320000


In [407]:
y_pred = clf.predict(df_test)
y_pred

array([98.84226959, 65.32309957, 81.43177904, ..., 47.51806007,
       40.64928677, 79.94334908])

In [415]:
res = pd.Series(y_pred, index=range(2351, 4703))
res

2351    98.842270
2352    65.323100
2353    81.431779
2354    55.841517
2355    76.779257
          ...    
4698    76.871731
4699    66.193430
4700    47.518060
4701    40.649287
4702    79.943349
Length: 2352, dtype: float64

In [416]:
df['Rating'] = res
df

Unnamed: 0_level_0,Hotel_name,Review_Title,Review_Text,Rating
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2351,itc grand chola,mr neeraj,night arriv ny minor problem room mr neeraj ex...,98.842270
2352,hotel pandian,,great still accept bit high money wise,65.323100
2353,oyo room guindi olympia tech park,nice stay corpor peopl,good place stay peopl visit olympia tech park ...,81.431779
2354,oyo apart saidapet,averag hotel,worth money paidworst acno waterno clean room,55.841517
2355,ramada chennai egmor,good mid rang corpor hotel,well locat hotel decent size room bit date dec...,76.779257
...,...,...,...,...
4698,lemon tree chennai,averag stay,compar lemon tree stay bit disappoint compar r...,76.871731
4699,oyo room nagar pondi bazaar,locat good rude behavior staff help,unpleas stay easi task reach morn bf good coff...,66.193430
4700,vgp golden beach resort,,qualiti servic bad arriv pm actual check time ...,47.518060
4701,park chennai,rate overpr hotel,sure someon want spend kind money disappoint,40.649287


In [419]:
df.to_csv('ans.csv', index=False)

In [422]:
from google.colab import files
files.download('ans.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>