In [86]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
from sklearn.linear_model import SGDClassifier, SGDRegressor, LinearRegression
from sklearn import metrics
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import WordNetLemmatizer
from nltk import wordnet, pos_tag
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/kryzhikov/Sample-ML-Repo/main/11.%20Texts/nlp/train.csv', encoding='unicode_escape')

In [3]:
df = df.drop(['Id'], axis=1)

In [4]:
df

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating
0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0
...,...,...,...,...
2346,Hyatt Regency Chennai,,Most impressive service by staff in all areas....,80.0
2347,New Woodlands,Homely villa,New woodlands chennai which gave me a homely e...,71.0
2348,Samudra Residency,Nice accommodation and facilities,Awesome I liked the neatness and maintenance. ...,100.0
2349,The Residency Chennai,The Residency Good Centrally located Hotel,The overall experience was good. However the w...,80.0


# Новый раздел

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
sw_eng = set(stopwords.words('english'))

In [7]:
df1 = df.copy()

In [8]:
for i in range(df1.shape[0]):
  df1.at[i, 'Review_Text'] = df1.at[i, 'Review_Text'].lower()
  df1.at[i, 'rev_conv'] = ' '.join([word for word in df1.at[i, 'Review_Text'].split() if not word in sw_eng])

In [9]:
df1

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating,rev_conv
0,Park Hyatt,Refuge in Chennai,excellent room and exercise facility. all arou...,80.0,excellent room exercise facility. around atmos...
1,Hilton Chennai,Hilton Chennai,very comfortable and felt safe. \r\nstaff were...,100.0,comfortable felt safe. staff helpful respectfu...
2,The Royal Regency,No worth the rating shown in websites. Pricing...,not worth the rating shown. service is not goo...,71.0,worth rating shown. service good. room well ma...
3,Rivera,Good stay,"first of all nice & courteous staff, only one ...",86.0,"first nice & courteous staff, one con stay tim..."
4,Park Hyatt,Needs improvement,overall ambience of the hotel is very good. in...,86.0,overall ambience hotel good. room facilities n...
...,...,...,...,...,...
2346,Hyatt Regency Chennai,,most impressive service by staff in all areas....,80.0,impressive service staff areas. good restauran...
2347,New Woodlands,Homely villa,new woodlands chennai which gave me a homely e...,71.0,new woodlands chennai gave homely experience l...
2348,Samudra Residency,Nice accommodation and facilities,awesome i liked the neatness and maintenance. ...,100.0,awesome liked neatness maintenance. facilities...
2349,The Residency Chennai,The Residency Good Centrally located Hotel,the overall experience was good. however the w...,80.0,overall experience good. however wi-fi getting...


In [10]:
ex = r'[^\w\s]'
parser = re.compile(ex)

In [11]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')

In [12]:
d = {}
for i in range(df1.shape[0]):
  tmp = parser.sub(r'', df1.at[i, 'rev_conv'])
  df1.at[i, 'rev_conv'] = tmp
  df1.at[i, 'rev_stem'] = ' '.join([stemmer.stem(word) for word in tmp.split()])
  for i in [stemmer.stem(word) for word in tmp.split()]:
    if d.get(i, 0):
      d[i] += 1
    else:
      d[i] = 1

In [13]:
sort = sorted(d, key=d.get, reverse=True)
sort

['hotel',
 'good',
 'room',
 'stay',
 'servic',
 'staff',
 'food',
 'locat',
 'clean',
 'nice',
 'breakfast',
 'place',
 'great',
 'also',
 'time',
 'experi',
 'chennai',
 'facil',
 'excel',
 'overal',
 'provid',
 'help',
 'like',
 'friend',
 'restaur',
 'one',
 'need',
 'money',
 'would',
 'comfort',
 'well',
 'recommend',
 'near',
 'book',
 'visit',
 'busi',
 'work',
 'valu',
 'day',
 'citi',
 'realli',
 'check',
 'airport',
 'avail',
 'us',
 'best',
 'bad',
 'qualiti',
 'wifi',
 'enjoy',
 'night',
 'famili',
 'get',
 'recept',
 'small',
 'price',
 'bathroom',
 'travel',
 'improv',
 'neat',
 'even',
 'close',
 'maintain',
 'pleasant',
 'ok',
 'poor',
 'water',
 'look',
 'peopl',
 'much',
 'worth',
 'bed',
 'area',
 'ac',
 'old',
 'go',
 'spacious',
 'averag',
 'cleanli',
 'park',
 'everyth',
 'lot',
 '2',
 'decent',
 'trip',
 'awesom',
 'better',
 'buffet',
 'thing',
 'complimentari',
 'courteous',
 'make',
 'though',
 'front',
 'pool',
 'custom',
 'manag',
 'conveni',
 'amen',
 'com

In [14]:
good_words = ['good', 'clean', 'nice', 'great', 'excel', 'provid', 'like', 'enjoy', 'amaz']
bad_words = ['overal', 'price', 'problem', 'worst', 'old', 'expens', 'late']
df1[good_words] = np.zeros([df1.shape[0], len(good_words)], int)
df1[bad_words] = np.zeros([df1.shape[0], len(bad_words)], int)
words = np.concatenate((good_words, bad_words))

In [15]:
for i in range(df1.shape[0]):
  for j in words:
    parser = re.compile(j)
    if re.findall(j, df1.at[i, 'rev_stem']):
      df1.at[i, j] = 1

In [16]:
df1['good'].values

array([0, 0, 1, ..., 1, 1, 1])

In [17]:
df1

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating,rev_conv,rev_stem,good,clean,nice,great,...,like,enjoy,amaz,overal,price,problem,worst,old,expens,late
0,Park Hyatt,Refuge in Chennai,excellent room and exercise facility. all arou...,80.0,excellent room exercise facility around atmosp...,excel room exercis facil around atmospher calm...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hilton Chennai,Hilton Chennai,very comfortable and felt safe. \r\nstaff were...,100.0,comfortable felt safe staff helpful respectful...,comfort felt safe staff help respect breakfast...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,The Royal Regency,No worth the rating shown in websites. Pricing...,not worth the rating shown. service is not goo...,71.0,worth rating shown service good room well main...,worth rate shown servic good room well maintai...,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rivera,Good stay,"first of all nice & courteous staff, only one ...",86.0,first nice courteous staff one con stay time ...,first nice courteous staff one con stay time c...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Park Hyatt,Needs improvement,overall ambience of the hotel is very good. in...,86.0,overall ambience hotel good room facilities ne...,overal ambienc hotel good room facil need impr...,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,Hyatt Regency Chennai,,most impressive service by staff in all areas....,80.0,impressive service staff areas good restaurant...,impress servic staff area good restaur fit cen...,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2347,New Woodlands,Homely villa,new woodlands chennai which gave me a homely e...,71.0,new woodlands chennai gave homely experience l...,new woodland chennai gave home experi luxuri t...,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2348,Samudra Residency,Nice accommodation and facilities,awesome i liked the neatness and maintenance. ...,100.0,awesome liked neatness maintenance facilities ...,awesom like neat mainten facil reason price ov...,1,0,0,0,...,1,0,0,1,1,0,0,0,0,0
2349,The Residency Chennai,The Residency Good Centrally located Hotel,the overall experience was good. however the w...,80.0,overall experience good however wifi getting d...,overal experi good howev wifi get disconnect o...,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [18]:
avg_rat = df1.groupby('Hotel_name')['Rating'].describe()['mean']

In [19]:
for i in range(df1.shape[0]):
  df1.at[i, 'avg'] = avg_rat[df1.at[i, 'Hotel_name']]

In [20]:
X = df1.drop(['Hotel_name', 'Review_Title', 'Review_Text', 'rev_conv', 'rev_stem', 'Rating'], axis=1)
y = df1['Rating']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [22]:
X_train_sc = X_train.copy()
X_test_sc = X_test.copy()
for i in X_train.columns:
    X_train_sc[i] = (X_train_sc[i] - X_train_sc[i].mean())/X_train_sc[i].std()
    X_test_sc[i] = (X_test_sc[i] - X_test_sc[i].mean())/X_test_sc[i].std()

Закончили с data preprocessing, обучаем на Random forest

In [23]:
md = GridSearchCV(estimator = RandomForestRegressor(), param_grid = {'n_estimators': np.arange(50, 301, 100), 'max_depth': np.arange(5, 20, 6), 'max_features': np.arange(0.1, 0.8, 0.2),'random_state': [42]}, refit = True, scoring = 'neg_root_mean_squared_error', n_jobs =-1, cv = 5)
md.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': array([ 5, 11, 17]),
                         'max_features': array([0.1, 0.3, 0.5, 0.7]),
                         'n_estimators': array([ 50, 150, 250]),
                         'random_state': [42]},
             scoring='neg_root_mean_squared_error')

In [24]:
mean_squared_error(y_test, md.predict(X_test_sc), squared = False)

15.792019632780944

Теперь попробуем обучить модель без средних оценок

In [25]:
X_n = X.drop(['avg'], axis=1)


In [26]:
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_n, y, test_size = 0.2, random_state=42)

In [27]:
X_train_sc_n = X_train_n.copy()
X_test_sc_n = X_test_n.copy()
for i in X_train_n.columns:
    X_train_sc_n[i] = (X_train_sc_n[i] - X_train_sc_n[i].mean())/X_train_sc_n[i].std()
    X_test_sc_n[i] = (X_test_sc_n[i] - X_test_sc_n[i].mean())/X_test_sc_n[i].std()

In [28]:
md_n = GridSearchCV(estimator = RandomForestRegressor(), param_grid = {'n_estimators': np.arange(50, 301, 100), 'max_depth': np.arange(5, 20, 6), 'max_features': np.arange(0.1, 0.8, 0.2),'random_state': [42]}, refit = True, scoring = 'neg_root_mean_squared_error', n_jobs =-1, cv = 5)
md_n.fit(X_train_sc_n, y_train_n)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': array([ 5, 11, 17]),
                         'max_features': array([0.1, 0.3, 0.5, 0.7]),
                         'n_estimators': array([ 50, 150, 250]),
                         'random_state': [42]},
             scoring='neg_root_mean_squared_error')

In [29]:
mean_squared_error(y_test_n, md_n.predict(X_test_sc_n), squared = False)

18.19030996386363

Теперь попробуем CountVectorizer:

In [30]:
np.array(df1['Review_Text'])

array(['excellent room and exercise facility. all around atmosphere was calm and comfortable. main dining room offers both excellent food and service. avoid flying elephant restaurant, stick to the main dining room.',
       'very comfortable and felt safe. \r\nstaff were very helpful and respectful. breakfast offered a wide choice which cartered for every palate, started early and finished late.',
       'not worth the rating shown. service is not good. room not well maintained. room are spacious. illumination of room was very poor. tv remote not working. carry good mosquito repalent. location wise it is excellent who want to stay in near central chennai.',
       ...,
       'awesome i liked the neatness and maintenance. facilities reasonable prices. overall it is good happy to visit samudra.. sure if i have chance i will check in to samudra again for giving comfortable accommodation.',
       'the overall experience was good. however the wi-fi was getting disconnected often. we foun

In [31]:
vectorizer = CountVectorizer()
Xi = vectorizer.fit_transform(np.array(df1['rev_conv']))
print(vectorizer.vocabulary_)



In [32]:
x = Xi.todense()
x.shape

(2351, 4812)

In [33]:
Xn = df.drop(['Hotel_name', 'Review_Text', 'Review_Title'], axis = 1)
Xn[list(range(x.shape[1]))] = x

  self[col] = igetitem(value, i)


In [34]:
Xi

<2351x4812 sparse matrix of type '<class 'numpy.int64'>'
	with 44379 stored elements in Compressed Sparse Row format>

Не вижу смысла обучать модель на таких данных, т.к. уже фичей больше, чем объектов. Возможно, стоит убрать все векторы, обозначающие "неважные" слова (которые не отображают настроение рецензии), и редко встречающиеся слова, но тогда это мало чем будет отличаться от моих действий выше (где я выборочно брал за фичи самые частые слова, которые чётко характеризуют отметку рецензии)

Далее я буду обучать Tfidf так, как мне показалось верным сначала

In [35]:
idf_vect = TfidfVectorizer()
Z = idf_vect.fit_transform(np.array(df1['rev_conv']))
z = Z.todense()

In [36]:
z.shape

(2351, 4812)

In [37]:
threshold = float(np.median(z.mean(axis=0), axis=1))

In [38]:
Xz = df.drop(['Hotel_name', 'Review_Text', 'Review_Title'], axis = 1)
Xz[list(range(z.shape[1]))] = z

  self[col] = igetitem(value, i)


In [39]:
yz = Xz['Rating']
Xz = Xz.drop(['Rating'], axis=1)
del_list = []
for i in Xz.columns:
  if Xz[i].mean() < threshold:
    del_list.append(i)
Xz = Xz.drop(del_list, axis=1)


In [40]:
while Xz.shape[1] > 50:
  threshold = float(np.median(Xz.mean(axis=0)))
  del_list = []
  for i in Xz.columns:
    if Xz[i].mean() < threshold:
      del_list.append(i)
  Xz = Xz.drop(del_list, axis=1)

In [41]:
Xz

Unnamed: 0,247,277,688,743,876,910,924,988,1619,1650,...,3649,3694,3695,3833,4075,4104,4352,4572,4696,4779
0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.132633,0.240659,0.000000,...,0.132205,0.225155,0.0,0.084445,0.000000,0.000000,0.000000,0.0,0.000000,0.0
1,0.0,0.0,0.113465,0.0,0.000000,0.0,0.0,0.152343,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.092602,0.000000,0.000000,0.0,0.000000,0.0
2,0.0,0.0,0.000000,0.0,0.124436,0.0,0.0,0.000000,0.126480,0.000000,...,0.000000,0.236664,0.0,0.088762,0.000000,0.083351,0.000000,0.0,0.136162,0.0
3,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.092498,0.0,0.000000,0.099363,0.097731,0.297746,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.176912,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.060066,0.0,0.067584,0.064523,0.000000,0.096674,0.0,0.000000,0.0
2347,0.0,0.0,0.000000,0.0,0.085100,0.0,0.0,0.000000,0.000000,0.167745,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
2348,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.137203,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
2349,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.199332,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0


In [42]:
X_tr_1, X_tt_1, y_tr_1, y_tt_1 = train_test_split(Xz, yz, test_size = 0.2, random_state=42)

In [43]:
X_tr_1_ = X_tr_1.copy()
X_tt_1_ = X_tt_1.copy()
for i in Xz.columns:
  X_tr_1_[i] = (X_tr_1_[i] - X_tr_1_[i].mean())/X_tr_1_[i].std()
  X_tt_1_[i] = (X_tt_1_[i] - X_tt_1_[i].mean())/X_tt_1_[i].std()

In [44]:
md_1 = GridSearchCV(estimator = RandomForestRegressor(), param_grid = {'n_estimators': np.arange(50, 301, 100), 'max_depth': np.arange(5, 20, 6), 'max_features': np.arange(0.1, 0.8, 0.2),'random_state': [42]}, refit = True, scoring = 'neg_root_mean_squared_error', n_jobs =-1, cv = 5)
md_1.fit(X_tr_1_, y_tr_1)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': array([ 5, 11, 17]),
                         'max_features': array([0.1, 0.3, 0.5, 0.7]),
                         'n_estimators': array([ 50, 150, 250]),
                         'random_state': [42]},
             scoring='neg_root_mean_squared_error')

In [45]:
mean_squared_error(y_tt_1, md_1.predict(X_tt_1_), squared = False)

17.220823916434384

Теперь обучим так, как в лекции:

In [46]:
source = df1['Hotel_name'].replace(np.nan, '') + ' ' + df1['Review_Title'].replace(np.nan, '') + ' ' + df1['rev_conv'].replace(np.nan, '')

In [47]:
s_tr, s_tt, t_tr, t_tt = train_test_split(source, df1['Rating'], test_size = 0.2, random_state=42)

In [48]:
c_v = TfidfVectorizer(ngram_range=(5, 7), analyzer='char_wb')
X_tr_c = c_v.fit_transform(s_tr.values)
X_tr_c.shape
X_tt_c = c_v.transform(s_tt.values)
X_tt_c.shape
svm = SGDRegressor()
svm.fit(X_tr_c, t_tr)
predicted = svm.predict(X_tt_c)
print(mean_squared_error(t_tt, predicted, squared=False))

15.675674985469925




In [49]:
c_v = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
X_tr_c = c_v.fit_transform(s_tr.values)
X_tr_c.shape
X_tt_c = c_v.transform(s_tt.values)
X_tt_c.shape
svm = SGDRegressor()
svm.fit(X_tr_c, t_tr)
predicted = svm.predict(X_tt_c)
print(mean_squared_error(t_tt, predicted, squared=False))

18.662116544666034




Теперь пробуем word2vec:

In [50]:
import io
import numpy as np

from tqdm import tqdm
from itertools import islice


def load_vectors(fname, limit):
    fin = io.open(fname, 'r', encoding = 'utf-8', newline = '\n', errors = 'ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(islice(fin, limit), total = limit):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

In [51]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -O crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip

--2022-05-20 12:05:25--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2022-05-20 12:06:26 (24.1 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


In [52]:
v = load_vectors('crawl-300d-2M.vec', 1000000)

100%|██████████| 1000000/1000000 [01:36<00:00, 10371.55it/s]


In [53]:
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

In [54]:
def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

In [55]:
c, e = 0, 0
matrix = np.zeros([source.shape[0], 300])
for i in range(source.shape[0]):
  for j in my_lemmatizer(source[i]).split():
    try:
      matrix[i] += v[j]
      e += 1
    except KeyError:
      c += 1
c, e

(1397, 64583)

In [56]:
fin_tr, fin_tt, g_tr, g_tt = train_test_split(matrix, df1['Rating'], test_size=0.2)

In [87]:
sc = StandardScaler()
fin_tr = sc.fit_transform(fin_tr)
fin_tt = sc.transform(fin_tt)

In [92]:
svm1 = SGDRegressor()
svm1.fit(fin_tr, g_tr)
predicted1 = svm1.predict(fin_tt)
print(mean_squared_error(g_tt, predicted1, squared=False))

19.75333726204126


In [93]:
m = LinearRegression()
m.fit(fin_tr, g_tr)
pred = m.predict(fin_tt)
print(mean_squared_error(g_tt, pred, squared=False))

16.3412704976295


In [96]:
df_t = pd.read_csv('https://raw.githubusercontent.com/kryzhikov/Sample-ML-Repo/main/11.%20Texts/nlp/test.csv', encoding = 'unicode_escape')

In [97]:
df_t['Id']

0       2351
1       2352
2       2353
3       2354
4       2355
        ... 
2348    4698
2349    4699
2350    4700
2351    4701
2352    4702
Name: Id, Length: 2353, dtype: object

In [98]:
df_t = df_t.drop(['Id'], axis = 1)
df_t

Unnamed: 0,Hotel_name,Review_Title,Review_Text
0,ITC Grand Chola,Mr Neeraj,On the night of my arrival from NY I had a min...
1,Hotel Pandian,,Not so great. But it is still acceptable. Bit...
2,Oyo Rooms Guindy Olympia Tech Park,Nice stay for corporate people,Been a good place to stay for people who visit...
3,OYO Apartments Saidapet,Average hotel,Not worth of the money we paid.worst ac.no wat...
4,Ramada Chennai Egmore,A good mid range corporate hotel,"A well located hotel, with decent sized rooms ..."
...,...,...,...
2348,Lemon Tree Chennai,Average stay,"Compared to other lemon tree stay, this was bi..."
2349,Oyo Rooms T Nagar Off Pondy Bazaar,location is not good. rude behavior. staff no ...,unpleasant stay. not easy task to reach. Morni...
2350,VGP Golden Beach Resort,,Quality of service is too bad. We arrived 12.3...
2351,The Park Chennai,Over rated and overpriced Hotel,I am not sure why someone wants to spend that ...


In [99]:
source1 = df_t['Hotel_name'].replace(np.nan, '') + ' ' + df_t['Review_Title'].replace(np.nan, '') + ' ' + df_t['Review_Text'].replace(np.nan, '')
for i in range(source1.shape[0]):
  source1[i] = source1[i].lower()

In [100]:
c, e = 0, 0
matrix1 = np.zeros([source1.shape[0], 300])
for i in range(source1.shape[0]):
  for j in my_lemmatizer(source1[i]).split():
    try:
      matrix1[i] += v[j]
      e += 1
    except KeyError:
      c += 1
c, e

(5615, 99572)

In [101]:
matrix1

array([[-4.275 , -1.0689, -0.6302, ...,  1.6204,  0.7672,  3.3661],
       [-1.3155, -0.3654, -0.4255, ..., -1.1702, -1.3034,  0.8656],
       [-3.8134, -5.833 ,  1.3001, ..., -5.7953, -3.7168, -1.1358],
       ...,
       [ 0.2748, -4.6076, -0.4022, ..., -0.8686, -0.8082,  1.7301],
       [-1.7043, -3.4966,  0.9941, ..., -2.5968, -0.5001,  1.3433],
       [-3.3892, -2.7858,  1.4604, ..., -2.8917, -0.1583,  0.2306]])

In [102]:
sc1 = StandardScaler()
matrix1_sc = sc1.fit_transform(matrix1)

In [103]:
pred_1 = m.predict(matrix1_sc)

In [109]:
targ = pd.read_csv('https://raw.githubusercontent.com/kryzhikov/Sample-ML-Repo/main/11.%20Texts/nlp/sample%20submission.csv', encoding='unicode_escape')

In [114]:
targ = targ["Rating"]
targ[2352] = 73.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [115]:
targ.shape

(2353,)

In [116]:
mean_squared_error(targ, pred_1, squared=False)

16.18145093691868