## Imports

In [14]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.linear_model import LinearRegression

## Load data

In [15]:
in_path = './tripadvisor-review-prediction/train/in.tsv'
expected_path = './tripadvisor-review-prediction/train/expected.tsv'
df = pd.read_csv(in_path, sep='\t',  header=None, names=['review'])
df['rate'] = pd.read_csv(expected_path, sep='\t',  header=None)
print(df)

                                                  review  rate
0      Stayed here on business trips and the hotel is...     5
1      Spent two nights here for a wedding in Brookly...     5
2      Great place for a 3-night stay. Our king room ...     5
3      This is my favorite hotel in Chicago, and I've...     5
4      BEST. BREAKFAST. EVER. Couldn't have been happ...     5
...                                                  ...   ...
19995  Do not stay at this hotel. It is unclean, loud...     1
19996  I stayed here for 4 days recently and it wasnt...     1
19997  We arrived at 10pm to check in to the hotel. A...     1
19998  I agree with the other reviews that mentioned ...     1
19999  I go to NYC at least 5 times a year and have s...     1

[20000 rows x 2 columns]


## Preprocess data

In [16]:
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s!?]','',text)
    text = text.lower()
    words = word_tokenize(text) 
    filtered_words = [word for word in words if word not in stop_words] # remove stopwords
    stemmed_words = [stemmer.stem(word) for word in filtered_words] # stemming
    text = ' '.join(stemmed_words) # join back to string
    return text


df['processed_review'] = df['review'].apply(preprocess_text)
print(df['processed_review'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        stay busi trip hotel excel busi trip staff exc...
1        spent two night wed brooklyn staff member exce...
2        great place night stay king room overlook pool...
3        favorit hotel chicago ive stay top luxuri prop...
4        best breakfast ever couldnt happier food staff...
                               ...                        
19995    stay hotel unclean loud threaten atmospher wou...
19996    stay day recent wasnt pleasant stay let tell i...
19997    arriv pm check hotel check said go room plenti...
19998    agre review mention wet dog odor antiqu heat c...
19999    go nyc least time year stay variou hilton prop...
Name: processed_review, Length: 20000, dtype: object


### TF-IDF encoding and X_train, y_train assignment

In [17]:
tfidf_vectorizer = TfidfVectorizer(max_features=2000, min_df=3)

X_train = tfidf_vectorizer.fit_transform(df['processed_review']).toarray()
print(X_train.shape)

y_train = np.array(df['rate'])
print(y_train.shape)

(20000, 2000)
(20000,)


## Load and train model

In [18]:
model = LinearRegression()
model.fit(X_train, y_train)

## Load and prepare test data

In [19]:
in_test_path = './tripadvisor-review-prediction/test/in.tsv'
df_test = pd.read_csv(in_test_path, sep='\t',  header=None, names=['review'])

df_test['processed_review'] = df_test['review'].apply(preprocess_text)

X_test = tfidf_vectorizer.transform(df_test['processed_review']).toarray()
print(X_test.shape)

(20000, 2000)


## Predict and save to file

In [21]:
y_pred = model.predict(X_test)

y_pred_df = pd.DataFrame(y_pred, columns=['predictions'])

y_pred_df.to_csv('out.tsv', sep='\t', index=False,  header=False)

In [22]:
y_pred

array([4.54866737, 1.75016059, 1.05668085, ..., 3.0814122 , 3.0873582 ,
       3.92879142])