In [None]:
!pip install numpy pandas
!pip install sklearn

In [None]:
import re
import string
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource the preprocessing in this cell relies on.
# 'punkt_tab' backs word_tokenize; without it a fresh environment cannot
# run the tokenization step.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the Yelp reviews; later cells read the 'text' and 'stars' columns.
yelp_df = pd.read_csv("yelp.csv")

# CLEAN TEXT

def clean(text):
  """Normalise a raw text string ahead of tokenization.

  Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
  (``string.punctuation``), then replace every remaining non-word character
  with a single space.

  Args:
    text: The raw text. Any value that is not a ``str`` (for example the
      float NaN pandas produces for an empty CSV field) is treated as empty.

  Returns:
    The cleaned, lowercased string, or ``''`` for non-string input.
  """
  # Non-string values have no .lower(); map them to empty text instead of
  # raising AttributeError in the middle of a Series.apply.
  if not isinstance(text, str):
    return ''
  text = text.lower()
  text = re.sub(r'\d+', '', text)
  # Punctuation is deleted, not replaced, so "don't" becomes "dont".
  text = text.translate(str.maketrans('', '', string.punctuation))
  # Whatever non-word characters remain (whitespace, non-ASCII symbols)
  # each become one space.
  text = re.sub(r'\W', ' ', text)
  return text

# Normalise every review, then split it into word tokens.
yelp_df['text'] = yelp_df['text'].apply(clean).apply(word_tokenize)

# English stop-word lookup; remove_stop reads this module-level set.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def remove_stop(text):
  """Return the tokens of ``text`` that are not in the global ``stop_words``.

  Args:
    text: An iterable of tokens.

  Returns:
    A new list holding the surviving tokens in their original order.
  """
  kept = []
  for token in text:
    if token in stop_words:
      continue
    kept.append(token)
  return kept

# Strip stop words from every tokenized review.
yelp_df['text'] = yelp_df['text'].map(remove_stop)

# Lemmatizer instance that lem() reads as a module-level global.
lemmatizer = WordNetLemmatizer()

def lem(text):
  """Lemmatize each token of ``text`` with the global ``lemmatizer``.

  Args:
    text: An iterable of tokens.

  Returns:
    A new list with one lemmatized token per input token, in order.
  """
  return list(map(lemmatizer.lemmatize, text))

# Lemmatize the tokens of every review.
yelp_df['text'] = yelp_df['text'].map(lem)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# change stars so that there are 3 categories

def change_stars(stars):
  """Collapse a star rating into one of three class labels.

  Args:
    stars: A numeric rating.

  Returns:
    0 when the rating is below 3, 2 when it is above 3, and 1 otherwise.
  """
  if stars < 3:
    return 0
  if stars > 3:
    return 2
  return 1


# Keep the untouched ratings in 'stars_raw' the first time this cell runs and
# always remap from that copy. Remapping 'stars' onto itself would turn every
# label into 0 on a second run, because 0, 1 and 2 are all below 3.
if "stars_raw" not in yelp_df.columns:
  yelp_df["stars_raw"] = yelp_df["stars"]

yelp_df["stars"] = yelp_df["stars_raw"].apply(change_stars)

In [None]:
#SAMPLING
from sklearn.model_selection import train_test_split

# Features are the token lists, targets the collapsed star labels.
X, y = yelp_df['text'], yelp_df['stars']

# 80/20 split with a fixed seed; stratify on y so both splits keep the
# label proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=47,
  stratify=y,
)

In [None]:
# baseline model using logistic regression with TF-IDF features

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


tfidfvectorizer = TfidfVectorizer()

# The vectorizer is fed strings, so glue each token list back together.
X_train_base = X_train.apply(" ".join)
X_test_base = X_test.apply(" ".join)

# Fit the TF-IDF vocabulary on the training split only; the test split is
# transformed with that fitted vocabulary.
x_train_vector = tfidfvectorizer.fit_transform(X_train_base)
x_test_vector = tfidfvectorizer.transform(X_test_base)

# fit() returns the estimator itself, so construction and training chain.
logreg = LogisticRegression(solver="lbfgs", random_state=47).fit(x_train_vector, y_train)

y_pred = logreg.predict(x_test_vector)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.53      0.64       335
           1       0.51      0.15      0.23       292
           2       0.79      0.98      0.88      1373

    accuracy                           0.78      2000
   macro avg       0.70      0.55      0.58      2000
weighted avg       0.75      0.78      0.74      2000



In [None]:
nyt_df = pd.read_csv("nyt.csv")

# Rebind the module-level globals that remove_stop() and lem() read.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Same steps as for the Yelp text: clean -> tokenize -> drop stop words ->
# lemmatize.
nyt_df['text'] = (
  nyt_df['text']
  .apply(clean)
  .apply(word_tokenize)
  .apply(remove_stop)
  .apply(lem)
)

In [None]:
# use model on NYT data
# Re-join the token lists into strings and project them with the vectorizer
# fitted on the Yelp training split (transform only, no refit).
X_nyt = nyt_df['text'].apply(lambda tokens: " ".join(tokens))
X_nyt_vector = tfidfvectorizer.transform(X_nyt)

# NOTE(review): unlike the Yelp labels, these are not passed through
# change_stars; presumably nyt.csv already stores 0/1/2 -- confirm.
y_test_nyt = nyt_df['stars'].values

y_pred_nyt = logreg.predict(X_nyt_vector)
# zero_division=0 reports precision for a class the model never predicts as
# 0.0 without emitting an undefined-metric warning per average.
print(classification_report(y_test_nyt, y_pred_nyt, zero_division=0))



              precision    recall  f1-score   support

           0       0.75      0.30      0.43        10
           1       0.00      0.00      0.00        10
           2       0.38      1.00      0.56        10

    accuracy                           0.43        30
   macro avg       0.38      0.43      0.33        30
weighted avg       0.38      0.43      0.33        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
