# CSCI 544 HW1

In [1]:
import pandas as pd
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
import re
from bs4 import BeautifulSoup
import contractions
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Read Data

Read the data from the file using the read_table method from Pandas. Piazza mentions that a .csv file is available in the grader's environment but as that data is in a tsv format I have kept it as is.

In [2]:
review_data = pd.read_table('data.tsv', on_bad_lines='skip')

## Keep Reviews and Ratings

Select only the `star_rating` and `review_body` columns from the data. Also drop any rows that have an invalid value for star_rating. i.e. 1 <= `star_rating` <= 5 using a combination of coerce parameter on to_numeric and the dropna method.

In [4]:
review_data = review_data[['star_rating', 'review_body']]
review_data['star_rating'] = pd.to_numeric(review_data['star_rating'], errors='coerce')
review_data.dropna(inplace=True)

 ## We form two classes and select 50000 reviews randomly from each class.



In [5]:
positive_reviews = review_data[review_data['star_rating'] >= 4]
negative_reviews = review_data[review_data['star_rating'] < 4]

In [6]:
SEED = 42
NUM_ROWS = 50000
reviews_subset = pd.concat([positive_reviews.sample(NUM_ROWS, random_state=SEED), negative_reviews.sample(NUM_ROWS, random_state=SEED)])

# Data Cleaning



Apply cleaning by
- removing links using BeautifulSoup
- removing non-alphabetical characters
- performing contractions with the `contractions` library
- removing extra spaces
- converting all characters to lower case

In [7]:
def clean_review_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = contractions.fix(text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return text

In [8]:
reviews_subset['cleaned_text'] = reviews_subset['review_body'].apply(clean_review_text)

In [9]:
avg_length_before_cleaning = reviews_subset['review_body'].apply(len).mean()
avg_length_after_cleaning = reviews_subset['cleaned_text'].apply(len).mean()
print(f'{avg_length_before_cleaning},{avg_length_after_cleaning}')

314.24925,299.44586


# Pre-processing

Preprocess the data by removing stop words and lemmatizing the words. I remove the stop words defined in the `stopwords` corpus and I have performed lemmatization using the `WordNetLemmatizer` from the `nltk` library.

In [10]:
def preprocess_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

In [11]:
reviews_subset['preprocessed_review'] = reviews_subset['cleaned_text'].apply(preprocess_text)

In [52]:
avg_length_before_preprocessing = reviews_subset['cleaned_text'].apply(len).mean()
avg_length_after_preprocessing = reviews_subset['preprocessed_review'].apply(len).mean()
print(f'{avg_length_before_preprocessing},{avg_length_after_preprocessing}')

299.44586,184.04841


# TF-IDF and BoW Feature Extraction

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_score, recall_score, f1_score

Our features are engineered using the `tf-idf` and `BoW` methods. Our target values are split into classes 1 (1 <= `star_rating` <= 3) and 2 (4 <= `star_rating` <= 5). Perform an 80-20 split of the data into training and test sets which gives us 80000 training samples and 20000 test samples.

In [54]:
x = reviews_subset['preprocessed_review']
y = (reviews_subset['star_rating'] >= 4).astype(int) + 1
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)

While performing feature engineering make sure to call fit_transform on the training data and transform on the test data to ensure that we prevent data leakage.

In [55]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [56]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Perceptron Using Both Features

In [57]:
perceptron_tfidf = Perceptron()
perceptron_bow = Perceptron()

perceptron_tfidf.fit(X_train_tfidf, y_train)
perceptron_bow.fit(X_train_bow, y_train)

y_pred_tfidf = perceptron_tfidf.predict(X_test_tfidf)
y_pred_bow = perceptron_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'{precision_bow} {recall_bow} {f1_bow}')
print(f'{precision_tfidf} {recall_tfidf} {f1_tfidf}')

0.7898182528825484 0.8111389864525841 0.8003366503292244
0.7778559228373355 0.7769192172604115 0.7773872878803092


# SVM Using Both Features

In [58]:
from sklearn.svm import LinearSVC

In [59]:
svm_model_tfidf = LinearSVC()
svm_model_bow = LinearSVC()

svm_model_tfidf.fit(X_train_tfidf, y_train)
svm_model_bow.fit(X_train_bow, y_train)

y_pred_tfidf = svm_model_tfidf.predict(X_test_tfidf)
y_pred_bow = svm_model_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'{precision_bow} {recall_bow} {f1_bow}')
print(f'{precision_tfidf} {recall_tfidf} {f1_tfidf}')

0.8408687480421844 0.8081284495735073 0.8241735748643946
0.8438438438438438 0.845960863020572 0.8449010273114508


# Logistic Regression Using Both Features

In [60]:
from sklearn.linear_model import LogisticRegression

I have provided a `max_iter` value of 1000 here to prevent the `ConvergenceWarning` from being thrown and to ensure that the model converges.

In [42]:
lr_model_tfidf = LogisticRegression(max_iter=1000)
lr_model_bow = LogisticRegression(max_iter=1000)

lr_model_tfidf.fit(X_train_tfidf, y_train)
lr_model_bow.fit(X_train_bow, y_train)

y_pred_tfidf = lr_model_tfidf.predict(X_test_tfidf)
y_pred_bow = lr_model_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'{precision_bow} {recall_bow} {f1_bow}')
print(f'{precision_tfidf} {recall_tfidf} {f1_tfidf}')

0.8332532897896455 0.8644743398106627 0.8485767387264012
0.8552511877084807 0.8431489785749875 0.8491569650742673


# Naive Bayes Using Both Features

In [61]:
from sklearn.naive_bayes import MultinomialNB

In [62]:
nb_model_tfidf = MultinomialNB()
nb_model_bow = MultinomialNB()

nb_model_tfidf.fit(X_train_tfidf, y_train)
nb_model_bow.fit(X_train_bow, y_train)

y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)
y_pred_bow = nb_model_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'{precision_bow} {recall_bow} {f1_bow}')
print(f'{precision_tfidf} {recall_tfidf} {f1_tfidf}')

0.837139689578714 0.7577521324636227 0.7954701079799842
0.8113885919735692 0.8379327646763672 0.8244470774091628
