In [None]:
# Mounting google drive to use files from drive (Google Colab only).
# Outside Colab the google.colab package cannot be imported, so the mount is
# skipped instead of stopping a top-to-bottom run of the notebook.
try:
  from google.colab import drive
except ImportError:
  drive = None  # Not running in Colab: read the data from a local path instead

if drive is not None:
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import gzip
import json
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on.
# NOTE(review): no tokenizer call is visible in this notebook, so 'punkt' looks unused — confirm before removing.
nltk.download('punkt')
# Backs stopwords.words('english'), which builds the STOP_WORDS set further down.
nltk.download('stopwords')
# Presumably the corpus WordNetLemmatizer reads for the "lem" preprocessing mode.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Converting json.gz to dataframe.
# Code taken from the site the data is from
def parse(path):
  """
  Lazily reads a gzip-compressed file that holds one JSON document per line.

  Parameters:
    path: Location of the .json.gz file
  Yields:
    The decoded JSON value of each non-blank line, in file order
  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON
  """
  # The context manager closes the file handle once the generator is
  # exhausted or closed, instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      if not l.strip():  # Blank / whitespace-only lines carry no record
        continue
      yield json.loads(l)

def getDF(path):
  """
  Loads every record produced by parse(path) into a DataFrame.

  Records are keyed 0, 1, 2, ... in the order they are read, and those keys
  become the row index (orient='index'); record fields become the columns.
  """
  records_by_row = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_row, orient='index')

# Read the whole review file into one DataFrame (one row per JSON record).
# The path is relative to the current working directory.
df = getDF('./drive/MyDrive/Magazine_Subscriptions_5.json.gz')  # Change to file location

# Exploring the DataFrame

In [None]:
# Showing the dataframe before any cleaning/transformation
# (no column has been dropped yet, so every field of the raw records is visible here)
display(df)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,4.0,True,"02 26, 2014",A5QQOOZJOVPSF,B00005N7P0,John L. Mehlmauer,"I'm old, and so is my computer. Any advice th...",Cheapskates guide,1393372800,,,
1,5.0,False,"03 6, 2004",A5RHZE7B8SV5Q,B00005N7PS,gorillazfan249,"There's nothing to say, but if you want a REAL...",The best mature Men's magazine.,1078531200,3,,
2,1.0,False,"07 15, 2003",A1RPTVW5VEOSI,B00005N7PS,Michael J. Edelman,If you're the kind of man who looks at himself...,THE Magazine for the Self-Centered Male,1058227200,17,,
3,1.0,True,"01 31, 2015",A1SFRBCMW8XVBW,B00005N7PS,Hoyett L. Barnett,Nothing to it. Just an advertisement. Little...,Nothing to it. Just an advertisement. Little a...,1422662400,,,
4,5.0,True,"10 5, 2010",A1IU9VPCBKZPE8,B00005N7P0,Randolph Eck,When PC Magazine ceased publication of their p...,Excellent Computer Magazine,1286236800,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2370,1.0,True,"02 7, 2018",A18X3E6V8DGIDZ,B00X6LREJU,gatormum64,Not what I expected. Found it boring and lacki...,Cancelled subscripton,1517961600,,,
2371,4.0,True,"03 3, 2017",A1Y98LVYJ0YZJ0,B00X6LREHM,Michele,I have been a Family Circle readers for years ...,You can find a good variety of articles,1488499200,,,
2372,5.0,True,"01 15, 2017",A1VTKYW3YQEHTN,B00X6LREHM,Linda,"Even though I have only received 3 issues, I r...",I really enjoy reading Family Circle,1484438400,,,
2373,5.0,True,"05 30, 2017",ASU7EOBD3Y4BV,B01HI8V10E,Miss Betty,Great magazine at a great price through Amazon!,Great magazine at a great price through Amazon!,1496102400,,,


In [None]:
# Keep only the rating, the verified flag and the two text fields; every other column is dropped
needed_cols = {'overall', 'verified', 'reviewText', 'summary'}
drop_col = [name for name in df.columns if name not in needed_cols]  # Everything outside the keep-set
df = df.drop(columns=drop_col)

In [None]:
# Checking for any null values
# Counts the missing entries in each remaining column
df.isna().sum()

overall       0
verified      0
reviewText    1
summary       2
dtype: int64

In [None]:
# Dropping the null values in the reviewText column
# reviewText is the text the models are trained on, so rows without it are unusable.
# df is rebound to the filtered frame: every later cell only sees rows that have review text.
df = df.dropna(subset=['reviewText'])

# summary is left untouched here; if it is ever used as a feature, drop its null rows in this cell too

In [None]:
# Checking the distribution of ratings
# Number of reviews per star rating in the `overall` column
df['overall'].value_counts()

5.0    1540
4.0     375
3.0     239
2.0     118
1.0     102
Name: overall, dtype: int64

In [None]:
# Creating a sentiment column by converting the numeric rating to positive, neutral, negative
#   overall <= 2 -> negative, overall == 3 -> neutral, overall >= 4 -> positive
conditions = [
    (df['overall']<=2.0),
    (df['overall']==3.0),
    (df['overall']>=4.0)
]
vals = ['negative', 'neutral', 'positive']
# np.select falls back to the integer 0 when no default is given. That integer
# cannot share a dtype with the string labels, and newer NumPy versions raise a
# TypeError for the mix. An explicit string default keeps the call valid and
# labels any rating that matches no condition (e.g. a missing value) as 'unknown'.
df['sentiment'] = np.select(conditions, vals, default='unknown')
display(df)

Unnamed: 0,overall,verified,reviewText,summary,sentiment
0,4.0,True,"I'm old, and so is my computer. Any advice th...",Cheapskates guide,positive
1,5.0,False,"There's nothing to say, but if you want a REAL...",The best mature Men's magazine.,positive
2,1.0,False,If you're the kind of man who looks at himself...,THE Magazine for the Self-Centered Male,negative
3,1.0,True,Nothing to it. Just an advertisement. Little...,Nothing to it. Just an advertisement. Little a...,negative
4,5.0,True,When PC Magazine ceased publication of their p...,Excellent Computer Magazine,positive
...,...,...,...,...,...
2370,1.0,True,Not what I expected. Found it boring and lacki...,Cancelled subscripton,negative
2371,4.0,True,I have been a Family Circle readers for years ...,You can find a good variety of articles,positive
2372,5.0,True,"Even though I have only received 3 issues, I r...",I really enjoy reading Family Circle,positive
2373,5.0,True,Great magazine at a great price through Amazon!,Great magazine at a great price through Amazon!,positive


# Train Test Split + Preprocessing sentences



In [None]:
# Baseline - No preprocessing
# 70/30 split of the raw review text and its sentiment label; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df['reviewText'],
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)
# Sanity check: train features/labels and test features/labels must pair up in size
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

(1661,) (1661,) (713,) (713,)


In [None]:
# Preprocessing Sentences
# Used for tests to see if preprocessing, stemming, and lemmatization have a major effect on prediction output

# English stop words from NLTK, held in a set because preprocess_text tests membership once per word
STOP_WORDS = set(stopwords.words('english'))

def preprocess_text(sentence, stop, type_proc=None):
  """
  Cleans one review and returns it as a single space-separated string.

  The sentence is lower-cased and split on whitespace. Every token has its
  digits and punctuation removed, is dropped if nothing is left or if it is a
  stop word, and is otherwise passed to preprocess_type.

  Parameters:
    sentence: Raw text to clean
    stop: Collection of lower-case stop words to drop
    type_proc: Defines if the word is only cleaned, or also stemmed / lemmatized
    - Acceptable inputs: None (default), "stem", "lem"
  Returns:
    The kept words joined by single spaces ('' when no word survives)
  """
  words = []
  for token in sentence.lower().split():      # split() already ignores leading/trailing whitespace
    token = re.sub(r'\d', '', token)          # Removes digits from the token

    # First strip punctuation but keep inner apostrophes: stop lists such as
    # NLTK's store contractions as "don't" / "it's", and those can only match
    # while the apostrophe is still there.
    contraction = re.sub(r"[^\w\s']", '', token).strip("'")
    word = contraction.replace("'", '')       # Now every non-word character is gone

    if not word or word in stop or contraction in stop:
      continue                                # Empty after cleaning, or a stop word
    words.append(preprocess_type(word, type_proc))

  return ' '.join(words)                      # Converting back into a sentence

def preprocess_type(word, type_proc):
  """
  Helper function for preprocess_text
  Depending on the type: None, stem, or lem
  Returns the word, a stemmed word, or a lemmatized word
  """
  match type_proc:
    case None:
      return word
    case "stem":
      return PorterStemmer().stem(word)
    case "lem":
      return WordNetLemmatizer().lemmatize(word)


# Applying the preprocessing
# Each variant is built straight from x_train / x_test, one output per input in the same order,
# so the existing y_train and y_test labels line up with every variant.

# Cleaned only (digits, punctuation and stop words removed)
x_train_proc = [preprocess_text(review, STOP_WORDS) for review in x_train]
x_test_proc = [preprocess_text(review, STOP_WORDS) for review in x_test]

# Cleaned + stemmed
x_train_stem = [preprocess_text(review, STOP_WORDS, "stem") for review in x_train]
x_test_stem = [preprocess_text(review, STOP_WORDS, "stem") for review in x_test]

# Cleaned + lemmatized
x_train_lem = [preprocess_text(review, STOP_WORDS, "lem") for review in x_train]
x_test_lem = [preprocess_text(review, STOP_WORDS, "lem") for review in x_test]

def _fit_then_transform(vec, train_docs, test_docs):
  """
  Fits vec on train_docs, then encodes test_docs with that fitted state.
  Returns the (train, test) matrices.
  """
  train_matrix = vec.fit_transform(train_docs)
  test_matrix = vec.transform(test_docs)
  return train_matrix, test_matrix

# CountVectorizer (Bag of Words)
# The same instance is refitted for each text variant, so each test matrix has to be
# produced right after its own fit (which is what the helper does).
vectorizer = CountVectorizer()
x_train_bow, x_test_bow = _fit_then_transform(vectorizer, x_train, x_test)
x_train_proc_bow, x_test_proc_bow = _fit_then_transform(vectorizer, x_train_proc, x_test_proc)
x_train_stem_bow, x_test_stem_bow = _fit_then_transform(vectorizer, x_train_stem, x_test_stem)
x_train_lem_bow, x_test_lem_bow = _fit_then_transform(vectorizer, x_train_lem, x_test_lem)

### Apply tfidf vectorizer (Term Frequency - Inverse Document Frequency)
tf_vector = TfidfVectorizer()
x_train_tf, x_test_tf = _fit_then_transform(tf_vector, x_train, x_test)
x_train_proc_tf, x_test_proc_tf = _fit_then_transform(tf_vector, x_train_proc, x_test_proc)
x_train_stem_tf, x_test_stem_tf = _fit_then_transform(tf_vector, x_train_stem, x_test_stem)
x_train_lem_tf, x_test_lem_tf = _fit_then_transform(tf_vector, x_train_lem, x_test_lem)

"\ndf_proc = df.copy()\ndf_proc['reviewText'] = df_proc.reviewText.apply(lambda x: preprocess_text(x, STOP_WORDS))\nx_train_proc, x_test_proc, _, _ = train_test_split(df_proc['reviewText'], df_proc['sentiment'], test_size=0.3, random_state=42)\nx_train_proc = vectorizer.fit_transform(x_train_proc)\nx_test_proc = vectorizer.transform(x_test_proc)\n\ndf_stem = df.copy()\ndf_stem['reviewText'] = df_stem.reviewText.apply(lambda x: preprocess_text(x, STOP_WORDS, 'stem'))\nx_train_stem, x_test_stem, _, _ = train_test_split(df_stem['reviewText'], df_stem['sentiment'], test_size=0.3, random_state=42)\nx_train_stem = vectorizer.fit_transform(x_train_stem)\nx_test_stem = vectorizer.transform(x_test_stem)\n\ndf_lem = df.copy()\ndf_lem['reviewText'] = df_lem.reviewText.apply(lambda x: preprocess_text(x, STOP_WORDS, 'lem'))\nx_train_lem, x_test_lem, _, _ = train_test_split(df_lem['reviewText'], df_lem['sentiment'], test_size=0.3, random_state=42)\nx_train_lem = vectorizer.fit_transform(x_train_lem)

# Models

In [None]:
print("KNN")

# Every run uses the same estimator settings; only the feature matrices change.
# Each entry: (printed label, train matrix, test matrix), grouped by vectorizer.
knn_runs = [
    ("CountVectorized words", [
        ('Base Accuracy: ', x_train_bow, x_test_bow),
        ('Preprocessed Accuracy: ', x_train_proc_bow, x_test_proc_bow),
        ('Stemmer Accuracy: ', x_train_stem_bow, x_test_stem_bow),
        ('Lemmatizer Accuracy: ', x_train_lem_bow, x_test_lem_bow),
    ]),
    ("Tfidf words", [
        ('Base Accuracy: ', x_train_tf, x_test_tf),
        ('Preprocessed Accuracy: ', x_train_proc_tf, x_test_proc_tf),
        ('Stemmer Accuracy: ', x_train_stem_tf, x_test_stem_tf),
        ('Lemmatizer Accuracy: ', x_train_lem_tf, x_test_lem_tf),
    ]),
]

for heading, variants in knn_runs:
  print()
  print(heading)
  for label, train_matrix, test_matrix in variants:
    # A fresh classifier per variant so no fitted state carries over between runs
    neighbors = KNeighborsClassifier(n_neighbors=10, metric='minkowski')
    neighbors.fit(train_matrix, y_train)
    pred = neighbors.predict(test_matrix)
    print(label, accuracy_score(y_test, pred))

KNN

CountVectorized words
Base Accuracy:  0.7896213183730715
Preprocessed Accuracy:  0.7938288920056101
Stemmer Accuracy:  0.7994389901823282
Lemmatizer Accuracy:  0.7952314165497896

Tfidf words
Base Accuracy:  0.7952314165497896
Preprocessed Accuracy:  0.7952314165497896
Stemmer Accuracy:  0.7966339410939691
Lemmatizer Accuracy:  0.7952314165497896


In [None]:
print("Logistic Regression")

# Every run uses the same estimator settings; only the feature matrices change.
# Each entry: (printed label, train matrix, test matrix), grouped by vectorizer.
log_runs = [
    ("CountVectorized words", [
        ("Base Accuracy: ", x_train_bow, x_test_bow),
        ("Preprocess Accuracy: ", x_train_proc_bow, x_test_proc_bow),
        ("Stem Accuracy: ", x_train_stem_bow, x_test_stem_bow),
        ("Lem Accuracy: ", x_train_lem_bow, x_test_lem_bow),
    ]),
    ("Tfidf words", [
        ("Base Accuracy: ", x_train_tf, x_test_tf),
        ("Preprocess Accuracy: ", x_train_proc_tf, x_test_proc_tf),
        ("Stem Accuracy: ", x_train_stem_tf, x_test_stem_tf),
        ("Lem Accuracy: ", x_train_lem_tf, x_test_lem_tf),
    ]),
]

for heading, variants in log_runs:
  print()
  print(heading)
  for label, train_matrix, test_matrix in variants:
    # A fresh model per variant; max_iter is raised to 500 for every fit
    log_model = LogisticRegression(max_iter = 500)
    log_model.fit(train_matrix, y_train)
    log_sent = log_model.predict(test_matrix)
    print(label, accuracy_score(y_test, log_sent))


Logistic Regression

CountVectorized words
Base Accuracy:  0.85273492286115
Preprocess Accuracy:  0.8330995792426368
Stem Accuracy:  0.8316970546984572
Lem Accuracy:  0.8274894810659187

Tfidf words
Base Accuracy:  0.8218793828892006
Preprocess Accuracy:  0.8106591865357644
Stem Accuracy:  0.8064516129032258
Lem Accuracy:  0.8092566619915849
