In [1]:
import pandas as pd

In [2]:
# Load the raw review records from the JSON file into a DataFrame.
corpus = pd.read_json(path_or_buf='data.json')

In [3]:
# Discard every column listed here; the frame is rebound to the result
# rather than being mutated in place.
corpus = corpus.drop(
    columns=[
        'verified',
        'reviewTime',
        'reviewerID',
        'asin',
        'style',
        'reviewerName',
        'summary',
        'unixReviewTime',
        'vote',
        'image',
    ]
)

In [4]:
# Re-wrap the data in a DataFrame.
# NOTE(review): `corpus` presumably already is a DataFrame at this point, so
# this looks redundant — confirm before removing.
corpus = pd.DataFrame(data=corpus)

In [5]:
# Give the two remaining columns the names used by the rest of the notebook.
corpus = corpus.rename(
    columns={
        'reviewText': 'review',
        'overall': 'sentiment',
    }
)

In [6]:
from imblearn.under_sampling import RandomUnderSampler

# Separate the review text from the rating that drives the resampling.
X = corpus.drop(columns='sentiment')
y = corpus['sentiment']

# Target of 2500 samples for each rating class from 1 to 5.
sampling_strategy = {rating: 2500 for rating in range(1, 6)}

rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=0)
X, y = rus.fit_resample(X, y)

# Rebuild a single frame, discard the rows rated 3, shuffle with a fixed
# seed and renumber the rows from 0.
corpus = (
    pd.concat([X, y], axis=1)
    .loc[lambda frame: frame['sentiment'] != 3]
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
print(corpus.head())

                                              review  sentiment
0                                         Works well          5
1  It should be known that I attempted to use thi...          1
2  Doesnt fit dash cutout of my dodge properly - ...          1
3  Good value for the money.  Exactly as advertis...          4
4  The clip needed to install on my car snapped i...          1


In [7]:
# Dependencies for the text-cleaning and lemmatization cells below.
# NOTE(review): pandas was already imported in the first cell, and `string`
# and `word_tokenize` do not appear to be used in the cells visible here —
# confirm before cleaning up.
import string
import nltk
import re
import pandas as pd
from bs4 import BeautifulSoup
# Fetch the NLTK stop-word corpus (read later through `stopwords.words`).
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Remaining NLTK resources; presumably the tokenizer models, the POS tagger
# and the WordNet data needed by word_tokenize / pos_tag / WordNetLemmatizer.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# Keep this call last: as the final expression of the cell, its return value
# is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
# Lemmatizer instance shared by the lemmatization step.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for membership tests.
stopwords_en = {*stopwords.words('english')}

In [9]:
def label_sentiment(row):
    """
    Map the rating stored under row['sentiment'] to a binary label.

    Returns 1 when the rating (converted with int()) is 4 or more, 0 when it
    is 2 or less, and None for anything in between (i.e. a rating of 3).
    """
    rating = int(row['sentiment'])
    if rating >= 4:
        return 1
    if rating <= 2:
        return 0
    return None

In [10]:
# Derive the binary label row by row; label_sentiment already takes the row,
# so it is passed to apply directly.
corpus['sentiment_label'] = corpus.apply(label_sentiment, axis=1)

In [11]:
def preprocessing(text):
    """
    Clean a raw review string before tokenization.

    Missing values (None, a float NaN, an empty string, or the literal
    placeholders 'none' / 'nan' in any letter case) yield an empty string.
    Otherwise HTML markup and URLs are removed, the text is lower-cased and
    every character that is not an ASCII letter (digits, punctuation,
    special characters) is replaced by a space.

    Parameters
    ----------
    text : str, None or float NaN
        Raw review text.

    Returns
    -------
    str
        Lower-cased text made only of the letters a-z and spaces; may be empty.
    """
    # A float NaN is truthy and has no .lower(), so it has to be caught before
    # the string checks below. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if not text or text.lower() in ['none', 'nan']:
        return ""

    # Strip HTML markup, keeping only the text content.
    text = BeautifulSoup(text, "html5lib").get_text()

    # Remove URLs before punctuation is stripped, while they are still
    # recognisable as a single whitespace-delimited token.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Replacing every non-letter with a space also removes all digits, so no
    # separate digit-stripping pass is needed afterwards.
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())

    return text

In [12]:
def tokenize_lemmatize(text):
    """
    Tokenize the text, drop English stop words and lemmatize what remains.

    Each remaining token is POS-tagged and lemmatized with the POS code
    derived from the first letter of its tag. The lemmas are returned joined
    by single spaces.
    """
    # First letter of the tag from nltk.pos_tag -> POS code handed to the
    # lemmatizer; any other tag falls back to 'n'.
    pos_by_tag_initial = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    content_words = [
        token for token in nltk.word_tokenize(text) if token not in stopwords_en
    ]

    return " ".join(
        lemmatizer.lemmatize(token, pos=pos_by_tag_initial.get(tag[:1], 'n'))
        for token, tag in nltk.pos_tag(content_words)
    )

In [13]:
def preprocessing_main(review_text):
    """
    Run the full text pipeline on one review.

    The review is first cleaned by `preprocessing`, and the cleaned text is
    then passed to `tokenize_lemmatize`, whose result is returned.
    """
    return tokenize_lemmatize(preprocessing(review_text))

In [14]:
# Build a new frame with the processed text added as an extra column;
# `corpus` itself is left unchanged.
corpus_mod = corpus.assign(
    review_processed=corpus['review'].apply(preprocessing_main)
)

In [17]:
# Show the row labelled 1 before and after processing as a spot check.
print('Original review: {}'.format(corpus_mod.loc[1, 'review']))
print('Processed review: {}'.format(corpus_mod.loc[1, "review_processed"]))

Original review: It should be known that I attempted to use this on a Honda Civic (D16Y7) The off level jaws made it impossible to grab valve springs. Apparently they are that way to make it easier to grab into springs, but it didn't work for me.  Also, as the tool is tightened to compress the spring, the slider (silver in the picture), moves down , spreading the jaws apart, releasing the spring. All in all, I tried this and immediately sent it back. I had better luck with a 3/8" box end wrench and pushing the springs into place by hand. Not something I advise, but that worked while this tool didn't.
Processed review: know attempted use honda civic level jaw make impossible grab valve spring apparently way make easy grab spring work also tool tighten compress spring slider silver picture move spread jaw apart release spring try immediately send back well luck box end wrench push spring place hand something advise work tool


In [16]:
# Write the frame, including the processed column, to a JSON file.
corpus_mod.to_json(path_or_buf="corpus_mod.json")