### Import libraries and set directory



In [411]:
import pandas as pd
import os
import gzip
import re

In [412]:
# Mount Google Drive at /content/drive (Colab-specific) so that files stored
# on Drive can be read and written through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load data source (reviews from Amazon)

Data source: *https://jmcauley.ucsd.edu/data/amazon/*

Choose the working directory

In [413]:
# Base folder on the mounted Drive: the compressed review dataset is read from
# here and the cleaned review text files are written beneath it.
data_samples_dir = '/content/drive/MyDrive/TCC_Leonardo_Zago/data_samples'

Define function to clean text from the reviews

In [414]:
def clean_text(text):
  """Normalize a raw review into plain, sentence-cased text.

  Steps performed, in order:
    * newlines and tabs become spaces; double quotes are dropped;
    * every character other than ASCII letters, digits, ``. ! ? , ;`` and
      space is removed;
    * runs of ``. ! ? ;`` collapse to a single ``.`` and runs of ``,`` to a
      single ``,``, each followed by exactly one space;
    * leading punctuation/spaces are stripped and the text is made to end
      with a single ``.``;
    * each sentence (split on ``'. '``) is passed through ``str.capitalize``,
      which upper-cases its first character and lower-cases the rest.

  Args:
    text: The raw review string.

  Returns:
    The cleaned string, or ``''`` when nothing remains after cleaning
    (e.g. empty, whitespace-only, punctuation-only or fully non-ASCII input).
  """
  # Flatten layout characters and remove anything outside the allowed set.
  text = text.replace('\n', ' ').replace('\t', ' ').replace('  ', ' ').replace('"', '')
  text = re.sub(r'[^a-zA-Z0-9.!?,; ]+', '', text)

  # Unify sentence terminators to '.', squeeze repeated commas, then force
  # exactly one space after each punctuation mark.
  text = re.sub(r'[\.!?;]+', '.', text)
  text = re.sub(r'[\,]+', ',', text)
  text = text.replace('.', '. ').replace(',', ', ')
  text = re.sub(r'\s+', ' ', text)

  # Drop leading punctuation/spaces. If nothing is left there is no review
  # text to return; bail out before indexing into an empty string.
  text = text.lstrip('., ')
  if not text:
    return ''

  # Remove the space that may precede a punctuation mark and squeeze repeats.
  text = text.replace(' .', '.')
  text = re.sub(r'\.+', '.', text)
  text = text.replace(' ,', ',')
  text = re.sub(r'\,+', ',', text)

  # Guarantee the text ends with exactly one '.' and no trailing space.
  # (Whitespace was collapsed above, so at most one trailing space exists.)
  if text.endswith(' '):
    text = text[:-1]
  if not text.endswith('.'):
    text = text + '.'

  # A comma adjacent to a period is redundant; keep only the period.
  text = text.replace(',.', '.').replace('.,', '.')
  text = re.sub(r'\.+', '.', text)

  # Sentence-case every sentence.
  return '. '.join(sentence.capitalize() for sentence in text.split('. '))

Import data from the reviews

In [415]:
category = 'Video_Games'

# Sampling parameters, gathered here so they can be tuned in one place.
MIN_WORDS = 2             # reviews must have MORE than this many words
MAX_WORDS = 40            # reviews must have FEWER than this many words
SAMPLES_PER_RATING = 400  # number of reviews drawn for each selected rating
RANDOM_SEED = 42          # fixed seed so a re-run selects the same reviews

# Load the 5-core review dump for the chosen category (one JSON object per line).
with gzip.open(os.path.join(data_samples_dir, '{}_5.json.gz'.format(category)), 'rb') as gz:
  reviews_df = pd.read_json(gz, lines=True)

# Keep only rows that have both a rating and a text, and drop repeated reviews.
reviews_df = reviews_df.dropna(subset=['overall', 'reviewText'])
reviews_df = reviews_df.drop_duplicates(subset=['overall', 'reviewText'])

# Keep reviews whose word count lies strictly between MIN_WORDS and MAX_WORDS,
# then draw an equal-sized sample of clearly positive (rating 5) and clearly
# negative (rating 1) reviews. The word count is computed once and reused.
word_counts = reviews_df['reviewText'].str.split().str.len()
filtered_reviews_df = reviews_df.loc[(word_counts > MIN_WORDS) & (word_counts < MAX_WORDS)]
filtered_reviews_df = pd.concat([
  filtered_reviews_df.loc[filtered_reviews_df['overall'] == rating].sample(SAMPLES_PER_RATING, random_state=RANDOM_SEED)
  for rating in (5, 1)
])

# Clean every sampled review; skip any whose text is empty after cleaning so
# the output file contains no blank lines.
filtered_reviews_df['reviewText'] = filtered_reviews_df['reviewText'].map(clean_text)
filtered_reviews_df = filtered_reviews_df.loc[filtered_reviews_df['reviewText'].str.len() > 0]

# Merge all reviews of the category, one per line, and save them as a .txt
# file (creating the output folder if it does not exist yet).
merged_text = filtered_reviews_df['reviewText'].str.cat(sep='\n')
output_dir = os.path.join(data_samples_dir, 'unstructured_reviews')
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'review_{}.txt'.format(category)), 'w', encoding='utf-8') as f:
  f.write(merged_text)

The next step is to feed the cleaned reviews into the KGen model, available at https://github.com/rossanez/kgen