In [10]:
!pip install spaczz
!pip install apyori
!pip install mlxtend
!pip install textblob


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m7.8 MB/s[

In [4]:
import os
import re
import spacy
import copy
import pandas as pd
from spaczz.matcher import FuzzyMatcher
import nltk
import string
from mlxtend.frequent_patterns import apriori, association_rules

# 1. Extract Relevant Information

## Crawl Reviews
Here we are just going to ingest the data rawly but put it into a structure where we know which product the review belongs to

In [5]:
def read_all_files_in_folder(directory):
    dataframes = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            df = pd.read_table(file_path, on_bad_lines='skip', encoding='cp1252', header=None)
            product_name = filename.replace('.txt', '')
            df['product'] = product_name
            dataframes.append(df)
    
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df.columns = ['line', 'product'] 
    combined_df = combined_df[['product', 'line']]
    
    return combined_df

df_raw_product_lines = read_all_files_in_folder("./Data")
df_raw_product_lines.describe()

Unnamed: 0,product,line
count,10920,10920
unique,17,10531
top,Creative Labs Nomad Jukebox Zen Xtra 40GB,[t]
freq,1811,325


## Parse Reviews
Here we just want to parse the reviews from each of the products. Currently there are tags such as [t], feature[+/-] - these aren't needed

In [6]:
def parse_reviews(df):
    df['annotated_labels'] = df['line'].apply(lambda x: ' '.join(re.findall(r'(.*)##', x)))
    df['review'] = df['line'].apply(lambda x: ' '.join(re.findall(r'##(.*)', x)))
    df['review'] = df.apply(
        lambda row: ' '.join(re.findall(r'\[t\](.*)', row['line'])) if row['review'] == '' else row['review'],
        axis=1
    )
    return df

df_product_reviews = parse_reviews(df_raw_product_lines)
df_product_reviews[['product', 'line', 'annotated_labels', 'review']]

Unnamed: 0,product,line,annotated_labels,review
0,Nokia 6610,"[t]excellent phone , excellent service .",,"excellent phone , excellent service ."
1,Nokia 6610,##i am a business user who heavily depend on m...,,i am a business user who heavily depend on mob...
2,Nokia 6610,"phone[+3], work[+2]##there is much which has b...","phone[+3], work[+2]",there is much which has been said in other rev...
3,Nokia 6610,##just double check with customer service to e...,,just double check with customer service to ens...
4,Nokia 6610,at&t customer service[-2]##after several years...,at&t customer service[-2],after several years of torture in the hands of...
...,...,...,...,...
10915,Canon PowerShot SD500,automatic shutter[-3]##There were many times w...,automatic shutter[-3],There were many times when I clicked the autom...
10916,Canon PowerShot SD500,##It seems you have to double click some of th...,,It seems you have to double click some of the ...
10917,Canon PowerShot SD500,##Huh? I missed out on lots of photos and lots...,,Huh? I missed out on lots of photos and lots a...
10918,Canon PowerShot SD500,controls[-1]##The controls are hard to manipul...,controls[-1],"The controls are hard to manipulate, especiall..."


### Preprocess
There are a number of steps we need to take when preprocessing the data:
1. tokenize the reviews so that only wordsWe need to label the PoS of each of the sentences in order to be able to begin to prune down the reviews and start to comprehend which phrases are the features and which words are the opinions

In [7]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /Users/tomhill/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tomhill/nltk_data...


True

In [11]:
class Preprocess:
    def __init__(self):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        
    def main(self, reviews):
        tokenize_reviews = self.lower_tokenize_reviews(reviews)
        remove_punctuation_reviews = self.remove_punctuation_reviews(tokenize_reviews)
        remove_stopwords_reviews = self.remove_stopwords_reviews(remove_punctuation_reviews)
        pos_tag_reviews = self.pos_tag_reviews(remove_stopwords_reviews)
        lemmatize_reviews = self.lemmatize_reviews(pos_tag_reviews)

        return lemmatize_reviews
    
    def perform_fuzzy_matching(self, reviews):
        """
        This actually isn't fuzzy matching.... If we were to integrate it we'd need to do it against
        a list of already established words
        """
        return reviews.apply(lambda x: str(TextBlob(x).correct()))
        
    def lower_tokenize_reviews(self, reviews):
        return reviews.apply(lambda x: nltk.word_tokenize(x.lower()))
        
    def remove_punctuation_reviews(self, reviews):
        return reviews.apply(lambda tokens: [
            word.translate(str.maketrans('', '', string.punctuation))
            for word in tokens
        ])
                             
    def remove_stopwords_reviews(self, reviews):
        return reviews.apply(lambda tokens: [word for word in tokens if word not in self.stop_words])

    def pos_tag_reviews(self, reviews):
        return reviews.apply(lambda x: nltk.pos_tag(x))
        
    # change
    def get_wordnet_pos(self, tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def lemmatize_reviews(self, reviews):
        wnl = WordNetLemmatizer()
        
        def lemmatize_token(token):
            if isinstance(token, tuple) and len(token) == 2:
                word, tag = token
                return (wnl.lemmatize(word, self.get_wordnet_pos(tag)), tag)
                
        return reviews.apply(lambda tokens: [lemmatize_token(token) for token in tokens])

In [12]:
df_product_reviews["review_preprocessed"] = Preprocess().main(df_product_reviews["review"])

df_product_reviews[["product", "review", "review_preprocessed"]]

Unnamed: 0,product,review,review_preprocessed
0,Nokia 6610,"excellent phone , excellent service .","[(excellent, JJ), (phone, NN), (, NNP), (excel..."
1,Nokia 6610,i am a business user who heavily depend on mob...,"[(business, NN), (user, VBD), (heavily, RB), (..."
2,Nokia 6610,there is much which has been said in other rev...,"[(much, RB), (say, VBD), (reviews, JJ), (featu..."
3,Nokia 6610,just double check with customer service to ens...,"[(double, RB), (check, VB), (customer, NN), (s..."
4,Nokia 6610,after several years of torture in the hands of...,"[(several, JJ), (year, NNS), (torture, JJ), (h..."
...,...,...,...
10915,Canon PowerShot SD500,There were many times when I clicked the autom...,"[(many, JJ), (time, NNS), (click, VBD), (autom..."
10916,Canon PowerShot SD500,It seems you have to double click some of the ...,"[(seem, VBZ), (double, JJ), (click, JJ), (time..."
10917,Canon PowerShot SD500,Huh? I missed out on lots of photos and lots a...,"[(huh, NN), (, NNS), (miss, VBD), (lot, NNS), ..."
10918,Canon PowerShot SD500,"The controls are hard to manipulate, especiall...","[(control, NNS), (hard, JJ), (manipulate, NN),..."


## Frequent Feature Identification

In [25]:
pd.set_option('display.max_row', 100)
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from collections import defaultdict
    
def extract_potential_features(pos_tagged_reviews):
    return [[word for word, pos in review if pos.startswith('NN') or pos.startswith('NG')] for review in pos_tagged_reviews]

frequent_features = {}

for product_name, group in df_product_reviews.groupby('product'):
    reviews_df = group['review_preprocessed'].tolist()
    reviews = extract_potential_features(reviews_df)
    te = TransactionEncoder()
    te_ary = te.fit(reviews).transform(reviews)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
    print(frequent_itemsets, 'frequent_itemsets')
    
    # From here onwards it looks like we perform pruning?
    # double checked and this looks to be compactness pruning. We just want to find two or more words that appear close
    # to each 
    frequent_items = frequent_itemsets['itemsets'].tolist()
    frequent_compact_items = defaultdict(lambda: 0)
    reviews = group['review'].tolist()
    reviews = [review.split() for review in reviews]
    for ind, review in enumerate(reviews):
        for items in frequent_items:
            index_of_items = []
            for item in items:
                if item in review:
                    index_of_items.append(review.index(item))
            if len(index_of_items) == len(items) and max(index_of_items) - min(index_of_items) < 3:
                frequent_compact_items[items] += 1
    
    result_items = [item for item, val in frequent_compact_items.items() if val > 2]
    
    multi_word_item = {}
    for item, val in frequent_compact_items.items():
        if len(item) > 1:
           multi_word_item[item] = val
    p_support = defaultdict(lambda:[])
    for item, val in multi_word_item.items():
        for itm in item:
            p_support[itm].append(val)
    
    min_p_support = 3
    for item, counts in p_support.items():
        if min(counts) <= min_p_support and frozenset([item]) in result_items:
            result_items.remove(frozenset([item]))

    frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].isin(result_items)]
    frequent_items =  frequent_itemsets['itemsets'].tolist()
    tags = []
    for ind, review in enumerate(reviews):
        tag = []
        
        for items in frequent_items:
            count = 0
            for item in items:
                if item in review:
                    count += 1
            if count == len(items):
                tag.append(items)
        tags.append(tag)

    group['tags'] = tags
    frequent_features[product_name] = frequent_itemsets

for product, df in frequent_features.items():
    print(f"Frequent itemsets for {product}:")
    sorted_df = df.sort_values(by='support')
    print(sorted_df)
    print("\n")

     support                itemsets
0   0.021454                (ad2600)
1   0.017878                (amazon)
2   0.060787                  (apex)
3   0.015495                 (brand)
4   0.025030                (button)
5   0.010727                   (buy)
6   0.013111                    (cd)
7   0.022646             (christmas)
8   0.010727               (control)
9   0.027414              (customer)
10  0.021454                   (day)
11  0.026222                  (disc)
12  0.010727                  (disk)
13  0.015495               (display)
14  0.175209                   (dvd)
15  0.019070            (everything)
16  0.035757               (feature)
17  0.013111                (format)
18  0.016687                   (get)
19  0.014303                  (gift)
20  0.011919                  (hour)
21  0.010727                   (lot)
22  0.020262               (machine)
23  0.014303                 (model)
24  0.026222                 (money)
25  0.033373                 (month)
2

In [40]:
pd.set_option('display.max_row', 100)
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from collections import defaultdict

def extract_potential_features(pos_tagged_reviews):
    return [[word for word, pos in review if pos.startswith('NN') or pos.startswith('NG')] for review in pos_tagged_reviews]

# Initialize a dictionary to hold the original frequent itemsets DataFrames for each product
original_frequent_features_dict = {}

for product_name, group in df_product_reviews.groupby('product'):
    reviews_df = group['review_preprocessed'].tolist()
    reviews = extract_potential_features(reviews_df)
    
    te = TransactionEncoder()
    te_ary = te.fit(reviews).transform(reviews)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
    
    # Store the original frequent itemsets DataFrame in the dictionary
    original_frequent_features_dict[product_name] = frequent_itemsets
    
original_frequent_features_dict

{'Apex AD2600 Progressive-scan DVD player':       support                itemsets
 0    0.880810                      ()
 1    0.021454                (ad2600)
 2    0.017878                (amazon)
 3    0.060787                  (apex)
 4    0.015495                 (brand)
 ..        ...                     ...
 126  0.014303     (player, apex, dvd)
 127  0.013111     (player, dvd, play)
 128  0.011919  (problem, player, dvd)
 129  0.011919   (, player, apex, dvd)
 130  0.011919   (, player, dvd, play)
 
 [131 rows x 2 columns],
 'Canon G3':       support               itemsets
 0    0.809969                     ()
 1    0.012461                 (auto)
 2    0.035826              (battery)
 3    0.012461               (button)
 4    0.010903                  (buy)
 ..        ...                    ...
 162  0.012461     (, image, quality)
 163  0.018692   (, picture, quality)
 164  0.014019       (, point, shoot)
 165  0.014019    (camera, g3, canon)
 166  0.014019  (, camera, g3, c

## Compactness Pruning

In [63]:
# Initialize a dictionary to hold the compactness-pruned frequent itemsets DataFrames
compactness_pruned_features_dict = {}

# Compactness Pruning Process
for product_name, frequent_itemsets in original_frequent_features_dict.items():
    
    frequent_items = frequent_itemsets['itemsets'].tolist()
    frequent_compact_items = defaultdict(lambda: 0)
    
    group = df_product_reviews[df_product_reviews['product'] == product_name]
    reviews = group['review'].tolist()
    reviews = [review.split() for review in reviews]
    
    for ind, review in enumerate(reviews):
        for items in frequent_items:
            index_of_items = []
            for item in items:
                if item in review:
                    index_of_items.append(review.index(item))
            if len(index_of_items) == len(items) and max(index_of_items) - min(index_of_items) < 3:
                frequent_compact_items[items] += 1
    
    # Prune items that appear less frequently
    result_items = [item for item, val in frequent_compact_items.items() if val > 2]
    
    # Handle multi-word items and their partial support
    multi_word_item = {}
    for item, val in frequent_compact_items.items():
        if len(item) > 1:
           multi_word_item[item] = val
           
    p_support = defaultdict(lambda:[])
    for item, val in multi_word_item.items():
        for itm in item:
            p_support[itm].append(val)
    
    min_p_support = 3
    for item, counts in p_support.items():
        if min(counts) <= min_p_support and frozenset([item]) in result_items:
            result_items.remove(frozenset([item]))
    
    # Update the frequent_itemsets DataFrame to keep only the pruned itemsets
    pruned_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].isin(result_items)]
    
    # Store the pruned itemsets DataFrame in the separate dictionary
    compactness_pruned_features_dict[product_name] = pruned_itemsets
    
    # Tagging logic: tag the reviews with the pruned itemsets
    tags = []
    for review in reviews:
        tag = []
        for items in result_items:
            count = 0
            for item in items:
                if item in review:
                    count += 1
            if count == len(items):
                tag.append(items)
        tags.append(tag)
    
    # Update the original DataFrame with the tags
    # df_product_reviews.loc[df_product_reviews['product'] == product_name, 'tags'] = tags
compactness_pruned_features_dict['Apex AD2600 Progressive-scan DVD player']

Unnamed: 0,support,itemsets
1,0.021454,(ad2600)
2,0.017878,(amazon)
3,0.060787,(apex)
4,0.015495,(brand)
5,0.02503,(button)
6,0.010727,(buy)
7,0.013111,(cd)
8,0.022646,(christmas)
9,0.010727,(control)
10,0.027414,(customer)


## Redundancy Pruning

The following is a summary of the algorithm described in Hu & Liu (n.d) which was also called redundancy pruning. The purpose of the algorithm in this subsystem is to reduce the number of features with just a single word identified in the Frequent Feature Identification step since many of these will be irrelevant.

We first find the p-support value for each feature found in the Frequent Feature Identification step, which is just a count of the number of times a feature appears in the reviews and titles. A minimum p-support threshold is then set to prune features that fall below the threshold. Then, for each feature that is just a single word - check its p-support value and if it is below the threshold and is a subset of a feature with two or more words in it then it is pruned.


In [76]:
min_p_support_threshold = 3
redundancy_pruned_compactness_features_dict = {}

for product_name, frequent_itemsets in compactness_pruned_features_dict.items():
    p_support = defaultdict(lambda: 0)
    reviews = df_product_reviews[df_product_reviews['product'] == product_name]['line']
    
    for itemset in frequent_itemsets['itemsets']:
        count = reviews.apply(lambda x: all(word in x for word in itemset)).sum()
        p_support[itemset] = count
    
    pruned_itemsets = []
    multi_word_itemsets = [itemset for itemset in p_support if len(itemset) > 1]
    
    for itemset in p_support:
        if len(itemset) == 1: 
            is_subset = any(itemset.issubset(multi_word_itemset) for multi_word_itemset in multi_word_itemsets)
            if p_support[itemset] < min_p_support_threshold and is_subset:
                continue
        pruned_itemsets.append(itemset)
    
    pruned_itemsets_df = frequent_itemsets[frequent_itemsets['itemsets'].isin(pruned_itemsets)]
    redundancy_pruned_compactness_features_dict[product_name] = pruned_itemsets_df

redundancy_pruned_compactness_features_dict['Apex AD2600 Progressive-scan DVD player']

Unnamed: 0,support,itemsets
1,0.021454,(ad2600)
2,0.017878,(amazon)
3,0.060787,(apex)
4,0.015495,(brand)
5,0.02503,(button)
6,0.010727,(buy)
7,0.013111,(cd)
8,0.022646,(christmas)
9,0.010727,(control)
10,0.027414,(customer)


we then store the redundancy_pruned_compactness_features_dict in the feature set so they can be used later on

In [82]:
feature_sets = copy.deepcopy(redundancy_pruned_compactness_features_dict)

In [21]:
tmp1 = df_product_reviews[df_product_reviews['product'] == 'Apex AD2600 Progressive-scan DVD player']
tmp1.loc[6498]

product                          Apex AD2600 Progressive-scan DVD player
line                   ##before you try to return the player or waste...
annotated_labels                                                        
review                 before you try to return the player or waste h...
review_preprocessed    [(try, VB), (return, NN), (player, NN), (waste...
Name: 6498, dtype: object

## Opinion word Extraction

First I'll need to get all of the frequent features in each review. From there, I'll then need to get 

In [83]:
class OpinionWordExtractor:
    def __init__(self):
        self.feature_sets = feature_sets
        
    def main(self, products, reviews_preprocessed):
        feature_words_in_reviews = self.extract_feature_words_in_reviews(products, reviews_preprocessed)
        {feature: 'x', opinion: 'y'}
        
    def extract_feature_words_in_reviews(self, reviews_preprocessed):
        return reviews_preprocessed.apply(lambda review: [])
        
    def extract_potential_opinion_words(self, reviews_preprocessed):
        return reviews_preprocessed.apply(lambda review: [word for word, pos in review if pos.startswith('JJ')])

In [113]:
import nltk
from nltk import pos_tag, word_tokenize

class OpinionWordExtractor:
    def __init__(self):
        # Assuming feature_sets is predefined or passed in some way
        self.feature_sets = feature_sets
        
    def main(self, df_product_reviews):
        return df_product_reviews.apply(
            lambda row: self.extract_opinion_words_in_review(
                row['review_preprocessed'], row['product']
            ), axis=1
        )
    
    def extract_opinion_words_in_review(self, review_preprocessed, product):
        feature_words = self.extract_feature_words_in_review(review_preprocessed, product)
        opinion_words = self.extract_potential_opinion_words(review_preprocessed, feature_words)
        
        return opinion_words

    def extract_feature_words_in_review(self, review_preprocessed, product):
        feature_for_product = self.feature_sets[product]['itemsets']
        
        # Convert the review_preprocessed into a list of words for easy matching
        review_words = [word for word, pos in review_preprocessed]
        
        # Find features in the review
        found_features = []
        for itemset in feature_for_product:
            if all(word in review_words for word in itemset):
                found_features.append(' '.join(itemset))

        return found_features

    def extract_potential_opinion_words(self, tagged_tokens, features_in_review):
        opinion_dict = {}
        
        for feature in features_in_review:
            # Find the index of the feature in the tagged tokens
            feature_indices = [i for i, (word, pos) in enumerate(tagged_tokens) if word in feature.split()]
            
            adjectives = []
            for feature_index in feature_indices:
                # Find adjectives (JJ) within a proximity of 3 tokens around the feature
                nearby_adjectives = [word for i, (word, pos) in enumerate(tagged_tokens)
                                     if pos.startswith('JJ') and abs(i - feature_index) <= 3]
                adjectives.extend(nearby_adjectives)
            
            # Add the feature and its associated adjectives to the opinion dictionary
            if adjectives:
                opinion_dict[feature] = adjectives
        
        return opinion_dict

df_product_reviews["opinion_words"] = OpinionWordExtractor().main(df_product_reviews)

df_product_reviews[["product", "review_preprocessed", "opinion_words"]]

Unnamed: 0,product,review_preprocessed,opinion_words
0,Nokia 6610,"[(excellent, JJ), (phone, NN), (, NNP), (excel...",{'service': ['excellent']}
1,Nokia 6610,"[(business, NN), (user, VBD), (heavily, RB), (...",{'service': ['mobile']}
2,Nokia 6610,"[(much, RB), (say, VBD), (reviews, JJ), (featu...","{'feature': ['reviews', 'great'], 'problem': [..."
3,Nokia 6610,"[(double, RB), (check, VB), (customer, NN), (s...",{'number': ['amazon']}
4,Nokia 6610,"[(several, JJ), (year, NNS), (torture, JJ), (h...",{'customer': ['torture']}
...,...,...,...
10915,Canon PowerShot SD500,"[(many, JJ), (time, NNS), (click, VBD), (autom...",{}
10916,Canon PowerShot SD500,"[(seem, VBZ), (double, JJ), (click, JJ), (time...",{}
10917,Canon PowerShot SD500,"[(huh, NN), (, NNS), (miss, VBD), (lot, NNS), ...",{'lot': ['blurry']}
10918,Canon PowerShot SD500,"[(control, NNS), (hard, JJ), (manipulate, NN),...",{}


# 2. Extract Relevant Information

Here I need to do two algorithms and see what the results are. ABSA or whatever it's called and then one unsupervised approach

# 3. Report Evaluation results