In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

In [2]:
PATH = os.getcwd()

filesArray = os.listdir(f'{PATH}/archive')
csvFiles = [file for file in filesArray if file.endswith(".csv")]
csvFiles.sort()

In [3]:
csvFiles

['inchcapeReviews1.csv',
 'inchcapeReviews2.csv',
 'inchcapeReviews3.csv',
 'inchcapeReviews4.csv']

#### combine to one dataframe

In [4]:
# Combine into one data frame
data = []
for i in csvFiles:
    data.append(pd.read_csv(f'{PATH}/archive/{i}'))

reviewsRaw = pd.concat(data, ignore_index=True)

columnNames = {'Date of Exp' : 'dateExp', 'Star Rating' : 'starRating', 'Reviews': 'reviews'}
reviewsRaw = reviewsRaw.rename(columns=columnNames)

reviewsRaw['dateExp'] = pd.to_datetime(reviewsRaw['dateExp'], format = 'mixed')
reviewsRaw.head()

Unnamed: 0,dateExp,starRating,reviews
0,2023-05-09,1,The garage refused to investigate a warrantee ...
1,2023-05-25,5,George Eden was fantastic. A pleasure to deal ...
2,2023-05-23,1,Extremely poor service from Sandhurst Inchcape...
3,2023-05-19,5,As always I have had a very satisfying visit t...
4,2023-05-18,5,The customer agent Brook was superb. I found b...


### maybe include date filter here?

treat onedate = onedocument

topics modelling will be per day (or according to desired range)

In [5]:
# Filter the DataFrame to get only the rows from 2023
reviews = reviewsRaw.loc[reviewsRaw['dateExp'].dt.year >= 2022].copy()
reviews.shape

(1327, 3)

> By using the .loc accessor and the .copy() method, you explicitly indicate that you want to modify a specific subset of the DataFrame, avoiding the warning

## Data Preprocessing

#### remove words with just two words

In [6]:
reviews = reviews[reviews['reviews'].str.split().str.len() > 2]
reviews = reviews.reset_index(drop=True)
reviews.shape

(1300, 3)

In [7]:
print(reviewsRaw.shape, reviews.shape, sep = '\n')

(14809, 3)
(1300, 3)


#### recode starRating

In [None]:
reviews['starRatingRecode'] = reviews['starRating'].apply(lambda x: 'positive' if x >=4 else ('neutral' if x==3 else 'negative'))
reviews.head()
#print(reviews.shape)

#### remove puctuations & convert to lowercase

In [8]:
import re

def lowerCase_removePunc(dataframeName, columnName):
    dataframeName[columnName] = dataframeName['reviews'].map(lambda text: re.sub(r'[,\.!?]', '', text))
    dataframeName[columnName] = dataframeName[columnName].map(lambda text: text.lower())    
    return dataframeName


#reviews = lowerCase_removePunc(reviews, 'processedText')
#reviews.head()

#### tokenization

In [None]:
content = (reviews['processedText'])

In [10]:
import nltk
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text)

#processedText = content.apply(tokenize)
#processedText.head()

#### remove stopwords

In [11]:
from nltk.corpus import stopwords
stopwords  = stopwords.words('english')
#stopwords

def removeStopwords(tokenizedText):
    filteredTokens = [token for token in tokenizedText if token.lower() not in stopwords]
    return filteredTokens

#processedText = processedText.apply(removeStopwords)
#processedText.head()

#### wordcloud

In [None]:
from wordcloud import WordCloud

longString = ' '.join(processedText.apply(lambda x: ' '.join(x))) #combine into one long text

In [None]:
wordcloud = WordCloud(scale = 3,
        background_color='white',
        max_words=250,
        max_font_size=40,
        colormap='BrBG',
        random_state=42                      
    ).generate(longString)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Save the wordcloud image
wordcloud_image_path = 'wordcloud.png'  # Specify the path and filename
wordcloud.to_file(wordcloud_image_path)
print(f"Wordcloud saved as {wordcloud_image_path}")

#### lemmatized

In [12]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(tokenizedText):
    filteredTokens = [lemmatizer.lemmatize(token) for token in tokenizedText]
    return filteredTokens

#processedText = processedText.apply(lemmatize)
#processedText.head()

### Modelling using LDA-gensim

In [13]:
# STEP 1: Prepare the corpus
import gensim
from gensim.utils import simple_preprocess 
# simple_preprocess:
# tokenization, lowercasing,
# filtering removes tokens that are too short (less than 3 characters) or too long (more than 15 characters).

In [15]:
content = (reviews['reviews'])
corpus = []

def preprocess_corpus(data):
    processed_corpus = data.apply(lambda x: simple_preprocess(x))
    return processed_corpus

#preprocessed_corpus = preprocess_corpus(content)

In [16]:
content.head()

0    The garage refused to investigate a warrantee ...
1    George Eden was fantastic. A pleasure to deal ...
2    Extremely poor service from Sandhurst Inchcape...
3    As always I have had a very satisfying visit t...
4    The customer agent Brook was superb. I found b...
Name: reviews, dtype: object

In [17]:
# Step2: Creating dictionary
from nltk import bigrams
from gensim import corpora, models

def create_bigrams(corpus):
    # Create a list to hold the bigram models
    corpus_bigrams = []

    # Create bigrams for each document in the corpus
    for doc in corpus:
        doc_bigrams = list(bigrams(doc))
        doc_bigrams = [' '.join(bigram) for bigram in doc_bigrams]  # Convert bigrams to strings
        corpus_bigrams.append(doc_bigrams)

    return corpus_bigrams

def create_dict(corpus):
    # create dict using the preprocessed corpus
    dictionary = corpora.Dictionary(corpus)
    
    # create bag-of-words representation of the corpus
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    
    # create tf-idf model and convert the bow vector to tfidf vectors
    tfidf_model = models.TfidfModel(bow_corpus)
    tfidf_corpus = tfidf_model[bow_corpus]
    
    corpus_vecs = tfidf_corpus
    return dictionary, corpus_vecs,tfidf_model

In [18]:
# Apply the preprocessing steps to the corpus
preprocessed_corpus = preprocess_corpus(content)
preprocessed_corpus = preprocessed_corpus.apply(removeStopwords)
preprocessed_corpus = preprocessed_corpus.apply(lemmatize)

# Create the bigrams in the corpus
corpus_bigrams = create_bigrams(preprocessed_corpus)

# Create the dictionary and TF-IDF corpus with bigrams
dictionary, corpus_vecs, tfidf_model = create_dict(corpus_bigrams)

> By incorporating the TF-IDF transformation, you can assign higher weights to terms that are important in a particular document while downweighting terms that are common across multiple documents.

In [19]:
# Step 3: Build LDA model

def train_lda_model(corpus, num_topics):
    # Train lda model on tf-idf corpus
    lda_model = models.LdaModel(corpus=corpus,
                                num_topics=num_topics,
                                id2word=dictionary,
                                passes=20)
    
    return lda_model

In [26]:
%%time
num_topics = 5

# Train the LDA model
lda_model_gensim = train_lda_model(corpus_vecs, num_topics)

# Print the topics and their corresponding keywords
# for topic_id, topic_words in lda_model_gensim.print_topics():
#    print(f"Topic #{topic_id+1}: {topic_words}")

CPU times: user 8.2 s, sys: 64.6 ms, total: 8.26 s
Wall time: 8.27 s


In [21]:
def generate_topics_df(lda_model, corpus_vecs, num_topics):
    # Assign topics to documents
    document_topics = []
    for i, doc in enumerate(corpus_vecs):
        doc_topics = lda_model.get_document_topics(doc)
        document_topics.append([prob for _, prob in doc_topics])

    # Convert document_topics into a DataFrame
    topics_df = pd.DataFrame(document_topics)

    # Rename the columns to represent topics
    topics_df.columns = [f"Topic_{i+1}" for i in range(num_topics)]
    
    return topics_df

In [22]:
topics_df = generate_topics_df(lda_model_gensim, corpus_vecs, num_topics)

In [28]:
# Merge topics_df with existing reviews DataFrame
processedReviews = preprocessed_corpus.rename('processedReviews')

reviewsNtopics = pd.concat([reviews['reviews'], processedReviews, topics_df], axis=1)
reviewsNtopics.head()

Unnamed: 0,reviews,processedReviews,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5
0,The garage refused to investigate a warrantee ...,"[garage, refused, investigate, warrantee, faul...",0.032351,0.032379,0.032404,0.032345,0.870521
1,George Eden was fantastic. A pleasure to deal ...,"[george, eden, fantastic, pleasure, deal, comm...",0.038741,0.038458,0.038488,0.038683,0.845629
2,Extremely poor service from Sandhurst Inchcape...,"[extremely, poor, service, sandhurst, inchcape...",0.02679,0.893041,0.026728,0.026735,0.026705
3,As always I have had a very satisfying visit t...,"[always, satisfying, visit, inchcape, toyota, ...",0.025761,0.02618,0.02573,0.025857,0.896472
4,The customer agent Brook was superb. I found b...,"[customer, agent, brook, superb, found, manage...",0.037013,0.03675,0.037153,0.037085,0.852


In [29]:
# Step 4: Analyze topics

# Get the top keywords for each topic
# topics = lda_model_gensim.show_topics(num_topics=num_topics, num_words=20)

In [30]:
# Step 5: Visualize

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import webbrowser

# Visualize the topics
vis_data = gensimvis.prepare(lda_model_gensim, corpus_vecs, dictionary)

# Convert the document-topic assignments to a format suitable for visualization
vis_data = pyLDAvis.gensim_models.prepare(lda_model_gensim, corpus_vecs, dictionary)

pyLDAvis.save_html(vis_data, 'lda_visualization.html')
webbrowser.open('lda_visualization.html', new=2)

True

## Summarize using Hugging Face Models 

In [31]:
from transformers import pipeline

In [None]:
#model_name = "bert-base-uncased"
#model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
#tokenizer = AutoTokenizer.from_pretrained("model_name")
#summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [32]:
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

In [34]:
ARTICLE = ' '.join(preprocessed_corpus.apply(lambda x: ' '.join(x)))
#ARTICLE = ARTICLE[:1000]
len(ARTICLE)

286048

In [45]:
%%time
print(summarizer(ARTICLE[:5000], max_length=150, min_length=30, do_sample=False))

[{'summary_text': 'There was a six-week wait for a car to be delivered. Garage refused to investigate warrantee fault. The car was delivered on time, but it was damaged. It was repaired and sold to a dealership.'}]
CPU times: user 56.4 s, sys: 26.7 s, total: 1min 23s
Wall time: 11.4 s


## Create summary for each topic

In [46]:
#create separate dataframes for each topic
threshold = 1 / num_topics

topic_dataframes = {}

for column in topics_df.columns:
    topic_reviews = reviewsNtopics[reviewsNtopics[column] >= threshold]  # Filter reviews with score greater than or equal to threshold
    topic_dataframes[column] = topic_reviews

In [47]:
topic_dataframes['Topic_2']

Unnamed: 0,reviews,processedReviews,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5
2,Extremely poor service from Sandhurst Inchcape...,"[extremely, poor, service, sandhurst, inchcape...",0.026790,0.893041,0.026728,0.026735,0.026705
11,I purchased a used car from Cheshire Oaks Audi...,"[purchased, used, car, cheshire, oak, audi, se...",0.051484,0.790643,0.053999,0.051937,0.051937
13,"Failed home delivery twice , salesperson faile...","[failed, home, delivery, twice, salesperson, f...",0.045138,0.819447,0.045138,0.045138,0.045138
14,"Had a service @ Landrover, Chester yesterday Y...","[service, landrover, chester, yesterday, yd, n...",0.044727,0.820816,0.044765,0.044738,0.044954
15,Sold a car in the past couple of weeks - Inchc...,"[sold, car, past, couple, week, inchcape, gave...",0.022636,0.908674,0.022896,0.023127,0.022668
...,...,...,...,...,...,...,...
1258,Please take car when buying an older vehicle f...,"[please, take, car, buying, older, vehicle, ga...",0.023976,0.904087,0.024005,0.023966,0.023966
1260,Preston Jaguar Land Rover : One of the worst b...,"[preston, jaguar, land, rover, one, worst, buy...",0.021038,0.916035,0.020963,0.020969,0.020995
1272,Inchcape Stockport VW Garage:\nTerrible Experi...,"[inchcape, stockport, vw, garage, terrible, ex...",0.036236,0.855022,0.036246,0.036181,0.036316
1294,"Best service I have had in general for years, ...","[best, service, general, year, year, lady, ser...",0.033093,0.867390,0.033237,0.033011,0.033269


In [50]:
def generate_random_summary(corpus, length_limit):
    if len(corpus) == 0:
        return "Corpus is empty."

    article = ' '.join(corpus.apply(lambda x: ' '.join(x)))
    #article = ' '.join(corpus)
    article_length = len(article)

    if article_length <= length_limit:
        start_idx = 0
        end_idx = article_length
    else:
        start_idx = random.randint(0, article_length - length_limit)
        end_idx = start_idx + length_limit
    
    article_chunk = article[start_idx:end_idx]
    
    # Generate the summary using the pre-initialized summarizer pipeline
    summary = summarizer(article_chunk, max_length=100, min_length=30, do_sample=False)
    
    return summary[0]['summary_text']

# Function to generate random summaries for each topic
def generate_random_topic_summaries(topic_dataframes, length_limit):
    topic_summaries = {}

    for topic, dataframe in topic_dataframes.items():
        reviews = dataframe['processedReviews']
        random_summary = generate_random_summary(reviews, length_limit)
        topic_summaries[topic] = random_summary

    return topic_summaries

In [51]:
length_limit = 500  # Adjust the length limit as needed
topic_summaries = generate_random_topic_summaries(topic_dataframes, length_limit)

# Display the random summaries
for topic, summary in topic_summaries.items():
    print(f"Summary for {topic}: {summary}")

Your max_length is set to 100, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 100, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 100, but you input_length is only 97. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 100, but you input_length is only 74. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 100, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


Summary for Topic_1: Audi dealer ignored car develops fault and didn't fix it. They called back on Wednesday morning and fixed the problem. The previous dealership had already given previous review.
Summary for Topic_2: Clude warranty provider is refusing to repair bodywork on a customer's car. The last email regarding bodywork damage was sent before the previous service.
Summary for Topic_3: Dan Paul's son bought a new volkswagen at random drop in Altrincham. The car was well-serviced and the service was good.
Summary for Topic_4: The warrington car repair exceeded expectations and the work will be charged. The service engineer working on the car previously failed to reserve part allocated to a different branch, so it had to be reordered and delayed.
Summary for Topic_5: Autotrader advert state bmw series fully prepared ready next journey delivered location collected whenever suit state within mile informed mile away. Richard charge service helpful knowledgeable approachable.
