In [5]:
import glob
import urllib.parse
import bs4
import dateparser
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import re
import spacy
import gensim.corpora as corpora
from pprint import pprint
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import numpy as np
from gensim.models.wrappers import LdaMallet
import os 

## Code given before assignment:

In [6]:
threads = []
stop_words = stopwords.words('english')
stop_words.extend(['to', 'on', 'a', 'i', 'is', 'my', 'the', 'it', 'and', 'or', 'at'])
for i,fname in enumerate(glob.glob("data/*")):
    with open(fname, "rb") as f:
        soup = bs4.BeautifulSoup(f)
        if not soup.select(".Message"):
            continue
        
        users = soup.find_all(class_="Author")
        author = users[0].get_text()

        comments = []
        document = ''
        for message in soup.select(".Message"):
            comment = dict(content=message.text.strip())
            document += ' ' + message.text.strip()
            comments.append(comment)
        
        question = comments[0]['content']
        # Process, remove punctuation, tokenize, set to lower case and lemmatize
        processed_question = [lemmatizer.lemmatize(w.lower()) for w in nltk.word_tokenize(re.sub('[,.!?)(:<>\'-]', '', question)) if w.lower() not in stop_words]
        processed_document = [lemmatizer.lemmatize(w.lower()) for w in nltk.word_tokenize(re.sub('[,.!?)(:<>\'-]', '', document)) if w.lower() not in stop_words]
            
        threads.append(
            dict(
                filepath=fname,
                soup=soup,
                url="https://" + urllib.parse.unquote(fname).split("https://")[1],
                created_at=dateparser.parse(soup.select(".DateCreated")[0].text.split("Posted on")[1]),
                title=soup.select("h3")[0].text.strip(),
                author_username=author,
                comments=comments,
                processed_question = processed_question,
                processed_document = processed_document
            )
        )
        

### Summarise threads from 2019 into, both processed and original:

In [7]:
# Create list with all processed threads from 2019:
processed_questions = []
processed_docs = []
questions = []
rest_of_thread = []
for thread in threads:
    # Skip threads if not from 2019
    if thread["created_at"].year != 2019:
        continue
    processed_docs.append(thread["processed_document"])
    processed_questions.append(thread["processed_question"])
    questions.append(thread["comments"][0]['content'])
    answers = ''
    for comment in thread["comments"][1:]:
        answers += '\n' + '- ' + comment['content']
    try:
        rest_of_thread.append(answers)
    except:
        rest_of_thread.append('')

## Task 1 Exploration

### 1 a) List all threads with problems of connecting TV to WI-FI:

In [8]:
wifi_count = 0
for thread_id, doc in enumerate(processed_questions):
    if 'wifi' in doc and 'tv' in doc:
        #print('\n' + 50*'*' + ' Thread ' + str(thread_id) +' ' + 50*'*')
        #print('-', questions[thread_id])
        #print(100*'-')
        #print('-', rest_of_thread[thread_id][2:])
        wifi_count += 1
print('Number of threads about wifi connection: ', wifi_count)

Number of threads about wifi connection:  25


#### Manual check concludes that only the questions below actually concern connecting to wifi:

In [9]:
connecting_to_wifi = [49, 92, 113, 124, 174, 199, 271, 287, 292, 396, 428, 432, 468, 477]
for thread in connecting_to_wifi:
    print('\n' + 50*'*' + ' Thread ' + str(thread) +' ' + 50*'*')
    print('-', questions[thread])
    print(100*'-')
    print('-', rest_of_thread[thread][2:])


************************************************** Thread 49 **************************************************
- HiI have an lg smart tv I bought it a while ago, and now The Wi-Fi doesn't connect to my router at the same time. The air mouse doesn't work. Pleaee help me.
----------------------------------------------------------------------------------------------------
-  Yousuf_Ayad18, what is the model number to your TV?

************************************************** Thread 92 **************************************************
- I bought the 43UK6400PLF yesterday. The TV has serious issues to connect to WiFi on a Virgin Media Hub 3.0 router. Sometimes it will not connect at all, sometimes it connects but keeps dropping. All my other devices connect to the router without any problem on 2.4Ghz and 5GHz. (I tested the Amazon Fire Stick, two Samsung mobiles phones, iPad, Lenovo Laptop and all connect fine).Here is what I did so far:- Reseted the router.- Updated the TV to the late

### 1 b) Responses that would have a one-in-four chance of being accepted:
#### Given the sucess metrics defined in the assignment only thread 292 would be accepted:
"Monika, if your cellphone has Hotspot, turn it on and try connecting the TV to it. If the TV connects to the hotspot, then reset the router again. If it doesn't connect to your hotspot, service would be needed to repair."

### 1 c) Best answer string for wifi connection problems:

"Dear customer, if your cellphone has Hotspot, turn it on and try connecting the TV to it. If it doesn't connect to your hotspot, or the connection is still inconsistent, please contact us to schedule a service appointment. If the TV does connect to your hotspot, and even after resetting the router it won't connect to wifi, please contact your network provider."

### 1 d) Threads that would accept answer above:
The answer above might be accepted by in threads: 113, 124, 287, 292, 396, 428, 432, 477


## Task 2 Prototype

### I start by moddelling different topics that occur in all threads from 2019 using LDA.

In [10]:
## Create Dictionary
id2word = corpora.Dictionary(processed_questions)
# Create Corpus
texts = processed_questions
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

#### Create LDA model:

In [32]:
# Using the Mallet LDA package
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# Install Mallet and unzip in directory
# This is needed: (Might need to restart kernel)
os.environ.update({'MALLET_HOME':r'C:/Users/jonastjomsland/mavenoid/mallet-2.0.8/'})
# And move mallet to that location (this work for mee, took some time to figure out)
mallet_path = '/Users/jonastjomsland/mavenoid/mallet-2.0.8/bin/mallet' 

n_topics = 20
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=n_topics, id2word=id2word)

#### Assign every question to its most probable topic and list the keywords of that topic:

In [33]:
# Taken from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=question):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=questions)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
display(df_dominant_topic)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,14.0,0.0904,"tv, lg, bought, cable, hooked, 55, yesterday, ...",2019 TV 82UM8070PUA does not recognize Audio D...
1,1,3.0,0.1511,"time, year, month, day, xbox, ago, game, start...",I got my TV (LG55um7400plb) a couple of days a...
2,2,18.0,0.1009,"’, app, lg, hulu, find, content, webos, suppor...",looking at model 49UM69. wondering if anyone ...
3,3,9.0,0.0958,"tv, turn, receiver, turned, back, manually, au...",Have HDD to 60JS8000 TV USB port. HDD has two...
4,4,18.0,0.0989,"’, app, lg, hulu, find, content, webos, suppor...",I just purchase a LG tv and I can’t get the AT...
...,...,...,...,...,...
474,474,17.0,0.0945,"show, setting, device, reset, menu, access, we...",I have an UM7300 series HDTV that I signed up ...
475,475,5.0,0.0922,"tv, connection, 1, hd, ultra, streaming, inter...","HiWe have a 2017 model 42"" LG WebOS 4K HD TV. ..."
476,476,6.0,0.0758,"hdmi, sound, soundbar, arc, audio, connected, ...",Once sound is muted the remote becomes unrespo...
477,477,8.0,0.0944,"tv, connect, connected, wifi, device, laptop, ...",Gave up on using the wifi on this expensive pi...


#### Find the most frequent topics:

In [34]:
# Number of Documents for Each Topic
topic_counts = df_dominant_topic.pivot_table(index=['Dominant_Topic'], aggfunc='size')

# Keywords in each topic
topic_keywords = []
for topic in range(n_topics):
    topic_keywords.append(df_dominant_topic[df_dominant_topic.Dominant_Topic == topic].Keywords.unique())
    
# Most frequent topics and keywords:
df_frequent_topics = pd.DataFrame(columns = ['Number of threads in topic', 'Keywords'])
df_frequent_topics['Number of threads in topic'] = topic_counts.values
df_frequent_topics['Keywords'] = topic_keywords
df_frequent_topics = df_frequent_topics.sort_values(by=['Number of threads in topic'], ascending=False)
df_frequent_topics = df_frequent_topics.reset_index()
#df_frequent_topics = df_frequent_topics.iloc[:,1:]
display(df_frequent_topics.head(6))
#print(df_frequent_topics.Keywords[0:6].values)

Unnamed: 0,index,Number of threads in topic,Keywords
0,2,44,"[remote, button, box, set, magic, power, contr..."
1,11,42,"[screen, picture, dark, black, light, image, i..."
2,18,41,"[’, app, lg, hulu, find, content, webos, suppo..."
3,7,36,"[problem, video, netflix, watching, update, am..."
4,6,32,"[hdmi, sound, soundbar, arc, audio, connected,..."
5,8,29,"[tv, connect, connected, wifi, device, laptop,..."


#### Investigate the questions and answers of the most frequent topics:

In [44]:
# Skip most frequent topic number 4 and use the next one instead, too similar to topic 1 
# - (Could maybe have been solved with further tuning of n_topics)
top_5_topics = [2, 11, 18, 6, 8]
topic_df = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == top_5_topics[0]]
#for index, row in topic_df.iterrows():
    #print(40*'*' + ' Thread id: ' + str(row['Document_No']) + ' ' + 40*'*')
    #print(row['Text'])
    #print(rest_of_thread[index])
    #print(100*'-')

### 2 a) 5 answer strings:
Manually looking through some of the answers in the top five topics, landed on the following answer strings for the different topics.

#### Topic 2 - Regarding issues with a remote, often the Magic remote.
    Keywords : ['remote, button, magic, work, control, working, volume, brand, press, replace']
    Answer: 
    "Hi, there seems to be a problem with your remote control. Are you able to control what you want using the buttons on the tv? If so, try changing the batteries in your remote. If the problem is still there take a look at the manual for your remote and check whether it is compatible with your TV here: LINK"
    
#### Topic 11 - Related to picture quality and settings
    Keywords : ['screen, year, black, month, dark, ago, image, 5, white, left']
    Answer: 
    "Hi, you seem to have a problem with your screen or picture. If you have a crack or any damage on the screen it will most likely require service. That is also the case if the problems can't be fixed by tuning some of the picture settings. If you haven't tried this, they can be found at Settings > All Settings > Picture > Picture Options"
    
#### Topic 18 - About which apps and 3rd party software TVs support
    Keywords : ['lg, app, webos, access, support, store, web, service, voice, version']
    Answer: 
    "Hi, you seem to have troubles using an app or other 3rd party software with your device. Please check out the list of 3rd party software we support as well as how to enable them here: LINK"

#### Topic 6 - Related to audio connection and speakers
    Keywords : ['sound, hdmi, soundbar, arc, audio, receiver, connected, bluetooth, speaker, bar']
    Answer: 
    "Hi, there seems to be a problem with an audio input, are you using the correct mode in Settings > All Settings > Sound > Sound Out? For example HDMI-ARC, Optical, or Bluetooth. If using ARC, ensure the SIMPLINK is turned on under Audio Out (Optical/HDMI ARC) to activate ARC on the TV. However, if using bluetooth, we recommend turning off SIMPLINK. If nothing of the above works when using HDMI, you can perform a reset on the HDMI ports by pressing the MUTE nutton on the LG remote 3 times. It will pull up a menu on the bottom of the screen and you would need to select AV RESET. Note that this resets the picture settings."
    
#### Topic 8 - Related to connection, either to wifi or other devices 
    Keywords : ['device, connect, tv, wifi, connected, connection, laptop, phone, router, find']
    Answer: 
    "Hi, it seems like you have problems connecting your device to either wifi or another device. 
    If wifi troubles and you have a cellphone with a hotspot, turn it on and try connecting the TV to it. If it doesn't connect to your hotspot, or the connection is still inconsistent, please contact us to schedule a service appointment. If the TV does connect to your hotspot, and even after resetting the router it won't connect to wifi, please contact your network provider. If connecting to other devices is the issue, please se the manual for connecting to other devices: LINK"
   

### 2 b) csv with answers to all threads:

In [39]:
# The 5 bot answers:
bot_answers = [" It seems like you have problems connecting your device to either wifi or another device. If wifi troubles, and you have a cellphone with a hotspot, turn it on and try connecting the TV to it. If it doesn't connect to your hotspot, or the connection is still inconsistent, please contact us to schedule a service appointment. If the TV does connect to your hotspot, and even after resetting the router it won't connect to wifi, please contact your network provider. If connecting to other devices is the issue, please se the manual for connecting to other devices: LINK",
           "Hi, you seem to have troubles using an app or other 3rd party software with your device. Please check out the list of 3rd party software we support as well as how to enable them here: LINK",
           "Hi, there seems to be a problem with your remote control. Are you able to control what you want using the buttons on the tv? If so, try changing the batteries in your remote. If the problem is still there take a look at the manual for your remote and check whether it is compatible with your TV here: LINK",
           "Hi, you seem to have a problem with your screen or picture. If you have a crack or any damage on the screen it will most likely require service. That is also the case if the problems can't be fixed by tuning some of the picture settings. If you haven't tried this, they can be found at Settings > All Settings > Picture > Picture Options",
           "Hi, there seems to be a problem with an audio input, are you using the correct mode in Settings > All Settings > Sound > Sound Out >? For example HDMI-ARC, Optical, or Bluetooth. If using ARC, ensure the SIMPLINK is turned on under Audio Out (Optical/HDMI ARC) to activate the ARC on the TV. However, if using bluetooth, we recommended to turn off SIMPLINK. If nothing of the above works when using HDMI, you can perform a reset on the HDMI ports by pressing the MUTE nutton on the LG remote 3 times. It will pull up a menu on the bottom of the screen and you would need to select AV RESET. Note that this resets the picture settings."]
bot_answer_topics = [8, 18, 0, 11, 6]
# Dataframe
answer_all_threads = pd.DataFrame(columns = ['url', 'Question', 'Answer nr.'])
# Iterate over questions
for i, row in enumerate(ldamallet[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    assigned_topic = row[0][0]
    if assigned_topic in bot_answer_topics:
        answer = assigned_topic
    else:
        answer = '-'
    answer_all_threads.loc[i] = [threads[i]['url'], questions[i], answer]
answer_all_threads.to_csv(r'bot_answers.csv', index = False)

In [40]:
# We see below that we answer 180/479 rows
display(answer_all_threads)
display(answer_all_threads[answer_all_threads['Answer nr.'] != '-'])

Unnamed: 0,url,Question,Answer nr.
0,https://lgcommunity.us.com/discussion/8533/aud...,2019 TV 82UM8070PUA does not recognize Audio D...,-
1,https://lgcommunity.us.com/discussion/8641/ver...,I got my TV (LG55um7400plb) a couple of days a...,-
2,https://lgcommunity.us.com/discussion/11398/yo...,looking at model 49UM69. wondering if anyone ...,18
3,https://lgcommunity.us.com/discussion/2579/tru...,Have HDD to 60JS8000 TV USB port. HDD has two...,-
4,https://lgcommunity.us.com/discussion/1455/tv-...,I just purchase a LG tv and I can’t get the AT...,18
...,...,...,...
474,https://lgcommunity.us.com/discussion/214/cann...,I have an UM7300 series HDTV that I signed up ...,-
475,https://lgcommunity.us.com/discussion/1627/con...,"HiWe have a 2017 model 42"" LG WebOS 4K HD TV. ...",-
476,https://lgcommunity.us.com/discussion/2742/can...,Once sound is muted the remote becomes unrespo...,6
477,https://lgcommunity.us.com/discussion/2943/syn...,Gave up on using the wifi on this expensive pi...,8


Unnamed: 0,url,Question,Answer nr.
2,https://lgcommunity.us.com/discussion/11398/yo...,looking at model 49UM69. wondering if anyone ...,18
4,https://lgcommunity.us.com/discussion/1455/tv-...,I just purchase a LG tv and I can’t get the AT...,18
8,https://lgcommunity.us.com/discussion/12291/ce...,I noticed recently the firmware was updated to...,6
9,https://lgcommunity.us.com/discussion/9219/sma...,I have a LG UF7700 I purchased at the beginnin...,18
10,https://lgcommunity.us.com/discussion/8515/can...,I am trying to figure out how to cast or mirro...,8
...,...,...,...
467,https://lgcommunity.us.com/discussion/8583/scr...,I can’t find the Disney Plus app on my LG Cont...,18
468,https://lgcommunity.us.com/discussion/2554/uk7...,"Hello, My tv decided to disconnect from the in...",8
471,https://lgcommunity.us.com/discussion/5432/dis...,The customers sent you the questions but didn’...,18
476,https://lgcommunity.us.com/discussion/2742/can...,Once sound is muted the remote becomes unrespo...,6


#### Investigate some of the questions and answers the bot can give:

In [42]:
# Investigate some of them
print('Question:')
print(answer_all_threads.iloc[2,:].Question)
print('Answer:')
print(bot_answers[bot_answer_topics.index(answer_all_threads.iloc[2,:]['Answer nr.'])])
print(100*'-')
print('Question:')
print(answer_all_threads.iloc[8,:].Question)
print('Answer:')
print(bot_answers[bot_answer_topics.index(answer_all_threads.iloc[8,:]['Answer nr.'])])
print(100*'-')
print('Question:')
print(answer_all_threads.iloc[10,:].Question)
print('Answer:')
print(bot_answers[bot_answer_topics.index(answer_all_threads.iloc[10,:]['Answer nr.'])])

Question:
looking at model 49UM69.  wondering if anyone knows if it supports Hulu live tv, or just the legacy Hulu app?thanks
Answer:
Hi, you seem to have troubles using an app or other 3rd party software with your device. Please check out the list of 3rd party software we support as well as how to enable them here: LINK
----------------------------------------------------------------------------------------------------
Question:
I noticed recently the firmware was updated to 4.26.00. Does anybody know what was fixed? I've been having an issue with the HDMI ports not being recognized as HDCP 2.2 compliant. I have the 60" 4K model number 60UF7300
Answer:
Hi, there seems to be a problem with an audio input, are you using the correct mode in Settings > All Settings > Sound > Sound Out >? For example HDMI-ARC, Optical, or Bluetooth. If using ARC, ensure the SIMPLINK is turned on under Audio Out (Optical/HDMI ARC) to activate the ARC on the TV. However, if using bluetooth, we recommended to

## Task 3 Business case
- Given the findings above I would say there are some value in implementing a simple bot based on topic moddeling. If the client is comfortable with the bot's answers being rejected ten out of eleven times, then the proposed solution would take of some of the load from the support team. Some of the decisions made might be a bit conservative, I would love to elaborate during the interviews. 
- I do think it would be difficult to automate the answers with the solution I have proposed, we automate the topics but a good answers for every topic is more challenging to generate.
- I think there are multiple aspects regarding the value that such a solution generates. 
        - First, assuming that 50 of 180 answers the bot gives is accepted (maybe optimistic) then it handles 10% of questions. It could be argued that the company could save at least 10% in labor costs on the support team. This number would most likely be higher because the bot could operate 24/7 and therefore handled substantially more than a normal support worker.
        - There are also multiple other forms of value that such a solution could generate, including improved customer satisfaction from quick replies and happier support agents. 
