Information about YAKE!: https://github.com/LIAAD/yake

The lower the score, the more relevant the keyword is.

In [11]:
"""
To-do:

 [+] Add preprocessing that Michael created
 [+] Create a csv/json file that has ticket_id and tags assigned to it
 [+] Do analysis with one word only (not two)
 
"""

import yake
import json
import re
import operator
import csv

## Get data from csv and put it into the dictionary

In [12]:
csv_file_path = 'Data/tickets_clean.csv'
json_file_path = 'data/ticket_data.json'

data = []
with open(csv_file_path, encoding="utf8") as csv_file:
    csv_reader = csv.DictReader(csv_file)
    
    for rows in csv_reader:
        ticket = {}
        
        ticket["id"] = rows['id']
        ticket["title"] = rows['title']
        ticket["content"] = rows['content']
        
        data.append(ticket)
        
with open(json_file_path, 'w') as json_file:
    json_file.write(json.dumps(data))

## Function that implements YAKE!

In [13]:
def get_keywords(text):
    
    language = "en"
    max_ngram_size = 1
    deduplication_thresold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 20

    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    
    return keywords

## Get keywords, create 2 files: file with ticket id and it's tags and file that has a set of teh most common tags

In [14]:
"""
for each ticket in the json file:
    - Combine title and content
    - Get tags
    - Add to the dict
"""
tags_dict = {}
tickets_with_tags = {}

for i in range(20000):
    text = data[i]['title'] + ' ' + data[i]['content']
    
    #print(text)
    try:
        tags_list = get_keywords(text)
    except:
        print('An error occurred.')
        continue
    #print(tags_list)
    
    tags_for_ticket = []
    for tag_value_pair in tags_list:
        tag = tag_value_pair[0]
        value = tag_value_pair[1]
         
        if value <= 0.1:
            tags_for_ticket.append(tag)
            
            if tag in tags_dict:
                tags_dict[tag] += 1
            else:
                tags_dict[tag] = 1
            
    
    tickets_with_tags[data[i]['id']] = tags_for_ticket

#print(tickets_with_tags)


In [15]:
for i in range(20000):
    text = data[i]['title'] + ' ' + data[i]['content']
    
    #print(text)
    try:
        tags_list = get_keywords(text)
    except:
        print('An error occurred.')
        continue
    #print(tags_list)
    
    tags_for_ticket = []
    for tag_value_pair in tags_list:
        tag = tag_value_pair[0]
        value = tag_value_pair[1]
         
        if value <= 0.1:
            tags_for_ticket.append(tag)
            
            if tag in tags_dict:
                tags_dict[tag] += 1
            else:
                tags_dict[tag] = 1
            
    
    tickets_with_tags[data[i]['id']] = tags_for_ticket

In [16]:
with open('YAKE_tickets-tags_one_word.json', 'w') as file:
    file.write(json.dumps(tickets_with_tags))

## Show all data 

In [17]:
sorted_list = sorted(tags_dict.items(), key=operator.itemgetter(1), reverse = True)

print("{:<35} {:<7}".format('tag', 'count'))

for k, v in sorted_list:
    print("{:<35} {:<7}".format(k, v))

tag                                 count  
firefox                             15730  
open                                4666   
page                                4134   
version                             3560   
work                                3272   
window                              3214   
tab                                 3024   
download                            2924   
problem                             2538   
update                              2320   
click                               2294   
time                                2060   
site                                1812   
upgrade                             1810   
bookmark                            1804   
file                                1714   
install                             1658   
browser                             1576   
bar                                 1560   
run                                 1468   
search                              1442   
load                            

dated                               6      
mile                                6      
destroy                             6      
publish                             6      
depend                              6      
reused                              6      
chug                                6      
mind                                6      
react                               6      
minus                               6      
extentions                          6      
hyper                               6      
joke                                6      
firesfox                            6      
television                          6      
rows                                6      
germany                             6      
ist                                 6      
hebrew                              6      
doesnot                             6      
transitional                        6      
dtd                                 6      
emac                            

installed.thanksyves                2      
elderly                             2      
tco                                 2      
thankslion                          2      
basiv                               2      
vulnerability                       2      
exploit                             2      
johndavisgallery.com                2      
page.                               2      
weak                                2      
prevous                             2      
kan                                 2      
premier                             2      
madden                              2      
nelson                              2      
pcwindows                           2      
ramquad                             2      
processorfirefox                    2      
visata                              2      
alltogether                         2      
succession                          2      
intentional                         2      
thermal                         

clusty.com                          2      
unlockable                          2      
now.fix                             2      
ditch                               2      
ie.right                            2      
can.thats                           2      
region.propertiesgoogle             2      
methodology                         2      
sixcore                             2      
help.thank                          2      
etrust                              2      
sophos                              2      
floyd                               2      
foolish                             2      
brill                               2      
optus                               2      
myzoo                               2      
days.firstly                        2      
shock                               2      
knology                             2      
savings.com                         2      
distiller                           2      
hibernate                       

individually                        2      
tiak                                2      
layoutunder                         2      
bookbag                             2      
webstie                             2      
bookbags                            2      
launcher                            2      
coat                                2      
ntlm                                2      
uris                                2      
installed.when                      2      
angel                               2      
risk                                2      
unemployment                        2      
previeous                           2      
uncategorized                       2      
clickability                        2      
fatal                               2      
tools.php                           2      
captia                              2      
intalled                            2      
globally                            2      
birthday                        

## Show the most useful tags only

In [18]:
with open('YAKE_ALL_tags_one_word.txt', 'w') as file:
    file.write(json.dumps(tags_dict))
    

usefull_tag_dict = {}

for key, value in tags_dict.items():
    if value > 10:
        usefull_tag_dict[key] = value

print(len(usefull_tag_dict))
        
with open('YAKE_tags_one_word.txt', 'w') as file:
    file.write(json.dumps(usefull_tag_dict))

1409


In [19]:
sorted_list = sorted(usefull_tag_dict.items(), key=operator.itemgetter(1), reverse = True)

print("{:<35} {:<7}".format('tag', 'count'))

for k, v in sorted_list:
    print("{:<35} {:<7}".format(k, v))

tag                                 count  
firefox                             15730  
open                                4666   
page                                4134   
version                             3560   
work                                3272   
window                              3214   
tab                                 3024   
download                            2924   
problem                             2538   
update                              2320   
click                               2294   
time                                2060   
site                                1812   
upgrade                             1810   
bookmark                            1804   
file                                1714   
install                             1658   
browser                             1576   
bar                                 1560   
run                                 1468   
search                              1442   
load                            

unsupported                         26     
bbc                                 26     
pointer                             26     
thx                                 26     
john                                26     
winxp                               26     
verizon                             26     
consume                             24     
messenger                           24     
regularly                           24     
reach                               24     
zynga                               24     
debian                              24     
pic                                 24     
hyperlink                           24     
pass                                24     
stuff                               24     
filemanager.js                      24     
daily                               24     
path                                24     
suite                               24     
snow                                24     
pron                            