# POS taggers NLTK and StanfordNLP

## NLTK

### Read in Data

In [1]:
import pandas as pd
import re
import string
import nltk
# pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

data = pd.read_csv("../../../dataset/bamboo.csv")
# data.columns = ['title', 'description']

In [2]:
data.head()

Unnamed: 0,issuekey,title,description,storypoint
0,BAM-65,Allows CVS repo to timeout and report on locki...,"Sometimes, when you perform a CVS action you g...",3
1,BAM-932,allow a build to be placed at the head of the ...,,2
2,BAM-3476,Result not saved when checkout fails,Happened yesterday on our hosted bamboo: https...,2
3,BAM-3612,Talkback from elastic agent to bamboo server t...,When an elastic instance/agent is configured t...,2
4,BAM-3616,"Users can see the maven module (groupid, artif...",* Should respect the sub-working directory. On...,5


### Create function to remove punctuation, tokenize, remove stopwords, and stem
### Create function to calculate the processing time

In [3]:
import time

def clean_text(text):
    try:
        text = "".join([word.lower() for word in text if word not in string.punctuation])
        tokens = re.split('\W+', text)
        # text = [ps.stem(word) for word in tokens if word not in stopwords]
        text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
        
        trial = ""
        for t in text:
            trial = trial + t + " "
        text = nltk.word_tokenize(trial)
        new_text = nltk.pos_tag(text)

        return new_text
    except:
        return ""

def clean_text_time(text):
    start_time = time.time()
    clean_text(text)
    return time.time() - start_time

In [4]:
data['title_clean'] = data['title'].apply(lambda x: clean_text(x))
data['title_clean_time'] = data['title'].apply(lambda x: clean_text_time(x))

data['description_clean'] = data['description'].apply(lambda x: clean_text(x))
data['description_clean_time'] = data['description'].apply(lambda x: clean_text_time(x))

data.head()

Unnamed: 0,issuekey,title,description,storypoint,title_clean,title_clean_time,description_clean,description_clean_time
0,BAM-65,Allows CVS repo to timeout and report on locki...,"Sometimes, when you perform a CVS action you g...",3,"[(allows, NNS), (cv, VBP), (repo, VB), (timeou...",0.001,"[(sometimes, RB), (perform, VB), (cv, JJ), (ac...",0.001994
1,BAM-932,allow a build to be placed at the head of the ...,,2,"[(allow, JJ), (build, NN), (placed, VBN), (hea...",0.000995,,0.0
2,BAM-3476,Result not saved when checkout fails,Happened yesterday on our hosted bamboo: https...,2,"[(result, NN), (saved, VBD), (checkout, NN), (...",0.0,"[(happened, VBN), (yesterday, NN), (hosted, VB...",0.028921
3,BAM-3612,Talkback from elastic agent to bamboo server t...,When an elastic instance/agent is configured t...,2,"[(talkback, NN), (elastic, JJ), (agent, NN), (...",0.001005,"[(elastic, JJ), (instanceagent, NN), (configur...",0.001995
4,BAM-3616,"Users can see the maven module (groupid, artif...",* Should respect the sub-working directory. On...,5,"[(user, NN), (see, VB), (maven, JJ), (module, ...",0.000992,"[(respect, NN), (subworking, VBG), (directory,...",0.001996


In [5]:
start_time = time.time()
text = nltk.word_tokenize(data["description"][0])
print(nltk.pos_tag(text))
print("This line took", time.time() - start_time, "to process")

[('Sometimes', 'RB'), (',', ','), ('when', 'WRB'), ('you', 'PRP'), ('perform', 'VBP'), ('a', 'DT'), ('CVS', 'NNP'), ('action', 'NN'), ('you', 'PRP'), ('get', 'VBP'), ('something', 'NN'), ('like', 'IN'), ('{', '('), ('noformat', 'JJ'), ('}', ')'), ('cvs', 'JJ'), ('update', 'JJ'), (':', ':'), ('[', 'JJ'), ('01:38:32', 'CD'), (']', 'NN'), ('waiting', 'VBG'), ('for', 'IN'), ('mchai', 'NN'), ("'s", 'POS'), ('lock', 'NN'), ('in', 'IN'), ('/cvsroot/atlassian/maven2test/bamboo', 'NNP'), ('{', '('), ('noformat', 'JJ'), ('}', ')'), ('so', 'IN'), ('Bamboo', 'NNP'), ('would', 'MD'), ('probably', 'RB'), ('just', 'RB'), ('hang', 'NN'), ('and', 'CC'), ('become', 'VB'), ('not', 'RB'), ('so', 'RB'), ('happy', 'JJ'), ('.', '.'), ('We', 'PRP'), ('should', 'MD'), ('allow', 'VB'), ('Bamboo', 'NNP'), ('to', 'TO'), ('timeout', 'VB'), (',', ','), ('or', 'CC'), ('conditionally', 'RB'), ('stop', 'JJ'), ('and', 'CC'), ('tell', 'VB'), ('the', 'DT'), ('user', 'NN'), ('how', 'WRB'), ('to', 'TO'), ('dix', 'VB'), ('t

### Process all files and add to one dataframe

In [6]:
import os
print(os.listdir("../../../dataset/"))

['appceleratorstudio.csv', 'aptanastudio.csv', 'bamboo.csv', 'clover.csv', 'datamanagement.csv', 'duracloud.csv', 'jirasoftware.csv', 'mesos.csv', 'moodle.csv', 'mule.csv', 'mulestudio.csv', 'springxd.csv', 'talenddataquality.csv', 'talendesb.csv', 'titanium.csv', 'usergrid.csv']


In [7]:
def process_csv(file_path):
    start_time = time.time()
    data = pd.read_csv("../../../dataset/" + file_path)
    data['title_clean'] = data['title'].apply(lambda x: clean_text(x))
    data['title_clean_time'] = data['title'].apply(lambda x: clean_text_time(x))

    data['description_clean'] = data['description'].apply(lambda x: clean_text(x))
    data['description_clean_time'] = data['description'].apply(lambda x: clean_text_time(x))

    print(file_path, "took", time.time() - start_time, "to process")
    return data

In [11]:
df = pd.DataFrame(columns=['issuekey', 'title', 'description', 'storypoint', 'title_clean',
       'title_clean_time', 'description_clean', 'description_clean_time'])
for file_path in os.listdir("../../../dataset/"):
    df = df.append(process_csv(file_path), ignore_index=True)

appceleratorstudio.csv took 28.64676570892334 to process
aptanastudio.csv took 8.057582139968872 to process
bamboo.csv took 4.083427429199219 to process
clover.csv took 3.4450652599334717 to process
datamanagement.csv took 27.13540005683899 to process
duracloud.csv took 4.266796112060547 to process
jirasoftware.csv took 2.698770761489868 to process
mesos.csv took 26.03762984275818 to process
moodle.csv took 8.585415124893188 to process
mule.csv took 6.176243543624878 to process
mulestudio.csv took 6.331482410430908 to process
springxd.csv took 24.67861533164978 to process
talenddataquality.csv took 10.21130084991455 to process
talendesb.csv took 7.940899848937988 to process
titanium.csv took 28.877580642700195 to process
usergrid.csv took 3.1930084228515625 to process


In [12]:
df.head()

Unnamed: 0,issuekey,title,description,storypoint,title_clean,title_clean_time,description_clean,description_clean_time
0,TISTUD-6,Add CA against object literals in function inv...,{html}<div><p>The idea here is that if our met...,1,"[(add, NN), (ca, MD), (object, VB), (literal, ...",0.00103,"[(htmldivpthe, NN), (idea, NN), (metadata, NN)...",0.001
1,TISTUD-9,Update branding for Appcelerator plugin to App...,"{html}<div><p>At least fix feature icons, asso...",1,"[(update, JJ), (branding, NN), (appcelerator, ...",0.000998,"[(htmldivpat, NN), (least, JJS), (fix, JJ), (f...",0.00196
2,TISTUD-11,Create new JSON schema for SDK team,{html}<div><p>Create JSON schema containing pr...,1,"[(create, VB), (new, JJ), (json, NN), (schema,...",0.000999,"[(htmldivpcreate, NN), (json, NN), (schema, NN...",0.001994
3,TISTUD-13,Create Project References Property Page,{html}<div><p>Create property page for project...,1,"[(create, NN), (project, NN), (reference, NN),...",0.0,"[(htmldivpcreate, NN), (property, NN), (page, ...",0.002027
4,TISTUD-16,New Desktop Project Wizard,{html}<div><p>Desktop (need to convert existin...,1,"[(new, JJ), (desktop, NN), (project, NN), (wiz...",0.000995,"[(htmldivpdesktop, JJ), (need, NN), (convert, ...",0.002003


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23313 entries, 0 to 23312
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   issuekey                23313 non-null  object 
 1   title                   23313 non-null  object 
 2   description             21082 non-null  object 
 3   storypoint              23313 non-null  object 
 4   title_clean             23313 non-null  object 
 5   title_clean_time        23313 non-null  float64
 6   description_clean       23313 non-null  object 
 7   description_clean_time  23313 non-null  float64
dtypes: float64(2), object(6)
memory usage: 1.4+ MB
