In [3]:
# Experiments with text pre-processing for bug descriptions (trying out some NLP techniques)

import itertools
import os
import re

import MySQLdb
import nltk
import pandas
from bs4 import BeautifulSoup

In [4]:
def rm_cite(raw_text):
    """Strip quoted-reply blocks from a bug comment.

    A block starts at a line beginning with "(In reply to comment" and
    continues through the immediately following lines that begin with ">".
    A line beginning with ">" that does not directly continue such a block
    is kept as ordinary text.

    Args:
        raw_text: The comment text; it is split with ``splitlines()``.

    Returns:
        The surviving lines joined with "\\n". Original line endings and a
        trailing newline, if any, are not preserved.
    """
    kept = []
    in_cite = False
    for line in raw_text.splitlines():
        if line.startswith("(In reply to comment"):
            # Header of a quoted reply: drop it and start dropping ">" lines.
            in_cite = True
            continue
        if line.startswith(">"):
            if in_cite:
                continue
            # A ">" line outside a reply block falls through and is kept.
        else:
            # Any other line ends the current reply block.
            in_cite = False
        kept.append(line)
    return '\n'.join(kept)
    

In [5]:
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup

def preprocess_strict( raw_description, output):
    """Reduce a raw bug description to lower-case, stop-word-free stems.

    "Strict" in the sense that every character other than ASCII letters is
    discarded; a dot survives only when it starts a ``.c``/``.h``/``.cpp``/
    ``.py`` suffix.

    Args:
        raw_description: Raw comment text.
        output: ``"list"`` to get the stems as a list; any other value
            returns them joined by single spaces.

    Returns:
        A list of stems, or one space-separated string, depending on
        ``output``.
    """
    # Remove cites
    description = rm_cite(raw_description)

    # Remove urls
    text = re.sub(r"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)", " ", description)

    # Remove non-letters (dots are kept here and handled on the next line)
    letters_only = re.sub(r"[^a-zA-Z\.]", " ", text)
    # Blank every dot except one that begins a source-file extension followed
    # by whitespace or the end of the text. The previous look-ahead ended in
    # `\s+$`, which only matched when the extension was the very last token of
    # the whole text, so names such as "foo.c" anywhere else lost their dot.
    letters_only = re.sub(r"\.(?!(?:c|h|cpp|py)(?:\s|$))", " ", letters_only)

    # Convert to lower case, tokenize (by sentence first, then by word)
    words = [word for sent in nltk.sent_tokenize(letters_only.lower()) for word in nltk.word_tokenize(sent)]

    # Remove stop words
    stopwords = set(nltk.corpus.stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stopwords]

    # Stemming
    snowball = SnowballStemmer("english")
    stems = [snowball.stem(w) for w in meaningful_words]

    if output == "list":
        return stems
    return " ".join(stems)

In [5]:
def tokenize_and_stem( raw_description ):
    """Strip markup, tokenize, drop stop words and stem a raw description.

    Args:
        raw_description: Text to process; it is passed through BeautifulSoup
            first so any markup is reduced to its text content.

    Returns:
        A list of Snowball (English) stems, one for each token that contains
        at least one ASCII letter and is not an English stop word.
    """
    text = BeautifulSoup(raw_description).get_text()
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]

    stopwords = set(nltk.corpus.stopwords.words('english'))
    # The text is not lower-cased before tokenizing, so compare in lower case;
    # otherwise capitalised stop words ("The", "And", ...) slip through.
    meaningful_tokens = [token for token in filtered_tokens if token.lower() not in stopwords]

    snowball = SnowballStemmer("english")
    # Stem the stop-word-filtered tokens. Previously the unfiltered list was
    # stemmed, so the stop-word removal above had no effect on the result.
    return [snowball.stem(token) for token in meaningful_tokens]

In [6]:
def tokenizestem_to_string( raw_description ):
    """Return the stems of ``raw_description`` as one space-separated string.

    Delegates all processing to ``tokenize_and_stem`` and only joins its
    result.
    """
    stems = tokenize_and_stem(raw_description)
    separator = " "
    return separator.join(stems)

In [7]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the values this cell used to
# hard-code, which keeps existing runs working unchanged.
# NOTE(review): remove the password fallback once BUGDATA_DB_PASSWD is set
# wherever this notebook is executed.
conn_data = MySQLdb.connect(
    host=os.environ.get('BUGDATA_DB_HOST', '10.117.8.41'),
    port=int(os.environ.get('BUGDATA_DB_PORT', '3306')),
    user=os.environ.get('BUGDATA_DB_USER', 'root'),
    passwd=os.environ.get('BUGDATA_DB_PASSWD', 'vmware'),
    db=os.environ.get('BUGDATA_DB_NAME', 'bugdata'))
cur_data = conn_data.cursor()

# All long-description comments of one bug, used as sample input below.
sql_ld = '''SELECT bug_id, thetext
FROM longdescs
WHERE bug_id = 943195'''

df = pandas.io.sql.read_sql(sql_ld, conn_data)

# Dump every comment preceded by its row index and a "*****" separator.
# A single-argument print(...) behaves the same under Python 2 and Python 3;
# the trailing spaces reproduce the layout of the former comma-separated print.
for idx, row in df.iterrows():
    print("%s \n***** \n%s" % (idx, row['thetext']))

0 
***** 
SR ID: SR  12224870309
Customer: IBM / GMT (+1 DST) / cool
Product Version/Build Numbers: VMware Tools 9.0.0 build-782409 
Regressed from previous release?: NA
Frequency: repruducible 
Time of Failure/Last Occurrence: NA
Time Skew between various machines?: NA
Recent Changes to the Environment? Upgrade to 5.1 GA
Date Opened in Support: 2012-09-24
Guest: Ubuntu 10.04 (2.6.32-24-generic #39-Ubuntu SMP)
Server IP/Hostname: NA
Problem description: Snapshot task for a Linux VM fails with

"An error occurred while quiescing the virtual machine. See the virtual machine's event log for details."

According to the customer, this only happens for 5.1, not for an 5.0 or 4.x Tools (which makes sense, because as far as I can see, the "new" freeze is only in the most recent Tools),

Impact: IBM is concerned that their customers will have issues with their backup software, which by default sets the quiesce flag.
Which VMs in questions have problems: .ova attached to the PR
Guest OS VMX File

In [49]:
# Show comment #3 one line at a time, each followed by a "***" marker, so that
# line boundaries (including blank lines) are visible in the output.
for line in df.loc[3, 'thetext'].splitlines():
    print("%s ***" % line)

# Show the first comment as stored, then the result of the strict
# pre-processing on it. (rm_cite and tokenizestem_to_string were also tried
# on these comments while experimenting.)
print(df.loc[0, 'thetext'])
print(preprocess_strict(df.loc[0, 'thetext'], "string"))

(In reply to comment #0) ***
> Which VMs in questions have problems: .ova attached to the PR ***
> Guest OS VMX File Path: .ova attached to the PR ***
 ***
What are the login credentials (root/???) for the attached .ova VM? ***
SR ID: SR  12224870309
Customer: IBM / GMT (+1 DST) / cool
Product Version/Build Numbers: VMware Tools 9.0.0 build-782409 
Regressed from previous release?: NA
Frequency: repruducible 
Time of Failure/Last Occurrence: NA
Time Skew between various machines?: NA
Recent Changes to the Environment? Upgrade to 5.1 GA
Date Opened in Support: 2012-09-24
Guest: Ubuntu 10.04 (2.6.32-24-generic #39-Ubuntu SMP)
Server IP/Hostname: NA
Problem description: Snapshot task for a Linux VM fails with

"An error occurred while quiescing the virtual machine. See the virtual machine's event log for details."

According to the customer, this only happens for 5.1, not for an 5.0 or 4.x Tools (which makes sense, because as far as I can see, the "new" freeze is only in the most recent T

In [6]:
def preprocess_mild( raw_description):
    """Clean a raw bug description, printing each stage, and return its stems.

    "Mild" in the sense that only tokens containing no letters at all are
    dropped; tokens such as "usb3.0" or "windows8-64" are kept.

    Stages: remove quoted replies (``rm_cite``), blank out URLs and non-ASCII
    characters, tokenize, lower-case, drop letterless tokens and English stop
    words, stem.

    Prints the citation-free text, the surviving tokens and the stems, each
    of the latter two joined by spaces.

    Args:
        raw_description: Raw comment text.

    Returns:
        The list of stems. (Earlier versions only printed and returned
        nothing; callers that ignore the return value are unaffected.)
    """
    # Remove cites
    description = rm_cite(raw_description)
    print(description)

    # Remove urls, then anything outside the ASCII range
    text = re.sub(r"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)", " ", description)
    text = re.sub(r"[^\x00-\x7f]", " ", text)

    # Tokenize (by sentence, then by word) and convert to lower case.
    # Lower-casing happens per token, after sentence splitting, so the
    # stop-word check below also catches capitalised words such as "The".
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]

    # Remove stop words
    stopwords = set(nltk.corpus.stopwords.words('english'))
    meaningful_tokens = [token for token in filtered_tokens if token not in stopwords]
    print(" ".join(meaningful_tokens))

    # Stemming
    snowball = SnowballStemmer("english")
    stems = [snowball.stem(t) for t in meaningful_tokens]
    print(" ".join(stems))

    return stems

In [10]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the values this cell used to
# hard-code, which keeps existing runs working unchanged.
# NOTE(review): remove the password fallback once BUGDATA_DB_PASSWD is set
# wherever this notebook is executed.
conn_data = MySQLdb.connect(
    host=os.environ.get('BUGDATA_DB_HOST', '10.117.8.41'),
    port=int(os.environ.get('BUGDATA_DB_PORT', '3306')),
    user=os.environ.get('BUGDATA_DB_USER', 'root'),
    passwd=os.environ.get('BUGDATA_DB_PASSWD', 'vmware'),
    db=os.environ.get('BUGDATA_DB_NAME', 'bugdata'))
cur_data = conn_data.cursor()

# All comments of a second sample bug, fetched through the raw cursor.
sql_ld = '''SELECT thetext
FROM longdescs
WHERE bug_id = 794510'''
cur_data.execute(sql_ld)
text = cur_data.fetchall()

# For each comment: a quoted "*****" separator, the raw text, then the staged
# output printed by preprocess_mild. Single-argument print(...) behaves the
# same under Python 2 and Python 3.
for item in text:
    print('''"*****"''')
    print(item[0])
    preprocess_mild(item[0])

"*****"
ESX Information:
----------------
Type : Thin
Build : ESX 5.0U1 536624
Test Machine : cpdbj-252.eng.vmware.com(10.117.9.133)
Login Credentials : root/ca$h...
Guest VM: windows8-64
Client OS: windowsxp sp2 32bit


Test case details:
----
1. Create windows8 Virtual machine and install windows8 developer preview build.
   xHCI USB controller is added by default.
2. Install VMware tools.
3. Connected USB3.0 devices to client OS.
   The devices are Kinston DT ultimate USB3.0 flash driver and buslink USB3.0 hard driver.
   Both devices works well for read/write sanity test on physical machine.
4. Client connect the USB3.0 devices to the guest.
   ==> failed
       And the devices also get disconnected from the client OS.

Expected result: USB3 devices should be OK to add to the VM.
Actual result: USB3 devices cannot be connected to VM.

Captured Logs and their location:
---
Will put usbarb log and vi-client log from client OS to the ftp-access.
ESX Information:
----------------
Type 