In [38]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

# Location of the XML file holding the restaurant reviews used in this notebook.
xml_path = "./NLP/ABSA15_RestaurantsTrain/ABSA-15_Restaurants_Train_Final.xml"

def parse_data(xml_path):
    """Flatten an XML review file into one row per (sentence, opinion) pair.

    The document is read positionally: every child of the root is a review,
    the first child of a review holds its sentences, the first child of a
    sentence holds the text, and the second child of a sentence (when it
    exists) holds the opinions. Only the sentence text and each opinion's
    ``polarity`` attribute are kept; category, target, ids and the rest are
    ignored.

    Parameters
    ----------
    xml_path : str or path-like
        Path of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per opinion, with columns ``sentence`` and ``sentiment``.
        A sentence that has no second child contributes one row with only
        ``sentence`` set, so its ``sentiment`` is missing in the frame.
        A sentence whose opinions element is empty contributes no row.

    Raises
    ------
    IndexError
        If a review or a sentence has no child elements at all.
    KeyError
        If an opinion element has no ``polarity`` attribute.
    """
    container = []
    reviews = ET.parse(xml_path).getroot()

    for review in reviews:
        # Element.getchildren() was removed in Python 3.9; indexing and
        # iterating an Element directly is the supported equivalent.
        for sentence in review[0]:
            children = list(sentence)
            sentence_text = children[0].text

            if len(children) < 2:
                # No opinions element: keep the sentence, leave sentiment unset.
                container.append({"sentence": sentence_text})
                continue

            for opinion in children[1]:
                container.append(
                    {"sentence": sentence_text, "sentiment": opinion.attrib["polarity"]}
                )

    # A single DataFrame construction from the collected records.
    return pd.DataFrame(container)

# Parse the review file into the working DataFrame `df`.
df = parse_data(xml_path=xml_path)

# Preview: show up to the first 50 rows.
df.head(n=50)

  app.launch_new_instance()


Unnamed: 0,sentence,sentiment
0,Judging from previous posts this used to be a ...,negative
1,"We, there were four of us, arrived at noon - t...",negative
2,"They never brought us complimentary noodles, i...",negative
3,The food was lousy - too sweet or too salty an...,negative
4,The food was lousy - too sweet or too salty an...,negative
5,"After all that, they complained to me about th...",negative
6,Avoid this place!,negative
7,"I have eaten at Saul, many times, the food is ...",positive
8,Saul is the best restaurant on Smith Street an...,positive
9,The duck confit is always amazing and the foie...,positive


In [39]:
# Count missing values per column; nothing is removed in this cell.
# The rows without a sentiment are dropped in a later cell.
df.isna().sum()

sentence       0
sentiment    195
dtype: int64

In [40]:
# Shape before any cleaning.
print("Original:", df.shape)

# Drop exact duplicate rows (same sentence and same sentiment), then renumber.
dd = df.drop_duplicates().reset_index(drop=True)
print("Drop Dupicates:", dd.shape)

# Drop the rows that have no sentiment, renumber, and rebind `df`.
# NOTE: `df` is overwritten here, so re-running this cell reports the
# already-cleaned shape as "Original".
dd_dn = dd.dropna()
df = dd_dn.reset_index(drop=True)
print("Drop Nulls:", df.shape)

Original: (1849, 2)
Drop Dupicates: (1396, 2)
Drop Nulls: (1201, 2)


In [41]:
# Example: the sentence stored at index label 17.
df.loc[17, "sentence"]

'Went on a 3 day oyster binge, with Fish bringing up the closing, and I am so glad this was the place it O trip ended, because it was so great!'

In [42]:
# Pre-processing starts here: tokenization with the Natural Language Toolkit (nltk).
import nltk
from nltk.tokenize import word_tokenize
# nltk.download('punkt')  # presumably a one-time download of the tokenizer data

# Split the example sentence (index label 17) into word and punctuation tokens.
tokens = word_tokenize(df.loc[17, "sentence"])
print(tokens)

['Went', 'on', 'a', '3', 'day', 'oyster', 'binge', ',', 'with', 'Fish', 'bringing', 'up', 'the', 'closing', ',', 'and', 'I', 'am', 'so', 'glad', 'this', 'was', 'the', 'place', 'it', 'O', 'trip', 'ended', ',', 'because', 'it', 'was', 'so', 'great', '!']


In [43]:
# Stop-word removal with NLTK's English stop-word list.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# The membership test is case-sensitive: a capitalised token such as 'I'
# is kept (presumably because the list holds lower-case words only).
print (list(filter(lambda token: token not in stop_words, tokens)))


['Went', '3', 'day', 'oyster', 'binge', ',', 'Fish', 'bringing', 'closing', ',', 'I', 'glad', 'place', 'O', 'trip', 'ended', ',', 'great', '!']


In [44]:
# Second example: the sentence stored at index label 24.
df.loc[24, "sentence"]

"And I hate to say this but I doubt I'll ever go back. "

In [45]:
# Normalisation step: fold the example sentence to lower case.
lower_case = df.loc[24, "sentence"].lower()
lower_case

"and i hate to say this but i doubt i'll ever go back. "

In [46]:
# Lower-case English contractions mapped to their expanded form.
#
# Changes from the previous table:
#   * "we'll" expanded to " will", losing the pronoun; it is now "we will".
#   * "i'd" was defined twice ("i would", then "i had"), so the later entry
#     silently won. 'd is ambiguous; "would" is kept to match every other
#     'd entry in this table (he'd, she'd, they'd, we'd, who'd, you'd).
#   * "didn't" was defined twice with the same value; the repeat is removed.
appos = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "i have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Only matches when "'re" arrives as a token of its own; a
    # whitespace split never produces that.
    # NOTE(review): presumably meant for tokenizers that split the clitic off.
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
}

# Whitespace split is enough for this demo. A contraction with punctuation
# attached (for example "don't,") is not a key and is left as it is.
words = lower_case.split()

# Swap each word for its expansion when the table has one, then rebuild the sentence.
reformed = " ".join(appos.get(word, word) for word in words)
reformed

# i'll gets converted to i will

'and i hate to say this but i doubt i will ever go back.'

In [47]:
# Display the token list built earlier with word_tokenize from df.sentence[17];
# it still contains the number and punctuation tokens.
tokens

['Went',
 'on',
 'a',
 '3',
 'day',
 'oyster',
 'binge',
 ',',
 'with',
 'Fish',
 'bringing',
 'up',
 'the',
 'closing',
 ',',
 'and',
 'I',
 'am',
 'so',
 'glad',
 'this',
 'was',
 'the',
 'place',
 'it',
 'O',
 'trip',
 'ended',
 ',',
 'because',
 'it',
 'was',
 'so',
 'great',
 '!']

In [48]:
# Keep only purely alphabetic tokens; numbers and punctuation are dropped.
words = list(filter(str.isalpha, tokens))
words

['Went',
 'on',
 'a',
 'day',
 'oyster',
 'binge',
 'with',
 'Fish',
 'bringing',
 'up',
 'the',
 'closing',
 'and',
 'I',
 'am',
 'so',
 'glad',
 'this',
 'was',
 'the',
 'place',
 'it',
 'O',
 'trip',
 'ended',
 'because',
 'it',
 'was',
 'so',
 'great']

In [49]:
# The example sentence again (index label 17).
df.loc[17, "sentence"]

'Went on a 3 day oyster binge, with Fish bringing up the closing, and I am so glad this was the place it O trip ended, because it was so great!'

In [50]:
# Encode the sentiment labels as integers:
#   positive -> 2, neutral -> 1, negative -> 0; any other value becomes NaN.
# NOTE: the column is overwritten in place. Re-running this cell on the
# already-encoded column matches none of the labels and yields NaN everywhere.
df['sentiment'] = df.sentiment.map({
    'positive': 2,
    'negative': 0,
    'neutral': 1,
})
print (df['sentiment'].value_counts())
df.head(15)

2    834
0    317
1     50
Name: sentiment, dtype: int64


Unnamed: 0,sentence,sentiment
0,Judging from previous posts this used to be a ...,0
1,"We, there were four of us, arrived at noon - t...",0
2,"They never brought us complimentary noodles, i...",0
3,The food was lousy - too sweet or too salty an...,0
4,"After all that, they complained to me about th...",0
5,Avoid this place!,0
6,"I have eaten at Saul, many times, the food is ...",2
7,Saul is the best restaurant on Smith Street an...,2
8,The duck confit is always amazing and the foie...,2
9,The wine list is interesting and has many good...,2


In [51]:
# Lemmatization: the last step of the data pre-processing.
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Each entry is (positional args, keyword args) for lemmatizer.lemmatize.
#   - with no pos argument, plural words come back in singular form
#   - pos="a" treats the word as an adjective: better -> good
#   - pos 'v' treats the word as a verb: running -> run
#     (this is still the lemmatizer, not a stemmer)
lemma_examples = [
    (("cats",), {}),
    (("cacti",), {}),  # cacti is the plural form of cactus
    (("geese",), {}),
    (("rocks",), {}),
    (("python",), {}),
    (("better",), {"pos": "a"}),
    (("run",), {}),
    (("running", 'v'), {}),
]

for lemma_args, lemma_kwargs in lemma_examples:
    print(lemmatizer.lemmatize(*lemma_args, **lemma_kwargs))

cat
cactus
goose
rock
python
good
run
run
