# Data Preprocressing - Aspect Extraction

In [1]:
# pip install fasttext

In [2]:
# pip install stanfordnlp

In [3]:
import stanfordnlp
import pandas as pd
import numpy as np
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/lawrencecurran/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
df = pd.read_csv('../datasets/reviews.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,body,rating,product
0,0,Great wireless music,Product as expecred,5.0,Apple Airpods-Latest Model
1,1,battery life dies really quick,i really like my airpods but the battery life ...,4.0,Apple Airpods-Latest Model
2,2,The audio sound muffled (right ear),I bought Airpods 2 on Amazon. But right ears w...,3.0,Apple Airpods-Latest Model
3,3,Comprarlo sin problemas,Perfect,5.0,Apple Airpods-Latest Model
4,4,good！,so,5.0,Apple Airpods-Latest Model


In [6]:
# Making the body both the titleand the actual body of the review
df['body'] = df['title'] + ' ' + df['body'] 

In [7]:
# Using RegEx to 
df['body'] = df['body'].apply(lambda x: x.lower())
# remove punctuation from text
df['body'] = df['body'].str.replace('[^\w\s]','')

In [8]:
df.drop(columns=['Unnamed: 0', 'title'], inplace=True)

In [9]:
df.head()

Unnamed: 0,body,rating,product
0,great wireless music product as expecred,5.0,Apple Airpods-Latest Model
1,battery life dies really quick i really like m...,4.0,Apple Airpods-Latest Model
2,the audio sound muffled right ear i bought air...,3.0,Apple Airpods-Latest Model
3,comprarlo sin problemas perfect,5.0,Apple Airpods-Latest Model
4,good so,5.0,Apple Airpods-Latest Model


In [10]:
from collections import Counter

In [11]:
# Creating a dictionary of the most common words
word_dict = Counter(" ".join(df['body']).split(' ')).items()

In [12]:
# creating a dataframe with the entire corpus 
counter = pd.DataFrame(word_dict, columns=['word', 'count'])
counter.head()

Unnamed: 0,word,count
0,great,12568
1,wireless,3327
2,music,2949
3,product,9625
4,as,7832


In [13]:
# Looking into the different words that I plan to use as the aspects to draw sentiment analysis on
counter.groupby('count').max()

Unnamed: 0_level_0,word
count,Unnamed: 1_level_1
1,⁹
2,ıts
3,ı
4,único
5,yanking
...,...
29573,is
35205,to
41417,and
44047,i


In [14]:
counter.loc[counter['word']=='battery']

Unnamed: 0,word,count
6,battery,5034


In [15]:
counter.loc[counter['word']=='cancellation']

Unnamed: 0,word,count
721,cancellation,1301


In [16]:
counter.loc[counter['word']=='quality']

Unnamed: 0,word,count
64,quality,13184


In [17]:
counter.loc[counter['word']=='price']

Unnamed: 0,word,count
356,price,4709


In [18]:
counter.loc[counter['word']=='noise']

Unnamed: 0,word,count
539,noise,2996


In [19]:
counter.loc[counter['word']=='fit']

Unnamed: 0,word,count
717,fit,3300


In [20]:
counter.loc[counter['word']=='comfort']

Unnamed: 0,word,count
2027,comfort,310


In [21]:
counter.loc[counter['word']=='comfortable']

Unnamed: 0,word,count
435,comfortable,1573


In [22]:
counter.loc[counter['word']=='not']

Unnamed: 0,word,count
75,not,14806


In [23]:
# From 'a dash of data' - making sure the data is fully cleaned
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [24]:
df['body'] = df.body.apply(round1)

In [25]:
df.head()

Unnamed: 0,body,rating,product
0,great wireless music product as expecred,5.0,Apple Airpods-Latest Model
1,battery life dies really quick i really like m...,4.0,Apple Airpods-Latest Model
2,the audio sound muffled right ear i bought air...,3.0,Apple Airpods-Latest Model
3,comprarlo sin problemas perfect,5.0,Apple Airpods-Latest Model
4,good so,5.0,Apple Airpods-Latest Model


In [26]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [27]:
# Getting rid of numbers

In [28]:
df['body'] = df.body.apply(round2)

In [37]:
# Getting rid of the non-English words (there are a few spanish and other language reviews in the corpus)
words = set(nltk.corpus.words.words())

" ".join(w for w in nltk.wordpunct_tokenize(str(df['body'])) \
         if w.lower() in words or not w.isalpha())


'0 great wireless music product as 1 battery life really quick i really like m ... 2 the audio sound muffled right ear i bought air ... 3 sin perfect 4 good so ... 31863 good good 31864 amazing product an amazing product but a bit c ... 31865 not bad sound 31866 a good product the sound is good battery life ... 31867 average n name m writing ... Name : body , Length : 31868 , : object'

In [38]:
# Saving cleaned dataframe
df.to_csv('../datasets/df_clean.csv', index=False)

In [39]:
# Creating a document term matrix using CountVectorizer and eliminating 
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
df_cv = cv.fit_transform(df.body)
df_dtm = pd.DataFrame(df_cv.toarray(), columns=cv.get_feature_names())
df_dtm.index = df.index
df_dtm

Unnamed: 0,aa,aaa,aaaaalmost,aaaandthe,aac,aacept,aachaar,aam,aata,aatisfied,...,ಇದ,ಇದನನ,ಚನನಗದ,ತಬ,ದಯವಟಟ,ಬಕ,ಬಲ,ಬಲಟತ,ಸಪರ,ಹಣಕಕ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31866,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Transposing to make it easier to do analysis on
df_dtm = df_dtm.transpose()
df_dtm.head(25)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31858,31859,31860,31861,31862,31863,31864,31865,31866,31867
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaaalmost,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaandthe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aac,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aacept,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aachaar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aam,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aata,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aatisfied,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
