# Epinions Text Classification

# Huy Mai

## Goal: To be able to classify with a high accuracy if a review is about a car or a camera and also to analyze what words are important in a car review and a camera review

In [1]:
import pandas as pd
import nltk
import string
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import operator

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Naive Bayes Classifier

In [2]:
# import epinions.com csv file downloaded from http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/epinions.zip
# http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/
# the dataset contains 6000 reviews on either a car or a camera from epinions.com
# I believe this is data set is useful because 6000 reviews is a fairly large amount and these are real reviews from a review website

# use pandas to read in dataset
review = pd.read_csv('./epinions-1.csv')

In [3]:
# outputs some of the dataset's reviews 
review

Unnamed: 0,class,text
0,Auto,I have recently purchased a J30T with moderat...
1,Camera,This camera is perfect for anyone who wants t...
2,Auto,2000 Hyundai Elantra Wagon if you can find ...
3,Camera,I bought this product because I need instant ...
4,Camera,Before I begin my objective review I should ...
...,...,...
5995,Camera,Last week my Wife and I ran out of 35mm film...
5996,Auto,The major love of our family is to have time ...
5997,Camera,Purchased this product from an Ebay auction f...
5998,Auto,I have a 1990 Geo Metro The car runs great a...


In [4]:
# create two lists that contain all reviews and all labels (auto / camera)
review_list = list(review.text)
# have to use review['class'] since class is a reserved word in python
label_list = list(review['class'])

In [5]:
# preprocess text
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer= PorterStemmer()
# remove numbers
for i in range(len((review_list))):
        review_list[i] = re.sub(r'\d+', '', review_list[i])
# remove punctuation
for i in range(len((review_list))):
        review_list[i] = review_list[i].translate(str.maketrans('','', string.punctuation))
# tokenize reviews
review_list = [word_tokenize(m) for m in review_list]
# lowercase reviews
review_list = [[w.lower() for w in m] for m in review_list]
review_list[0]

['i',
 'have',
 'recently',
 'purchased',
 'a',
 'jt',
 'with',
 'moderate',
 'miles',
 'i',
 'shopped',
 'for',
 'a',
 'car',
 'that',
 'was',
 'looked',
 'after',
 'and',
 'paid',
 'more',
 'for',
 'it',
 'as',
 'a',
 'former',
 'auto',
 'technician',
 'i',
 'know',
 'what',
 'to',
 'look',
 'for',
 'and',
 'what',
 'to',
 'expect',
 'yet',
 'every',
 'time',
 'i',
 'drive',
 'this',
 'car',
 'i',
 'say',
 'to',
 'my',
 'self',
 'dam',
 'that',
 's',
 'a',
 'great',
 'car',
 'and',
 'yes',
 'i',
 'truly',
 'do',
 'the',
 'car',
 'affords',
 'a',
 'level',
 'of',
 'style',
 'and',
 'comfort',
 'that',
 'is',
 'beyond',
 'compare',
 'for',
 'the',
 'price',
 'i',
 'had',
 'looked',
 'at',
 'a',
 'number',
 'of',
 'j',
 's',
 'with',
 'some',
 'having',
 'covered',
 'k',
 'miles',
 'i',
 'checked',
 'them',
 'to',
 'see',
 'where',
 'the',
 'problems',
 'show',
 'they',
 'were',
 'great',
 'other',
 'than',
 'just',
 'basic',
 'owner',
 'neglect',
 'the',
 'car',
 'is',
 'extremely',
 '

In [6]:
# create a new list that removes the stop words in the reviews
no_stop_review_list = []
for m in review_list:
    stop_review = [w for w in m if w not in stop]
    no_stop_review_list.append(stop_review)

In [7]:
no_stop_review_list[0]

['recently',
 'purchased',
 'jt',
 'moderate',
 'miles',
 'shopped',
 'car',
 'looked',
 'paid',
 'former',
 'auto',
 'technician',
 'know',
 'look',
 'expect',
 'yet',
 'every',
 'time',
 'drive',
 'car',
 'say',
 'self',
 'dam',
 'great',
 'car',
 'yes',
 'truly',
 'car',
 'affords',
 'level',
 'style',
 'comfort',
 'beyond',
 'compare',
 'price',
 'looked',
 'number',
 'j',
 'covered',
 'k',
 'miles',
 'checked',
 'see',
 'problems',
 'show',
 'great',
 'basic',
 'owner',
 'neglect',
 'car',
 'extremely',
 'quiet',
 'smooth',
 'yet',
 'distance',
 'mild',
 'degree',
 'sportiness',
 'performance',
 'smooth',
 'yet',
 'impressive',
 'yet',
 'power',
 'well',
 'handled',
 'braking',
 'suspension',
 'systems',
 'similar',
 'refinement',
 'appointments',
 'available',
 'car',
 'somewhat',
 'sparse',
 'yet',
 'standard',
 'quality',
 'functionality',
 'apparent',
 'bose',
 'sound',
 'system',
 'example',
 'remarkable',
 'output',
 'clarity',
 'without',
 'fancy',
 'dials',
 'complaint',
 

In [8]:
#lemmatize and stem
# lemmatization
lem_review_list = []
for m in no_stop_review_list:
    lem_review = [lemmatizer.lemmatize(w) for w in m]
    lem_review_list.append(lem_review)

# stemming
stem_review_list = []
for m in lem_review_list:
    stem_review = [stemmer.stem(w) for w in m]
    stem_review_list.append(stem_review)
stem_review_list[0]

['recent',
 'purchas',
 'jt',
 'moder',
 'mile',
 'shop',
 'car',
 'look',
 'paid',
 'former',
 'auto',
 'technician',
 'know',
 'look',
 'expect',
 'yet',
 'everi',
 'time',
 'drive',
 'car',
 'say',
 'self',
 'dam',
 'great',
 'car',
 'ye',
 'truli',
 'car',
 'afford',
 'level',
 'style',
 'comfort',
 'beyond',
 'compar',
 'price',
 'look',
 'number',
 'j',
 'cover',
 'k',
 'mile',
 'check',
 'see',
 'problem',
 'show',
 'great',
 'basic',
 'owner',
 'neglect',
 'car',
 'extrem',
 'quiet',
 'smooth',
 'yet',
 'distanc',
 'mild',
 'degre',
 'sporti',
 'perform',
 'smooth',
 'yet',
 'impress',
 'yet',
 'power',
 'well',
 'handl',
 'brake',
 'suspens',
 'system',
 'similar',
 'refin',
 'appoint',
 'avail',
 'car',
 'somewhat',
 'spars',
 'yet',
 'standard',
 'qualiti',
 'function',
 'appar',
 'bose',
 'sound',
 'system',
 'exampl',
 'remark',
 'output',
 'clariti',
 'without',
 'fanci',
 'dial',
 'complaint',
 'k',
 'mile',
 'lack',
 'fold',
 'rear',
 'seat',
 'slightli',
 'high',
 'fue

In [9]:
# filter reviews by auto
auto_review = review.loc[review['class'] == 'Auto']

In [10]:
auto_review

Unnamed: 0,class,text
0,Auto,I have recently purchased a J30T with moderat...
2,Auto,2000 Hyundai Elantra Wagon if you can find ...
9,Auto,I have owned my Buick since 53000 km and I am...
13,Auto,ok this really isn t my vehicle it s my dad s...
17,Auto,I have had this car for over 4 years now I h...
...,...,...
5992,Auto,I won my Mustang GT in the Ford Mustang 35th ...
5993,Auto,I ve long considered reviewing my 1999 Honda...
5996,Auto,The major love of our family is to have time ...
5998,Auto,I have a 1990 Geo Metro The car runs great a...


In [11]:
# create a list of auto reviews
auto_review_list = list(auto_review.text)

In [12]:
# concatenate all words of the auto reviews together
auto_review_words = ' '.join(auto_review_list)

In [13]:
# preprocess auto words
# remove numbers
auto_review_words = re.sub(r'\d+', '', auto_review_words)
# remove punctuation
auto_review_words = auto_review_words.translate(str.maketrans('','', string.punctuation))
# lowercase
auto_review_words = auto_review_words.lower()
# tokenize
all_auto_tokens = word_tokenize(auto_review_words)
# remove stop words
all_auto_tokens = [w for w in all_auto_tokens if w not in stop]
# lemmatize
all_auto_lems = [lemmatizer.lemmatize(w) for w in all_auto_tokens]
# stem
all_auto_stems = [stemmer.stem(w) for w in all_auto_lems]

In [14]:
# create a frequency distibution of words of all auto reviews
# word_features = top 100 most common words of auto reviews
all_auto_words = nltk.FreqDist(w for w in all_auto_stems)
word_features = list(all_auto_words)[:100]

In [15]:
word_features

['car',
 'drive',
 'seat',
 'like',
 'get',
 'one',
 'look',
 'engin',
 'would',
 'vehicl',
 'back',
 'well',
 'good',
 'also',
 'year',
 'time',
 'power',
 'new',
 'problem',
 'mile',
 'go',
 'br',
 'great',
 'use',
 'even',
 'rear',
 'much',
 'make',
 'comfort',
 'quot',
 'front',
 'wheel',
 'feel',
 'littl',
 'road',
 'want',
 'thing',
 'door',
 'truck',
 'realli',
 'first',
 'control',
 'handl',
 'model',
 'need',
 'interior',
 'still',
 'ride',
 'lot',
 'ga',
 'price',
 'two',
 'buy',
 'system',
 'come',
 'driver',
 'love',
 'v',
 'better',
 'take',
 'nice',
 'could',
 'room',
 'bought',
 'speed',
 'around',
 'b',
 'work',
 'way',
 'got',
 'ford',
 'featur',
 'think',
 'peopl',
 'suv',
 'brake',
 'side',
 'light',
 'replac',
 'option',
 'purchas',
 'dealer',
 'turn',
 'never',
 'transmiss',
 'put',
 'sinc',
 'honda',
 'window',
 'know',
 'mani',
 'long',
 'though',
 'mileag',
 'small',
 'sound',
 'seem',
 'tire',
 'enough',
 'say']

In [16]:
# feature extractor if the top 100 words of auto reviews are in the total reviews
def features(document):
    document_words = set(document)
    features = {}
    for word in document:
        features['contains({})'.format(word)] = (word in word_features)
    #features['has_number()'] = any(word.isdigit() for word in document_words)
    return features

In [17]:
features(stem_review_list[9])

{'contains(own)': False,
 'contains(buick)': False,
 'contains(sinc)': True,
 'contains(km)': False,
 'contains(approach)': False,
 'contains(must)': False,
 'contains(say)': True,
 'contains(nicest)': False,
 'contains(car)': True,
 'contains(driven)': False,
 'contains(previous)': False,
 'contains(grand)': False,
 'contains(compar)': False,
 'contains(also)': True,
 'contains(test)': False,
 'contains(malibu)': False,
 'contains(rent)': False,
 'contains(lincoln)': False,
 'contains(continent)': False,
 'contains(think)': True,
 'contains(almost)': False,
 'contains(vehicl)': True,
 'contains(quit)': False,
 'contains(pleasantli)': False,
 'contains(surpris)': False,
 'contains(basic)': False,
 'contains(featur)': True,
 'contains(model)': True,
 'contains(indic)': False,
 'contains(light)': True,
 'contains(low)': False,
 'contains(fuel)': False,
 'contains(washer)': False,
 'contains(fluid)': False,
 'contains(chang)': False,
 'contains(engin)': True,
 'contains(oil)': False,
 'co

In [18]:
# create a list that match the review with the auto/camera label
review_list = list(zip(stem_review_list, label_list))

In [19]:
review_list[0]

(['recent',
  'purchas',
  'jt',
  'moder',
  'mile',
  'shop',
  'car',
  'look',
  'paid',
  'former',
  'auto',
  'technician',
  'know',
  'look',
  'expect',
  'yet',
  'everi',
  'time',
  'drive',
  'car',
  'say',
  'self',
  'dam',
  'great',
  'car',
  'ye',
  'truli',
  'car',
  'afford',
  'level',
  'style',
  'comfort',
  'beyond',
  'compar',
  'price',
  'look',
  'number',
  'j',
  'cover',
  'k',
  'mile',
  'check',
  'see',
  'problem',
  'show',
  'great',
  'basic',
  'owner',
  'neglect',
  'car',
  'extrem',
  'quiet',
  'smooth',
  'yet',
  'distanc',
  'mild',
  'degre',
  'sporti',
  'perform',
  'smooth',
  'yet',
  'impress',
  'yet',
  'power',
  'well',
  'handl',
  'brake',
  'suspens',
  'system',
  'similar',
  'refin',
  'appoint',
  'avail',
  'car',
  'somewhat',
  'spars',
  'yet',
  'standard',
  'qualiti',
  'function',
  'appar',
  'bose',
  'sound',
  'system',
  'exampl',
  'remark',
  'output',
  'clariti',
  'without',
  'fanci',
  'dial',
 

In [20]:
# shuffle the list and use the feature extractor on all reviews
random.shuffle(review_list)
featuresets = [(features(review), label) for (review,label) in review_list]

In [21]:
# create training set (80% of reviews) and test set (20% of reviews)
train_reviews = featuresets[:int(len(featuresets) * 0.8)]
test_reviews = featuresets[int(len(featuresets) * 0.8 + 1):]

In [22]:
train_reviews[0]

({'contains(agfa)': False,
  'contains(ephoto)': False,
  'contains(best)': False,
  'contains(decent)': False,
  'contains(buy)': True,
  'contains(accept)': False,
  'contains(qualiti)': False,
  'contains(also)': True,
  'contains(adjust)': False,
  'contains(camera)': False,
  'contains(store)': False,
  'contains(le)': False,
  'contains(pictur)': False,
  'contains(bit)': False,
  'contains(annoy)': False,
  'contains(like)': True,
  'contains(option)': True,
  'contains(though)': True,
  'contains(sometim)': False,
  'contains(crystal)': False,
  'contains(clear)': False,
  'contains(time)': True,
  'contains(flash)': False,
  'contains(help)': False,
  'contains(group)': False,
  'contains(self)': False,
  'contains(photo)': False,
  'contains(lcd)': False,
  'contains(liquid)': False,
  'contains(display)': False,
  'contains(take)': True,
  'contains(larg)': False,
  'contains(downsid)': False,
  'contains(eat)': False,
  'contains(batteri)': False,
  'contains(crazi)': False

In [23]:
# use naive bayes classifier on training set
classifier = nltk.NaiveBayesClassifier.train(train_reviews)

In [24]:
# print the accuracy of the naive bayes classifier on the test set
print(nltk.classify.accuracy(classifier, test_reviews))

# accuracy is around 99-100%
# ~ 5995 / 6000 reviews classified correct

1.0


In [25]:
classifier.show_most_informative_features(10)

Most Informative Features
        contains(camera) = False          Camera : Auto   =    437.5 : 1.0
    contains(photograph) = False          Camera : Auto   =    409.8 : 1.0
            contains(pc) = False          Camera : Auto   =    310.4 : 1.0
           contains(suv) = True             Auto : Camera =    277.9 : 1.0
          contains(film) = False          Camera : Auto   =    274.2 : 1.0
        contains(toyota) = False            Auto : Camera =    273.3 : 1.0
         contains(honda) = True             Auto : Camera =    270.0 : 1.0
       contains(shutter) = False          Camera : Auto   =    262.9 : 1.0
        contains(vehicl) = True             Auto : Camera =    228.8 : 1.0
        contains(camera) = None             Auto : Camera =    226.6 : 1.0


# Semantic analysis

In [26]:
# I will use the TF - IDF algorithm for the semantic analysis

In [27]:
## TF - IDF - to find what are the most important words in each review

# reviewlist
review_list = list(review.text)
# label list
labels = list(review['class'])

vocabulary = set()
for doc in review_list:
    vocabulary.update(doc.split())

vocabulary = list(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}

tfidf = TfidfVectorizer(vocabulary=vocabulary)

tfidf.fit(review_list)
tfidf.transform(review_list)

# will zip important_word and the label to a list
word_label_tuple = []
#stores word with the highest score for the corresponding review
important_word = []

for doc in review_list:
    score={}
    X = tfidf.transform([doc])
    for word in doc.split():
        score[word] = X[0, tfidf.vocabulary_[word]]
    sortedscore = sorted(score.items(), key=operator.itemgetter(1), reverse=True)
    # append the highest score to important_word
    important_word.append(sortedscore[0][0])
# zip important_word and labels to list
word_label_tuple = list(zip(important_word, labels))

  "Upper case characters found in"


In [28]:
# create a important word - Auto tuple list
word_auto_tuple = [(w, l) for (w, l) in word_label_tuple if l == 'Auto']

In [29]:
# make list of the important words seperately
auto_tuple_words = [''.join(w) for (w, l) in word_auto_tuple]
# remove stop words from the list
auto_tuple_words = [w for w in auto_tuple_words if w not in stop]

In [30]:
# create frequency distribution of words for auto_tuple_words
auto_words_freq = nltk.FreqDist(w for w in auto_tuple_words)
# top 100 important words for auto reviews
top_auto_words = list(auto_words_freq)[:100]
top_auto_words
# numbers are the model years, wheels of the car (146s, 1992, etc)
# words relate to type of vehicle, car brands, parts of cars (sedan, saturn, tires, etc)

['car',
 'truck',
 'van',
 'quot',
 'vehicle',
 'jeep',
 'mustang',
 '000',
 'cab',
 'neon',
 '146s',
 'blazer',
 'cavalier',
 'amp',
 '911',
 'bed',
 'wagon',
 'eclipse',
 'saturn',
 '1994',
 '626',
 'suburban',
 'convertible',
 'oil',
 '1992',
 'mpg',
 'college',
 '4x4',
 'roadster',
 'diesel',
 'minivans',
 'winter',
 'sedan',
 'caravan',
 'tires',
 'definately',
 'civic',
 'trooper',
 'prius',
 'gasket',
 'maxima',
 'repair',
 'domestic',
 'pickup',
 'station',
 'probe',
 'wrangler',
 '2001',
 'minivan',
 'ford',
 'quattro',
 'corvette',
 'escort',
 'cup',
 'freeway',
 'bumper',
 '85',
 'hummer',
 'belts',
 'mercedes',
 'drink',
 'fuel',
 'conversion',
 'husband',
 'vitara',
 'dont',
 'haul',
 '133',
 '300',
 '1997',
 'cars',
 'subaru',
 'accident',
 'trucks',
 'contour',
 'navigator',
 'explorer',
 'firebird',
 'seat',
 'transmission',
 'chevy',
 'showroom',
 'miles',
 'villager',
 '146t',
 'windstar',
 'mpv',
 'replaced',
 'yet',
 'company',
 'cav',
 'dunes',
 '35th',
 'honestly'

In [31]:
# create a important word - Camera tuple list
word_camera_tuple = [(w, l) for (w, l) in word_label_tuple if l == 'Camera']

In [32]:
# make list of the important words seperately
camera_tuple_words = [''.join(w) for (w, l) in word_camera_tuple]
# remove stop words from the list
camera_tuple_words = [w for w in camera_tuple_words if w not in stop]

In [33]:
# create frequency distribution of words for camera_tuple_words
camera_words_freq = nltk.FreqDist(w for w in camera_tuple_words)
# top 100 important words for camera reviews
top_camera_words = list(camera_words_freq)[:100]
top_camera_words
# numbers are the model numbers of the camera (d30, 880, etc)
# words relate to cameras, photos, file formats, computer parts, software

['camera',
 'quot',
 'pictures',
 'pics',
 'content',
 'href',
 'floppy',
 '146s',
 'photos',
 'images',
 '950',
 '640',
 '780c',
 'cam',
 'flash',
 'mp3',
 '149',
 'amp',
 '880',
 'compression',
 '280',
 '6800',
 'firmware',
 '3000',
 'unit',
 'video',
 '4700',
 'zoom',
 '990',
 'pix',
 '2040',
 'camara',
 '1680',
 'disk',
 'picture',
 'photo',
 '260',
 'tk',
 'webcam',
 '210',
 'beep',
 '775',
 '2400',
 'mm',
 'digital',
 'mavica',
 'jewelry',
 '5400',
 'ebay',
 'smart',
 'u30',
 'watch',
 'chip',
 '1700',
 'kodak',
 's110',
 'shooting',
 '00',
 '3030',
 'volts',
 'sent',
 'batteries',
 '2100',
 'lenses',
 'shipped',
 'disks',
 'gives',
 'sec',
 '5m',
 '4500',
 'staff',
 'upload',
 '10',
 'megabyte',
 'software',
 'docking',
 '460',
 'mc3',
 'mega',
 'meg',
 'com',
 '700',
 'movies',
 'clips',
 '800',
 '2020',
 'memory',
 'flashcard',
 'casio',
 '151',
 'diskette',
 'reader',
 '1800',
 '171',
 '730',
 'resolution',
 'film',
 'mail',
 'd30',
 'lens']