Take the provided dataset, Tokenize, Vectorize and Train an NB Model for both Lie Dection and Sentiment. Find the top 20 most powerful Tokens and compare and contrast if a computer can learn lie dectection similar to what's been previously observed in sentiment.  

In [1]:
# Step 1 load in the data
import pandas as pd
data = pd.read_csv("deception_data_converted_final.csv", usecols=range(3), lineterminator="\n")
#review the data loaded corrected
data.head()

Unnamed: 0,lie,sentiment,review
0,f,n,'Mike\'s Pizza High Point
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,'After I went shopping with some of my friend
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...


In [11]:
#Checking the balance of sentiment tags
data['sentiment'].value_counts()

n    46
p    46
Name: sentiment, dtype: int64

In [2]:

data['review'] = data['review'].str.lower() #convert to lower case
data['review'] = data['review'].apply(lambda x: x.replace("\\", '')) # removes \
data['review'] #previews the results

0                              'mike's pizza high point
1     'i really like this buffet restaurant in marsh...
2         'after i went shopping with some of my friend
3     'olive oil garden was very disappointing. i ex...
4     'the seven heaven restaurant was never known f...
                            ...                        
87    'pastablities is a locally owned restaurant in...
88    'i like the pizza at dominoes for their specia...
89    'it was a really amazing japanese restaurant. ...
90    'how do i even pick a best experience at joe's...
91    'my sister and i ate at this restaurant called...
Name: review, Length: 92, dtype: object

In [3]:
#Tokenize
import nltk
data['review'] = data['review'].apply(lambda x: nltk.word_tokenize(x))
data['review']

0                       ['mike, 's, pizza, high, point]
1     [', i, really, like, this, buffet, restaurant,...
2     ['after, i, went, shopping, with, some, of, my...
3     ['olive, oil, garden, was, very, disappointing...
4     ['the, seven, heaven, restaurant, was, never, ...
                            ...                        
87    ['pastablities, is, a, locally, owned, restaur...
88    [', i, like, the, pizza, at, dominoes, for, th...
89    ['it, was, a, really, amazing, japanese, resta...
90    ['how, do, i, even, pick, a, best, experience,...
91    ['my, sister, and, i, ate, at, this, restauran...
Name: review, Length: 92, dtype: object

In [4]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
data['lemmaReviews'] = data['review'].apply(lambda x: [lemma.lemmatize(word) for word in x])

In [5]:
data['lemmaReviews']

0                       ['mike, 's, pizza, high, point]
1     [', i, really, like, this, buffet, restaurant,...
2     ['after, i, went, shopping, with, some, of, my...
3     ['olive, oil, garden, wa, very, disappointing,...
4     ['the, seven, heaven, restaurant, wa, never, k...
                            ...                        
87    ['pastablities, is, a, locally, owned, restaur...
88    [', i, like, the, pizza, at, domino, for, thei...
89    ['it, wa, a, really, amazing, japanese, restau...
90    ['how, do, i, even, pick, a, best, experience,...
91    ['my, sister, and, i, ate, at, this, restauran...
Name: lemmaReviews, Length: 92, dtype: object

In [6]:
from nltk.corpus import stopwords
rWords = stopwords.words('english')
rWords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
# Stopwords to remove
negative =['ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't",]
len(negative)

35

In [8]:
#Removing Negation words from Stopword List
print("Current Stop Word List: ", len(rWords))
for word in negative:
    rWords.remove(word)
print("Updated Stop Words: ", len(rWords))

Current Stop Word List:  179
Updated Stop Words:  144


In [10]:
#Checking the balance of lie tags
data['lie'].value_counts()

f    46
t    46
Name: lie, dtype: int64

In [12]:
# Reimport the dataframe, TfidfVectorizer will apply the tokenization for me.
data = pd.read_csv('deception_data_converted_final.tsv', delimiter='\t',quoting=3)
data['review'] = data['review'].str.lower() #convert to lower case
data['review'] = data['review'].apply(lambda x: x.replace("\\", '')) # removes \
data = data.sample(n = len(data['lie']),random_state=24) #Randome shuffle of the data through sampling
data['review'] #previews the results

48    'in my favorite restaurant yuenan restaurant. ...
86    'blue monkey cafe is my favorite japanese rest...
37    'the food was not bad, but the place was all n...
20    'usually, i use yelp to find restaurant. the y...
72    'stronghearts cafe is the best! the owners hav...
                            ...                        
17    'i had heard that panera bread is a good place...
87    'pastablities is a locally owned restaurant in...
64    'gannon’s isle ice cream served the best ice c...
3     'olive oil garden was very disappointing. i ex...
34    'i once went to a restaurant, which was not ve...
Name: review, Length: 92, dtype: object

In [13]:
data.head() #check that the sentiment and lie are sorted

Unnamed: 0,lie,sentiment,review
48,f,p,'in my favorite restaurant yuenan restaurant. ...
86,t,p,'blue monkey cafe is my favorite japanese rest...
37,t,n,"'the food was not bad, but the place was all n..."
20,f,n,"'usually, i use yelp to find restaurant. the y..."
72,t,p,'stronghearts cafe is the best! the owners hav...


In [14]:
#Found on Github as sample to incorporate both
#nltk.word_tokenizer and Lemmatization for call in Vectorizer
#git location: https://gist.github.com/4OH4/f727af7dfc0e6bb0f26d2ea41d89ee55

class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`',"!","?", "'", "#","@"]
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in nltk.word_tokenize(doc) if t not in self.ignore_tokens]

Tokenizer = LemmaTokenizer()

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorize = TfidfVectorizer(
    tokenizer=Tokenizer, #applies Tokenization
    stop_words=rWords, #removes stopword tokens
    ngram_range=(1,2) #creats unigrams, bigrams, and trigrams  
)
X_train = vectorize.fit_transform(data['review'])
y_sent = data['sentiment']
y_lie = data['lie']



In [16]:
#View Sparse Matrix
pd.DataFrame(X_train.toarray(), columns=vectorize.get_feature_names_out())

Unnamed: 0,#,# winning,$,$ 100,$ 5,$ 6,$ 7,%,% 's,% love,...,yelp find,yelp free,yelp would,york,york city,york known,yuenan,yuenan restaurant,’,’ isle
0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.219666,0.219666,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.097599,0.0,0.097599,0.08362,0.089421,0.0,0.000000,0.000000,0.000000,0.000000
4,0.119662,0.119662,0.097006,0.0,0.119662,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
88,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
89,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.111212,0.111212
90,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [17]:
#import the packages to build the model
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [18]:
#Build the model
#Cross Validation by 10 folds
model = MultinomialNB()
lie_pred = cross_val_predict(model, X_train, y_lie, cv=10)
sent_pred = cross_val_predict(model, X_train, y_sent, cv=10)

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_lie,lie_pred))

              precision    recall  f1-score   support

           f       0.55      0.59      0.57        46
           t       0.56      0.52      0.54        46

    accuracy                           0.55        92
   macro avg       0.55      0.55      0.55        92
weighted avg       0.55      0.55      0.55        92



In [20]:
print(classification_report(y_sent, sent_pred))

              precision    recall  f1-score   support

           n       0.82      0.80      0.81        46
           p       0.81      0.83      0.82        46

    accuracy                           0.82        92
   macro avg       0.82      0.82      0.82        92
weighted avg       0.82      0.82      0.82        92



In [21]:
lieModel = model.fit(X_train, y_lie)
sentModel = model.fit(X_train, y_sent)

In [22]:
def show_most_informative_features(vector, lm, n=20):
    feature_names = vector.get_feature_names()
    coefs_with_fns = sorted(zip(lm.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n+1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [26]:
show_most_informative_features(vectorize, lieModel)

	-8.5241	$ 100          		-7.5129	restaurant     
	-8.5241	$ 6            		-7.5227	great          
	-8.5241	$ 7            		-7.5593	best           
	-8.5241	% 's           		-7.5680	food           
	-8.5241	% service      		-7.5740	wa             
	-8.5241	&              		-7.8170	amazing        
	-8.5241	& pop          		-7.8205	good           
	-8.5241	& restaurant   		-7.8904	's             
	-8.5241	'after         		-7.8941	place          
	-8.5241	'after reading 		-7.9047	nice           
	-8.5241	'after went    		-7.9104	service        
	-8.5241	'bill          		-7.9179	fresh          
	-8.5241	'bill gate     		-7.9212	friendly       
	-8.5241	'brown         		-7.9241	friend         
	-8.5241	'brown tofu    		-7.9364	always         
	-8.5241	'd 'more       		-7.9390	price          
	-8.5241	'friday        		-7.9726	like           
	-8.5241	'friday worse  		-7.9751	need           
	-8.5241	'in diner      		-7.9848	quality        
	-8.5241	'just          		-7.9878	delicious      




In [24]:
show_most_informative_features(vectorize, sentModel)

	-8.5241	$ 100          		-7.5129	restaurant     
	-8.5241	$ 6            		-7.5227	great          
	-8.5241	$ 7            		-7.5593	best           
	-8.5241	% 's           		-7.5680	food           
	-8.5241	% service      		-7.5740	wa             
	-8.5241	&              		-7.8170	amazing        
	-8.5241	& pop          		-7.8205	good           
	-8.5241	& restaurant   		-7.8904	's             
	-8.5241	'after         		-7.8941	place          
	-8.5241	'after reading 		-7.9047	nice           
	-8.5241	'after went    		-7.9104	service        
	-8.5241	'bill          		-7.9179	fresh          
	-8.5241	'bill gate     		-7.9212	friendly       
	-8.5241	'brown         		-7.9241	friend         
	-8.5241	'brown tofu    		-7.9364	always         
	-8.5241	'd 'more       		-7.9390	price          
	-8.5241	'friday        		-7.9726	like           
	-8.5241	'friday worse  		-7.9751	need           
	-8.5241	'in diner      		-7.9848	quality        
	-8.5241	'just          		-7.9878	delicious      
