In [7]:

 
# ---------------------------------------------------------------------------------------
# CS 175, WINTER 2021: ASSIGNMENT 1
#
# The goal of this assignment is to give you some practice with manipulating text data, 
# including tokenization and creation of a sparse bag-of-words array for a set of documents,
# as well as building a simple logistic classifier and looking at which words get large
# positive and negative weights on a dataset of 20,000 reviews from Yelp
# 
# You should install Anaconda with Python 3.7 or above before starting this assignment
#
# General notes
#	- do not remove any code that is already provided: add your own code where appropriate
#	- add comments inline in the text to explain what you are doing 
#	- feel free to add error checking if you wish (but we will not grade you on this) 
#	- when you are done submit a copy of your edited version of this file, as Assignment1.py
#   - be sure to test your code on some simple examples before you submit it
#
# Grading
#   - problems 1 through 6 are each worth 20 points
#   - points will be deducted if 
# 		- the code does not execute 
#       - the code does not return the correct answers on simple test cases, 
#		- if the code is not general and only works on special cases, 
#		- if there are very few or no comments.
# 
# Submission
#	- your edited copy of assignment1.py
#	- a text file called assignment1.txt with the output from run_assignment1.py
#	- submit both files to Canvas
# ---------------------------------------------------------------------------------------


# NOTE: for this assignment you will need to import the following libraries/modules
# All of these should be installed on your system if you have the latest version of Anaconda installed
import nltk 
from nltk import word_tokenize
import simplejson as jsoncm
import sklearn
from sklearn.feature_extraction.text import * 
from sklearn.model_selection import train_test_split 

from sklearn import linear_model 
from sklearn import metrics

import numpy as np
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------------------
# PROBLEM 1
# Complete the definition of the function below so that it can take in a string
# and return the percentage of alphabetical characters in a string that match a particular
# alphabetical letter, where matching is not case-sensitive. The percentage is computed
# relative to the number of alphabetic characters in the string (so numbers, punctuation,
# white space, and all other non-alphabetic characters are ignored).
#
# NOTE: please read Section 3 of Chapter 1 in the NLTK online book to understand how to use the FreqDist method 
# ---------------------------------------------------------------------------------------
def letter_percentage(text,letter):
    """
    Compute the percentage of alphabetic characters in <text> that match <letter>.

    Matching is not case-sensitive. Digits, punctuation, white space and all
    other non-alphabetic characters are ignored (they count neither as matches
    nor towards the total). The result is also printed to the screen.

    Parameters:
    text: string
    letter: a single alphabetical character (e.g., 'a', 'b', ...); an upper case
          letter is accepted and treated the same as its lower case form

    Returns:
    percentage: the percentage of alphabetical characters in <text> that match <letter>
          where a match is defined irrespective of the case of the characters in <text>;
          0.0 if <text> contains no alphabetical characters

    Example:
    letter_percentage('This is a cat.','t')  returns 20.0
    """

    # extract a list of alphabetic characters and convert to lower case
    # (isalpha, not isalnum: digits must not be counted in the total)
    charlist = [ch for ch in text.lower() if ch.isalpha()]

    # create an fdist object for the list of lower case characters
    fdist = nltk.probability.FreqDist(charlist)

    # calculate the frequency of the specific character "letter";
    # a text with no alphabetic characters has frequency 0 (avoids dividing by zero)
    if charlist:
        frequency = fdist[letter.lower()]/len(charlist)
    else:
        frequency = 0.0

    # convert the frequency to a percentage
    character_percent = 100*frequency
    p = '{0:.2f}'.format(character_percent)
    print('\nPercentage = ',p,' of characters match the character',letter)
    return character_percent


# ---------------------------------------------------------------------------------------
# PROBLEM 2
# Complete the definition of the function below so that it can take as input either
# (a) a string or (b) a list of tokens of type nltk.text.Text. The function should
# convert the string to word tokens, run the NLTK part of speech parser on the word tokens
# using the 'universal' tagset, print out to the screen the percentage of tokens in the
# text that correspond to each type of tag, and return a list of pairs of tokens and tags.
# 
# NOTE:
# 	- please read Section 1 of Chapter 5 in the NLTK online book for information about part of speech tagging
# 	- for word tokenization use the NLTK word_tokenize function with default settings
#  	- Print out the tags in order of decreasing frequency of occurrence
#	- Percentages printed out should be formatted to 2 decimal places of precision
# ---------------------------------------------------------------------------------------
def parts_of_speech(s):
    """
    Tag the tokens of <s> with the NLTK part of speech tagger ('universal' tagset)
    and print the percentage of tokens that received each tag.

    Parameters:
    s: input text, either as a string (which is tokenized with word_tokenize)
       or as an already tokenized sequence of tokens (e.g., nltk.text.Text)

    Returns:
    The list of (token, tag) pairs produced by nltk.pos_tag.
    Prints out the total number of tokens and percentage of tokens with each tag,
    in order of decreasing tag frequency.

    Example:
    s = 'This is a sentence. And this is a second sentence! Cool.'
    tokens_and_tags = parts_of_speech(s)
        Total number of tokens is 14
        Tag: DET           Percentage of tokens =  28.57
        Tag: .             Percentage of tokens =  21.43
        Tag: NOUN          Percentage of tokens =  21.43
        ....
    """

    # tokenize the string into word tokens; input that is not a string is
    # taken to be a sequence of tokens already and is used as it is
    if isinstance(s, str):
        tokens = word_tokenize(s)
    else:
        tokens = list(s)

    # extract POS tags using the universal tagset with the NLTK POS tagger
    tokens_and_tags = nltk.pos_tag(tokens,tagset = 'universal')

    # Compute and print the total number of tokens
    n = len(tokens)
    print('Total number of tokens is',n)

    # count how often each of the tags occurs using FreqDist (from NLTK)
    tag_counts = nltk.probability.FreqDist(tag for _, tag in tokens_and_tags)

    # print out each tag and the percentage of tokens associated with it;
    # most_common() yields (tag, count) pairs in descending order of count
    for tag, count in tag_counts.most_common():
        p = '{0:.2f}'.format(100 * count/n)
        print('Tag:',tag,'\t   Percentage of tokens = ', p )

    return tokens_and_tags
	
	

# ---------------------------------------------------------------------------------------
# PROBLEM 3
# Complete the definition of the function below so that it 
# - reads input from the specified filename, e.g., yelp_reviews.json (using json.load)
# - extracts the text of the kth review
# - runs the parts_of_speech function (Problem 2) to compute the percentages of tokens for each part of speech
# ---------------------------------------------------------------------------------------
def review_pos(k,filename):
    """
    Load the reviews stored in a JSON file and print the part of speech
    percentages for the text of the kth review.

    Parameters:
    k: position of the review to analyze, counted from 1 (k = 1 is the first review)
    filename: name of a JSON file holding a list of reviews, where each review
          has a 'text' entry

    Returns:
    The list of (token, tag) pairs returned by parts_of_speech for the kth review

    Raises:
    IndexError if k is smaller than 1 or larger than the number of reviews
    """

    print('\nLoading the file: \n', filename)
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(filename, 'r', encoding='utf-8') as jfile:
        data = jsoncm.load(jfile)
    print('\nTotal number of reviews extracted =', len(data))

    # without this check k = 0 or a negative k would silently wrap around
    # and select a review from the end of the list
    if not 1 <= k <= len(data):
        raise IndexError('k must be between 1 and ' + str(len(data)) + ', got ' + str(k))

    print('\nComputing the percentages for each part-of-speech for review',k)

    d = data[k-1]  # the kth review sits at index k-1 (list indices start at 0)
    s = d['text']  # extract text string associated with kth review
    print('Text from review ',k, ' is:')
    print(s)
    return parts_of_speech(s)
		

# ---------------------------------------------------------------------------------------
# PROBLEM 4
# Create a bag of words (BOW) representation from text documents, using the Vectorizer function in scikit-learn
#
# The inputs are 
#  - a filename (you will use "yelp_reviews.json") containing the reviews in JSON format 
#  - the min_pos and max_neg parameters
#  - we label all reviews with scores >= min_pos (default 4) as "1"  
#  - we label all reviews with scores <= max_neg (default 2) as "0" 
#  - this creates a simple set of labels for binary classification, ignoring the neutral (score = 3) reviews
# 
#  The function extracts the text and scores for each review from the JSON data
#  It then tokenizes and creates a sparse bag-of-words array using scikit-learn vectorizer function
#  The number of rows in the array is the number of reviews with scores <=2 or >=4
#  The number of columns in the array is the number of terms in the vocabulary
#
#  NOTE: 
#  - please read the scikit-learn tutorial on text feature extraction before you start this problem:
#     https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction  
#  - in this function we will use scikit-learn tokenization (rather than NLTK)
# ---------------------------------------------------------------------------------------
def create_bow_from_reviews(filename, min_pos=4, max_neg=2):
    """
    Build a bag-of-words array and binary labels from a JSON file of reviews.

    Parameters:
    filename: name of a JSON file holding a list of reviews, where each review
          has a 'text' entry and a 'stars' entry
    min_pos: reviews with stars >= min_pos are labeled 1
    max_neg: reviews with stars <= max_neg are labeled 0
          (reviews with stars between max_neg and min_pos are left out)

    Returns:
    X: the bag-of-words array returned by the vectorizer, one row per labeled review
    Y: list with the 0/1 label of each labeled review, in the same order as the rows of X
    vectorizer: the fitted CountVectorizer that produced X
    """

    print('\nLoading the file: \n', filename)
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(filename, 'r', encoding='utf-8') as jfile:
        data = jsoncm.load(jfile)
    print('\nTotal number of reviews extracted =', len(data) )

    text = []
    Y = []
    print('\nExtracting tokens from each review.....(can be slow for a large number of reviews)......')
    for d in data:  # can substitute data[0:9] here if you want to test this function on just a few reviews
        stars = d['stars']
        # generate a binary score for each review; neutral reviews are skipped
        if stars >= min_pos:
            score = 1
        elif stars <= max_neg:
            score = 0
        else:
            continue
        text.append(d['text'])    # keep only the text and label
        Y.append(score)

    # create an instance of a CountVectorizer, using
    # (1) the standard 'english' stopword set
    # (2) only keeping terms in the vocabulary that occur in at least 1% of documents
    # (3) allowing both unigrams and bigrams in the vocabulary (use "ngram_range=(1,2)" to do this)
    vectorizer = CountVectorizer(stop_words='english',min_df=0.01,ngram_range=(1,2))

    # create a sparse BOW array from 'text' using vectorizer
    X = vectorizer.fit_transform(text)

    # an alternative above would be to use TfIDF rather than counts - which is very simple to do (but not needed here)

    print('Data shape: ', X.shape)

    # you can uncomment this next line if you want to see the full list of tokens in the vocabulary
    #print('Vocabulary: ', vectorizer.get_feature_names())

    return X, Y, vectorizer
		 
		 
		 
# ---------------------------------------------------------------------------------------
# PROBLEM 5
#  Separate an X,Y dataset (X=features, Y=labels) into training and test subsets
#  Build a logistic classifier on the training subset
#  Evaluate performance on the test subset  
#
#  NOTE: before starting this problem please read the scikit-learn documentation on logistic classifiers:
#		https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
# ---------------------------------------------------------------------------------------		 
def logistic_classification(X, Y, test_fraction):
    """
    Split (X, Y) into training and test subsets, fit a logistic regression
    classifier on the training subset and print its accuracy on both subsets
    together with the AUC on the test subset.

    Parameters:
    X: array of features with one row per example
    Y: labels, one per row of X
    test_fraction: passed to train_test_split as test_size, i.e. the share of
          the examples that is held out for testing

    Returns:
    classifier: the fitted scikit-learn LogisticRegression object
    """

    # set the state of the random number generator so that we get the same results across runs when testing our code
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)

    print('Number of training examples: ', X_train.shape[0])
    print('Number of testing examples: ', X_test.shape[0])
    print('Vocabulary size: ', X_train.shape[1])

    # Specify the logistic classifier model with an l2 penalty for regularization and with fit_intercept turned on
    classifier = linear_model.LogisticRegression(penalty='l2', fit_intercept=True)

    # Train a logistic regression classifier and evaluate accuracy on the training data
    # (score() makes its own predictions, so no separate predict() call is needed)
    print('\nTraining a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    train_accuracy = classifier.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    # the AUC is computed from the predicted probability of the second class (column 1)
    class_probabilities = classifier.predict_proba(X_test)
    test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1])
    print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return classifier
    

# ---------------------------------------------------------------------------------------
# PROBLEM 6
#   Takes as input
#     (1) a scikit-learn trained logistic regression classifier (e.g., trained in Problem 5) 
#     (2) a scikit-learn vectorizer object that produced the BOW features for the classifier
#   and prints out and returns
#   - the K terms in the vocabulary tokens with the largest positive weights  
#   - the K terms in the vocabulary with the largest negative weights 
#
# To write this code you will need to read the documentation for the logistic regression model 
# in scikit-learn to figure out how to access the classifier's weights and corresponding tokens
# ---------------------------------------------------------------------------------------				

def most_significant_terms(classifier, vectorizer, K):
    """
    Print and return the K vocabulary terms with the largest weights and the
    K terms with the smallest (most negative) weights of a trained classifier.

    Parameters:
    classifier: trained classifier whose weights are read from classifier.coef_[0]
    vectorizer: vectorizer that produced the features; its feature names are
          paired with the weights position by position
    K: number of terms to report at each end of the ranking (0 gives empty lists;
          if K exceeds the vocabulary size all terms are reported)

    Returns:
    (topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms)
          four lists; the positive lists start with the largest weight and the
          negative lists start with the most negative weight

    Raises:
    ValueError if K is negative
    """
    if K < 0:
        raise ValueError('K must be non-negative')

    # current scikit-learn versions provide get_feature_names_out (the older
    # get_feature_names was removed); use whichever the vectorizer offers
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    terms = get_names()
    weights = classifier.coef_[0]

    # rank the (term, weight) pairs from the largest weight to the smallest
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)

    top_pos = ranked[:K]
    # reverse before slicing: ranked[-K:] would return the whole list for K = 0
    top_neg = ranked[::-1][:K]

    topK_pos_terms = [term for term, _ in top_pos]
    topK_pos_weights = [w for _, w in top_pos]
    topK_neg_terms = [term for term, _ in top_neg]
    topK_neg_weights = [w for _, w in top_neg]

    # one line per term: the term followed by its weight to 3 decimal places
    print('top', K, 'word tokens with the largest positive weights:')
    for term, w in top_pos:
        print(term, ' : ', format(w, '.3f'))
    print('\ntop', K, 'word tokens with the largest negative weights:')
    for term, w in top_neg:
        print(term, ' : ', format(w, '.3f'))

    return(topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms)



In [8]:
# Driver cell: runs each of the functions defined above, first on short
# hand-written sentences and then on the reviews in yelp_reviews.json.

# test the letter_percentage function with simple input
letter_percentage('This is a sentence about a large dog.','t') 
# test the parts_of_speech function with simple input
parts_of_speech('This is a very simple test sentence to test the part of speech function in NLTK.')


# load yelp reviews and compute percentages of parts of speech for the Kth review
K = 2
review_pos(K,'yelp_reviews.json') 

# read in the review text and tokenize the text in each review
X, Y , vectorizer_BOW = create_bow_from_reviews('yelp_reviews.json')   


# run a logistic classifier on the reviews, specifying the fraction to be used for testing  
# NOTE(review): 0.8 is passed on as the test share, so the model is trained on the
# remaining 20% of the reviews - confirm that this split is intended
test_fraction = 0.8
logistic_classifier = logistic_classification(X, Y,test_fraction)  


# print out and return the most significant positive and negative weights (and associated terms) 
# (the K=10 here is a keyword argument and does not change the variable K set above)
most_significant_terms(logistic_classifier, vectorizer_BOW, K=10)


Percentage =  10.34  of characters match the character t
Total number of tokens is 17
Tag: NOUN 	   Percentage of tokens =  35.29
Tag: DET 	   Percentage of tokens =  17.65
Tag: VERB 	   Percentage of tokens =  11.76
Tag: ADP 	   Percentage of tokens =  11.76
Tag: ADV 	   Percentage of tokens =  5.88
Tag: ADJ 	   Percentage of tokens =  5.88
Tag: PRT 	   Percentage of tokens =  5.88
Tag: . 	   Percentage of tokens =  5.88

Loading the file: 
 yelp_reviews.json

Total number of reviews extracted = 20000

Computing the percentages for each part-of-speech for review 2
Text from review  2  is:
If you need an inexpensive place to stay for a night or two then you may consider this place but for a longer stay I'd recommend somewhere with better amenities. 

Pros:
Great location- you're right by the train station, central location to get to old town and new town, and right by sight seeing his tours. Food, bars, and shopping all within walking distance. Location, location, location.
Very clean

([2.114320916794638,
  2.0337550111611553,
  1.8554659278672017,
  1.8489143944834998,
  1.691340223125593,
  1.5373060421314768,
  1.5139529722362448,
  1.5118529862421104,
  1.447075356955066,
  1.3667725492218181],
 [-2.469812488279568,
  -2.090375478490818,
  -1.8383668628682437,
  -1.7899077954520253,
  -1.7780969145833994,
  -1.617362442432416,
  -1.5325835676573403,
  -1.4893455503955593,
  -1.4307321070268648,
  -1.3917856100701127],
 ['amazing',
  'awesome',
  'delicious',
  'excellent',
  'friendly',
  'great',
  'fantastic',
  'fast',
  'definitely',
  'world'],
 ['worst',
  'horrible',
  'maybe',
  'dirty',
  'ok',
  'terrible',
  'bland',
  'slow',
  'rude',
  'stay'])