In [171]:
import string
import nltk
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [82]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/heshamnawaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/heshamnawaz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/heshamnawaz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [83]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [84]:
lemmatizer = WordNetLemmatizer()

In [85]:
def preprocess(text):
	# Steps:
	# 1. lowercase
	# 2. Lammetize. (It does not stem. Try to preserve structure not to overwrap with potential acronym).
	# 3. Remove stop words.
	# 4. Remove punctuations.
	# 5. Remove character with the length size of 1.

	lowered = str.lower(text)

	stop_words = set(stopwords.words('english'))
	word_tokens = word_tokenize(lowered)

	words = []
	for w in word_tokens:
		if w not in stop_words:
			if w not in string.punctuation:
				if len(w) > 1:
					lemmatized = lemmatizer.lemmatize(w)
					words.append(lemmatized)

	return words

In [86]:
def calculate_jaccard(word_tokens1, word_tokens2):
	# Combine both tokens to find union.
	both_tokens = word_tokens1 + word_tokens2
	union = set(both_tokens)

	# Calculate intersection.
	intersection = set()
	for w in word_tokens1:
		if w in word_tokens2:
			intersection.add(w)

	jaccard_score = len(intersection)/len(union)
	return jaccard_score

In [87]:
reviews_1000_movies = pd.read_csv("reviews_1000_movies.csv")
reviews_1000_movies['processed_reviews'] = reviews_1000_movies['review_content'].apply(preprocess)

In [90]:
# For each review, calculate similarity with all other reviews, i < j
# Room for optimization (half the time) by not recalculating for already calculated pairs

In [121]:
def get_most_representative_review(reviews):
    '''
    reviews is a pandas dataframe
    '''
    similarity_dict = {review: [] for review in reviews.index}
    for i in range(len(reviews)):
        review1 = reviews.iloc[i]['processed_reviews']
        for j in range(len(reviews)):
            review2 = reviews.iloc[j]['processed_reviews']
            similarity_dict[reviews.index[i]].append(calculate_jaccard(review1, review2))
    
    average_similarity_dict = {k:np.sum(v)/len(v) for k,v in similarity_dict.items()}
    max_index = max(average_similarity_dict, key=average_similarity_dict.get)
    most_representative_review = reviews.loc[max_index]
    return most_representative_review


In [None]:
reviews0814255 = reviews_1000_movies[reviews_1000_movies['rotten_tomatoes_link'] == 'm/0814255']

In [92]:
get_most_representative_review(reviews0814255)

rotten_tomatoes_link                                            m/0814255
critic_name                                              Bill Goodykoontz
top_critic                                                           True
publisher_name                                           Arizona Republic
review_type                                                         Fresh
review_score                                                        3.5/5
review_date                                                    2010-02-10
review_content          Percy Jackson isn't a great movie, but it's a ...
processed_reviews       [percy, jackson, n't, great, movie, 's, good, ...
Name: 7, dtype: object

In [170]:
representative_reviews = []
for unique_id in reviews_1000_movies['rotten_tomatoes_link'].unique():
    reviews = reviews_1000_movies[reviews_1000_movies['rotten_tomatoes_link'] == unique_id]
    reviews['review_length'] = reviews.processed_reviews.apply(len)
    representative_reviews.append(get_most_representative_review(reviews[reviews['review_length'] != 0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['review_length'] = reviews.processed_reviews.apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['review_length'] = reviews.processed_reviews.apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['review_length'] = reviews.processed_reviews.apply(len)
A value is tr

In [172]:
pd.concat(representative_reviews, axis=1).T

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,processed_reviews,review_length
7,m/0814255,Bill Goodykoontz,True,Arizona Republic,Fresh,3.5/5,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...","[percy, jackson, n't, great, movie, 's, good, ...",17
247,m/0878835,Moira MacDonald,True,Seattle Times,Fresh,4/4,2010-06-18,Nicole Holofcener's lovely Please Give is a sm...,"[nicole, holofcener, 's, lovely, please, give,...",14
296,m/10,Roger Ebert,True,Chicago Sun-Times,Fresh,4/4,2004-10-23,Blake Edwards's 10 is perhaps the first comedy...,"[blake, edward, 's, 10, perhaps, first, comedy...",16
330,m/1000013-12_angry_men,Matthew Pejkovic,False,Matt's Movie Reviews,Fresh,4/5,2010-07-06,A brilliant courtroom drama whose strength lie...,"[brilliant, courtroom, drama, whose, strength,...",17
359,m/1000079-20000_leagues_under_the_sea,David Cornelius,False,eFilmCritic.com,Fresh,5/5,2006-07-03,"This is one of Disney's best, and one of the a...","[one, disney, 's, best, one, all-time, adventu...",10
...,...,...,...,...,...,...,...,...,...,...
67395,m/211,Noel Murray,True,Los Angeles Times,Rotten,,2018-06-07,Even the subset of Nicolas Cage fans who like ...,"[even, subset, nicolas, cage, fan, like, actor...",17
67513,m/21_and_over,Jennifer Heaton,False,Alternative Lens,Rotten,5.5/10,2018-11-01,It's like The Hangover...but in college. And t...,"['s, like, hangover, ..., college, 's]",6
67634,m/21_grams,Jeffrey Bruner,False,Des Moines Register,Fresh,4/5,2003-12-26,"A mind-bending, scrambled egg of a drama, 21 G...","[mind-bending, scrambled, egg, drama, 21, gram...",12
67880,m/21_jump_street_2011,R.L. Shaffer,False,IGN DVD,Fresh,8/10,2012-06-27,"Slick and funny as hell, 21 Jump Street is one...","[slick, funny, hell, 21, jump, street, one, be...",10
