In [1]:
import pandas as pd
import numpy as np
import os
import re
import sys
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import sys

In [2]:
# Load the train and test comment files, then stack them into a single dataframe.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../train_df_full.csv', '../test_df_full.csv')
)
# Row-wise concatenation (train rows first, then test rows); original indices are kept.
df_full = pd.concat([df_train, df_test])

In [3]:
#Text Preprocessing Functions
# Raw string literals keep the regex escapes (\., \d, \s, ...) intact without relying on
# Python passing unknown string escapes through, which triggers an invalid-escape warning.

# Punctuation characters and digit runs that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
# Double HTML line breaks, hyphens and slashes that are replaced by a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Clean every review in an iterable of strings.

    Each review is lower-cased, has every REPLACE_NO_SPACE match removed, and
    then has every REPLACE_WITH_SPACE match replaced by a single space.

    Args:
        reviews: iterable of review strings.

    Returns:
        list: the cleaned reviews, in input order.
    """
    cleaned_reviews = []
    for raw_line in reviews:
        # Deletion runs before space substitution, same order as for a single review.
        without_punctuation = REPLACE_NO_SPACE.sub(NO_SPACE, raw_line.lower())
        cleaned_reviews.append(REPLACE_WITH_SPACE.sub(SPACE, without_punctuation))
    return cleaned_reviews

def preprocess_model_reviews(review):
    """Clean a single review string.

    The text is lower-cased, every REPLACE_NO_SPACE match is removed, and every
    REPLACE_WITH_SPACE match is then replaced by a single space.

    Args:
        review: the review text as a string.

    Returns:
        str: the cleaned review.
    """
    lowered = review.lower()
    return REPLACE_WITH_SPACE.sub(SPACE, REPLACE_NO_SPACE.sub(NO_SPACE, lowered))

In [4]:
#Build CountVectorizer and Train Logistic Regression Model
N_REVIEWS_PER_SPLIT = 25000   # number of rows the positional labels below are built for
N_POSITIVE_PER_SPLIT = 12500  # the first this-many rows are labelled 1, the remainder 0
REGULARIZATION_C = 0.05       # C passed to both LogisticRegression models
RANDOM_STATE = 42             # fixed seed so the train/validation split is reproducible

reviews_train = list(df_train['content'])
reviews_test = list(df_test['content'])

# Labels are assigned purely by row position, so a file of unexpected length would
# otherwise fail later with a less helpful shape-mismatch error.
for split_name, split_reviews in (('train', reviews_train), ('test', reviews_test)):
    if len(split_reviews) != N_REVIEWS_PER_SPLIT:
        raise ValueError(
            "Expected %d %s reviews for positional labelling, found %d"
            % (N_REVIEWS_PER_SPLIT, split_name, len(split_reviews))
        )

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

# binary=True: each feature records whether a token occurs, not how many times.
# The vocabulary is learned from the training reviews only.
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)

X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

target = [1 if i < N_POSITIVE_PER_SPLIT else 0 for i in range(N_REVIEWS_PER_SPLIT)]
# NOTE(review): the test labels are assumed to follow the same positional layout as the
# training labels (this cell has always scored test predictions against `target`);
# confirm against how the test CSV was assembled.
target_test = list(target)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.80, random_state=RANDOM_STATE
)

# Model fitted on the 80% split, used only to report a validation accuracy.
train_model = LogisticRegression(C=REGULARIZATION_C)
train_model.fit(X_train, y_train)
print ("Validation Accuracy of model is: %s" % (accuracy_score(y_val, train_model.predict(X_val))))

# Final model fitted on all training reviews; this is the one used for scoring movies.
model = LogisticRegression(C=REGULARIZATION_C)
model.fit(X, target)
print ("Test Accuracy of model is: %s" % accuracy_score(target_test, model.predict(X_test)))



Validation Accuracy of model is: 0.8868
Test Accuracy of model is: 0.88128


In [5]:
#Function to calculate sentiment score for a given movie from all its comments
def get_sentiment_score(movie_id):
    """Score a movie from the predicted sentiment of its reviews.

    Selects the ``content`` of every ``df_train`` row whose ``movie_id`` equals
    ``str(movie_id)``, cleans each review with ``preprocess_model_reviews``,
    vectorizes them with the fitted ``cv`` and classifies them with ``model``.

    Args:
        movie_id: identifier of the movie; compared against the ``movie_id``
            column as a string.

    Returns:
        tuple: ``(movie_id, sentiment_score)``. The score is the number of
        reviews predicted non-zero (positive) divided by the number predicted
        0 (negative). When no review is predicted negative, the score is the
        positive count itself; when the movie has no reviews at all, it is 0.
    """
    matching_reviews = df_train.loc[df_train['movie_id'] == str(movie_id), 'content'].values
    review_list_clean = [preprocess_model_reviews(review) for review in matching_reviews]

    if not review_list_clean:
        # Nothing to classify: avoid handing an empty batch to the vectorizer/model and
        # give the movie the lowest possible score instead.
        print('There were no reviews for this movie, returning a score of 0!')
        sentiment_score = 0
    else:
        predictions = model.predict(cv.transform(review_list_clean))
        positive_count = np.count_nonzero(predictions)
        negative_count = np.count_nonzero(predictions == 0)

        # Explicit zero check instead of a bare `except:` around the division, so that
        # unrelated errors are no longer silently swallowed.
        if negative_count == 0:
            print('There were no negative reviews, returning number of positive reviews!')
            sentiment_score = positive_count
        else:
            sentiment_score = positive_count / negative_count

    print('The Sentiment Score is: ', sentiment_score)

    return (movie_id, sentiment_score)

In [6]:
#Reorder recommended movies by sentiment score
def reorder_recommended_movies(recommended_movie_id_list):
    """Rank recommended movies from highest to lowest sentiment score.

    Args:
        recommended_movie_id_list: iterable of movie ids; each one is scored
            with ``get_sentiment_score``.

    Returns:
        list: ``(movie_id, sentiment_score)`` tuples in descending score order.
    """
    scored_movies = [
        get_sentiment_score(candidate_id)
        for candidate_id in recommended_movie_id_list
    ]
    return sorted(scored_movies, key=lambda scored: scored[1], reverse=True)

In [7]:
# Test run: rank a small sample of recommended movie ids by sentiment score.
recommended = [
    'tt0100680',
    'tt0453418',
    'tt0177606',
    'tt0074223',
    'tt0042042',
]
# Bare last expression so the notebook displays the ranked (movie_id, score) list.
reorder_recommended_movies(recommended)

The Sentiment Score is:  5.666666666666667
There were no negative reviews, returning number of positive reviews!
The Sentiment Score is:  3
There were no negative reviews, returning number of positive reviews!
The Sentiment Score is:  5
There were no negative reviews, returning number of positive reviews!
The Sentiment Score is:  9
There were no negative reviews, returning number of positive reviews!
The Sentiment Score is:  6


[('tt0074223', 9),
 ('tt0042042', 6),
 ('tt0100680', 5.666666666666667),
 ('tt0177606', 5),
 ('tt0453418', 3)]