In [23]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from random import shuffle
import pandas as pd
import pickle
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [16]:
def loadingDataset():
    data = pd.read_csv('Movie Dataset.csv').sample(n=1000)
    data.dropna(inplace=True)
    return data

In [5]:
eng_stop_words = stopwords.words('english')
symbol = punctuation
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

In [6]:
def get_label(tag):
    if tag == 'jj':
        return 'a'
    elif tag in ['nn','rb','vb']:
        return tag[0]
    else:
        return None

In [7]:
def lemmatizing(word_list):
    lemmatized_list = []
    tagging = pos_tag(word_list)
    for word, tag in tagging:
        label = get_label(tag)
        if label!=None:
            lemmatized_list.append(lemmatizer.lemmatize(word,label))
        else:
            lemmatized_list.append(lemmatizer.lemmatize(word))
    return lemmatized_list

In [15]:
def preprocessed_text(data): 
    reviews = data['review'].to_list()
    sentiments = data['sentimentScore'].to_list()

    word_list = []

    for sentence in reviews:
        words = word_tokenize(sentence)
        for word in words:
            word_list.append(word.lower())

    word_list = [word.lower() for word in word_list if word not in eng_stop_words and word not in symbol and word.isalpha()]
    word_list = lemmatizing(word_list)

    feature_set = []

    labeled_list = list(zip(reviews,sentiments))

    for sentence, label in labeled_list:
        words = []
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word not in eng_stop_words and word not in symbol and word.isalpha()]
        words = lemmatizing(words)

        feature = {}

        for word in words:
            feature[word] = (word in word_list)
        feature_set.append((feature,label))

    return feature_set


In [20]:
def modelling():
    data = loadingDataset()
    feature_set = preprocessed_text(data)
    shuffle(feature_set)
    count = int(0.8*len(feature_set))
    train_data = feature_set[:count]
    test_data = feature_set[count:]

    model = NaiveBayesClassifier.train(train_data)
    file = open('model.pickle','wb')
    pickle.dump(model,file)
    file.close()

    print(model.show_most_informative_features(n=10))
    print(accuracy(model,test_data))
    return model

In [21]:
def checkModel():
    try:
        file = open('model.pickle','rb')
        model = pickle.load(file)
        file.close()
    except:
        model = modelling()

    return model

In [22]:
def writeReview(model):
    while True:
        review = input('Write your review: ')
        reviews = word_tokenize(review)
        if(len(reviews)>20):
            break
    category = model.classify(FreqDist(reviews))
    return review, category

In [47]:
def movieRecommendation(data,user_review):
    vectorizer = CountVectorizer(ngram_range=(1,3))
    review = list(data['review'])
    data_vec = vectorizer.fit_transform(review)
    user_vec = vectorizer.transform([user_review])

    similarity  = cosine_similarity(data_vec,user_vec).flatten()
    top_indices = similarity.argsort()[-2:][::-1]
    print(top_indices)
    counter = 1
    for index in top_indices:
        print(f"{counter}: {data.iloc[index]['title']}")
        counter+=1 

In [46]:
def NER(data):
    nlp = spacy.load('en_core_web_sm')

    category = {}

    for review in data['review']:
        doc = nlp(review)
        
        for ent in doc.ents:
            if ent.label_ not in category:
                category[ent.label_] = []
            category[ent.label_].append(ent.text)

    for label, text in category.items():
        if label == 'LANGUAGE' or label =='LOC':
            print(f"{label}: {', '.join(text)}")

In [48]:
user_review = 0
category = 0

model = checkModel()
data = loadingDataset()

while True:
    print('MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS')
    print(f"YOUR REVIEW: {'NO REVIEW' if user_review == 0 else user_review}")
    print(f"YOUR REVIEW CATEGORY: {'UNKNOWN' if category == 0 else category}")
    print('1. WRITE YOUR REVIEW')
    print('2. VIEW MOVIE RECOMMENDATION')
    print('3. VIEW NAMED ENTITIES RECOGNITION')
    print('4. EXIT')
    option = int(input('>> '))

    if option == 1:
        user_review, category = writeReview(model)
    elif option == 2:
        movieRecommendation(data,user_review)
    elif option == 3:
        NER(data)

MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: NO REVIEW
YOUR REVIEW CATEGORY: UNKNOWN
1. WRITE YOUR REVIEW
2. VIEW MOVIE RECOMMENDATION
3. VIEW NAMED ENTITIES RECOGNITION
4. EXIT
MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: The performances, particularly by Michelle Yeoh, are stellar, driving home the film's emotional core amidst its kaleidoscopic visuals. It's a must-watch for fans of innovative storytelling.
YOUR REVIEW CATEGORY: NEGATIVE
1. WRITE YOUR REVIEW
2. VIEW MOVIE RECOMMENDATION
3. VIEW NAMED ENTITIES RECOGNITION
4. EXIT
[677  68]
1: Ghost in the Shell
2: Austin Powers: International Man of Mystery
MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: The performances, particularly by Michelle Yeoh, are stellar, driving home the film's emotional core amidst its kaleidoscopic visuals. It's a must-watch for fans of innovative storytelling.
YOUR REVIEW CATEGORY: NEGATIVE
1. WRITE YOUR REVIEW
2. VIEW MOVIE RECOMMENDATION
3. VIEW NAMED 

ValueError: invalid literal for int() with base 10: ''