In [204]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing
import time
import textwrap
import pickle
import en_core_web_sm
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from scipy import spatial
from textblob import TextBlob, Word, Blobber
from scipy import spatial
from textblob import TextBlob, Word, Blobber

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer

import multiprocessing
cores = multiprocessing.cpu_count()


  from pandas import Panel


## Feature engineering of doc2vec vectors and sentiment

In [13]:
def cleanText(text):
    """
    Normalise a raw script string before doc2vec tokenisation.

    Steps, in order: parse ``text`` with BeautifulSoup's "lxml" parser and keep
    only its ``.text``, replace every literal ``|||`` with a single space,
    lowercase the result, and delete every ``x`` character.
    """
    markup_free = BeautifulSoup(text, "lxml").text
    without_pipes = re.sub(r'\|\|\|', r' ', markup_free)
    # NOTE(review): because this runs after lowercasing it strips *every*
    # 'x'/'X' (e.g. "exit" -> "eit"); presumably meant for redaction masks --
    # confirm the intent before relying on the cleaned text.
    return without_pipes.lower().replace('x', '')

In [173]:
def tokenize_text(text):
    """
    Split text into lowercase word tokens.

    The text is sentence-split and then word-tokenised with NLTK; tokens
    shorter than two characters are dropped.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

In [186]:
def vec_for_learning(model, tagged_docs):
    """
    Infer a doc2vec feature vector for every tagged document.

    Parameters
    ----------
    model : object exposing ``infer_vector(words, epochs=...)``
        (the notebook passes a trained gensim Doc2Vec model).
    tagged_docs : object whose ``.values`` iterates over documents having
        ``.tags`` and ``.words`` attributes.

    Returns
    -------
    (targets, regressors) : tuple of two tuples
        ``targets`` holds the first tag of each document and ``regressors``
        the vector inferred for it, in document order. Both are empty tuples
        when there are no documents.
    """
    targets, regressors = [], []
    for doc in tagged_docs.values:
        targets.append(doc.tags[0])
        # `epochs` is the supported name for the number of inference passes;
        # the older `steps` alias is rejected by gensim 4.
        regressors.append(model.infer_vector(doc.words, epochs=20))
    # Build the tuples explicitly so that empty input no longer fails to unpack.
    return tuple(targets), tuple(regressors)

In [174]:
def get_feature_vector(train_tagged, test_tagged, dm):
    """
    Train a Doc2Vec model on the training documents and infer feature vectors.

    Parameters
    ----------
    train_tagged, test_tagged : objects whose ``.values`` hold the tagged
        documents (the notebook passes pandas Series of TaggedDocument).
    dm : str
        ``'dbow'`` for distributed bag of words (``dm=0``) or ``'dmm'`` for
        distributed memory with averaging (``dm=1, dm_mean=1``).

    Returns
    -------
    X_train, X_test, y_train, y_test, model
        Inferred vectors and targets for both sets (as returned by
        ``vec_for_learning``) plus the trained model.

    Raises
    ------
    ValueError
        If ``dm`` is neither ``'dbow'`` nor ``'dmm'``.
    """
    # Fail early with a clear message instead of hitting an unbound `model`.
    if dm not in ('dbow', 'dmm'):
        raise ValueError("dm must be 'dbow' or 'dmm', got {!r}".format(dm))

    # Build model and vocabulary
    if dm == 'dbow':
        model = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
    else:
        model = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
    model.build_vocab([x for x in tqdm(train_tagged.values)])

    # Train model: one pass per iteration over a freshly shuffled corpus, with
    # the learning rate lowered by 0.002 after each pass.
    for epoch in range(30):
        model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
        # Clamp at a small positive floor: without it the 'dbow' model (which
        # starts from the library default alpha) reaches a negative learning
        # rate part-way through. The 'dmm' schedule (0.065 -> 0.005) never
        # touches the floor, so its behaviour is unchanged.
        model.alpha = max(model.alpha - 0.002, 0.0001)
        model.min_alpha = model.alpha

    # Build feature vectors
    y_train, X_train = vec_for_learning(model, train_tagged)
    y_test, X_test = vec_for_learning(model, test_tagged)

    # Return feature vectors
    return X_train, X_test, y_train, y_test, model

In [175]:
def get_cosine_dist(model, begin_text, end_text):
    """
    Cosine distance between the inferred vectors of two texts.

    Each text is split on whitespace and passed to ``model.infer_vector``
    (beginning first, then ending); the SciPy cosine distance of the two
    resulting vectors is returned.
    """
    vec_start, vec_finish = (
        model.infer_vector(passage.split())
        for passage in (begin_text, end_text)
    )
    return spatial.distance.cosine(vec_start, vec_finish)

In [176]:
def get_beginning(text):
    """
    Returns first third of text

    ``text`` is a string; the result is its first ``len(text) // 3``
    characters. Texts shorter than three characters yield an empty string
    (previously this raised IndexError).
    """
    third = len(text) // 3
    # A plain slice gives the same characters as chunking the whole text and
    # keeping the first chunk, without materialising every chunk.
    return text[:third]

In [213]:
def get_ending(text):
    """
    Returns last third of text

    ``text`` is a string. With ``third = len(text) // 3`` the result is the
    third consecutive block of ``third`` characters, i.e.
    ``text[2 * third:3 * third]``; up to two trailing characters are left out
    when the length is not a multiple of three. Texts shorter than three
    characters yield an empty string (previously this raised IndexError).
    """
    third = len(text) // 3
    # Slice out the third block directly instead of chunking the whole text.
    return text[2 * third:3 * third]

In [214]:
def get_sentiment_diff(begin, end):
    """
    Difference in sentiment polarity between the beginning and the ending.

    Returns the first element of TextBlob's ``sentiment`` for ``begin`` minus
    the same element for ``end``, as a float.
    """
    polarity_start = float(TextBlob(begin).sentiment[0])
    polarity_finish = float(TextBlob(end).sentiment[0])
    return polarity_start - polarity_finish

In [215]:
def make_features_df(indices, feature_vect):
    """
    Generate final feature df with flattened feature vect and split script features

    Parameters
    ----------
    indices : pandas Series of row labels, one per feature vector
        (the notebook passes ``train_index`` / ``test_index``).
    feature_vect : sequence of per-document feature vectors, positionally
        aligned with ``indices``.

    Returns
    -------
    DataFrame whose first columns are 'cos_dist', 'log_cos_dist' and
    'sentiment_diff', followed by one column per feature-vector component.

    Notes
    -----
    Reads the module-level ``split_script`` DataFrame, which must already
    contain those three metric columns.
    """
    # Pair each row label with its vector, then use the labels as the index.
    # Assumes `indices` is an unnamed Series so it lands in column 0 and the
    # vectors in column 1 -- TODO confirm.
    indexed_feature_vect = pd.concat([indices, pd.Series(feature_vect)], axis=1).set_index(0)
    # Join on the index with the per-script metrics and keep the vector column
    # (label 1) plus the three metric columns.
    merged_df = pd.merge(indexed_feature_vect,split_script,left_index=True,right_index=True)[[1,'cos_dist','log_cos_dist','sentiment_diff']]
    # Round-trip through records so the index becomes an ordinary first column.
    merged_df = pd.DataFrame(merged_df.to_records())
    merged_df.columns = ['index','feature_vect','cos_dist','log_cos_dist','sentiment_diff']
    merged_df = merged_df.drop('index',axis=1)
    merged_df['feature_list'] = merged_df.feature_vect.tolist()
    # Expand each vector into one column per component and place the metric
    # columns in front of them.
    final_df = pd.concat([merged_df[['cos_dist','log_cos_dist','sentiment_diff']],merged_df.feature_list.apply(pd.Series)], axis=1)
    return final_df

In [216]:
# NLTK's English stop-word list, held as a set; preprocess() tests tokens against it.
stop_words = set(stopwords.words('english'))
# Single English Snowball stemmer instance shared by lemmatize_stemming().
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    '''
    Reduce a single token to its stem.

    The token is first lemmatized as a verb (WordNet, pos='v'); the lemma is
    then passed through the module-level Snowball ``stemmer``.
    '''
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def _load_spacy_pipeline():
    """Return the en_core_web_sm pipeline, loading it only on first use."""
    if _load_spacy_pipeline.cached is None:
        _load_spacy_pipeline.cached = en_core_web_sm.load()
    return _load_spacy_pipeline.cached


# Cache slot for the loaded pipeline; filled by the first call above.
_load_spacy_pipeline.cached = None


def preprocess(text):
    """
    Returns text, with names and stop words filtered out

    The text is tokenised with ``gensim.utils.simple_preprocess``. A token is
    kept when it is not a stop word, is longer than two characters and is not
    part of a PERSON entity found by spaCy; kept tokens are passed through
    ``lemmatize_stemming``.

    Parameters
    ----------
    text : str

    Returns
    -------
    list of str
        The processed tokens, in order of appearance.
    """
    # Loading the spaCy model is expensive, so reuse one instance across calls
    # instead of reloading it for every document.
    doc = _load_spacy_pipeline()(text)

    # Tokenise entity texts the same way as the document so that every word of
    # a multi-word name ("John Smith") is filtered, not only single-word names.
    name_tokens = set()
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            name_tokens.update(gensim.utils.simple_preprocess(ent.text))

    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 2 and token not in name_tokens:
            result.append(lemmatize_stemming(token))
    return result

In [217]:
def get_lda_model(bow_corpus, num_topics, dictionary, processed_docs, workers=13):
    """
    Returns LDA model and model metrics

    Parameters
    ----------
    bow_corpus : bag-of-words corpus the model is trained and scored on.
    num_topics : int
        Number of topics to fit.
    dictionary : id-to-word mapping used both by the LDA model (``id2word``)
        and by the coherence computation.
    processed_docs : tokenised documents passed as ``texts`` to the coherence
        model.
    workers : int, optional
        Number of worker processes for ``LdaMulticore`` (default 13, the value
        that used to be hard-coded).

    Returns
    -------
    (lda_model, coherence, perplexity)
        The fitted model, its 'c_v' coherence, and the value of
        ``lda_model.log_perplexity(bow_corpus)``.
    """
    # Use the arguments rather than the notebook globals train_bow_corpus /
    # train_dict / train_processed_docs, so the function works for any corpus.
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=num_topics,
                                           id2word=dictionary,
                                           passes=10,
                                           eta=0.01,
                                           workers=workers)
    # CoherenceModel is reached through the already-imported gensim package;
    # the bare name was never imported in this notebook.
    coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    perplexity = lda_model.log_perplexity(bow_corpus)
    return lda_model, coherence, perplexity

In [218]:
def get_vecs(X, lda_model, bow_corpus, num_topics):
    """
    Build one topic-probability vector per document.

    For each position ``0 .. len(X) - 1`` the matching entry of ``bow_corpus``
    is passed to ``lda_model.get_document_topics`` (with
    ``minimum_probability=0.0``), and the second element of the first
    ``num_topics`` returned pairs forms that document's vector. ``X`` is only
    used for its length.
    """
    def _topic_probabilities(doc_position):
        distribution = lda_model.get_document_topics(
            bow_corpus[doc_position], minimum_probability=0.0
        )
        return [distribution[topic_id][1] for topic_id in range(num_topics)]

    return [_topic_probabilities(doc_position) for doc_position in range(len(X))]

In [219]:
def get_bigram(df):
    """
    Return the bigram-transformed documents for the data in ``df``.

    Overwrites ``df['text']`` with ``strip_newline(df.text)``, tokenises it
    with ``sent_to_words``, filters the tokens with ``remove_stopwords`` and
    indexes the object returned by ``bigrams`` with each document.

    NOTE(review): strip_newline, sent_to_words, remove_stopwords and bigrams
    are not defined in the visible part of this notebook -- confirm they exist
    before calling this.
    """
    df['text'] = strip_newline(df.text)
    documents = remove_stopwords(list(sent_to_words(df.text)))
    bigram_model = bigrams(documents)
    return [bigram_model[document] for document in documents]

Read in data.

In [209]:
# Load the cleaned movie dataset, keeping only the script text and its genre
# label; the label column is exposed under the shorter name 'Genres'.
data = (
    pd.read_csv('cleaned_moviedataset.csv')[['Scripts', 'new_genres']]
    .rename(columns={'new_genres': 'Genres'})
)

Clean text for processing.

In [182]:
# Run every script through cleanText() and store the result back in place.
data['Scripts'] = data['Scripts'].apply(cleanText)

Split data into test and train, and save index values.

In [183]:
# Stratified 80/20 split on genre with a fixed seed; the original row labels of
# each side are kept so features can be re-joined to the scripts later.
train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data.Genres)
train_index, test_index = pd.Series(train.index), pd.Series(test.index)

# Persist both label series.
for pickle_path, row_labels in (('train_index.pickle', train_index),
                                ('test_index.pickle', test_index)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(row_labels, f)

Tag data for doc2vec.

In [184]:
def _to_tagged_document(row):
    """Wrap one row as a TaggedDocument: tokenised script as words, genre as the only tag."""
    return TaggedDocument(words=tokenize_text(row['Scripts']), tags=[row.Genres])


train_tagged_scripts = train.apply(_to_tagged_document, axis=1)
test_tagged_scripts = test.apply(_to_tagged_document, axis=1)

Train model and extract feature vectors.

(By default the notebook loads these objects from pickle files, because regenerating them takes a long time.)

In [187]:
#X_train_scripts_dmm,X_test_scripts_dmm,y_train_scripts_dmm,y_test_scripts_dmm, model_scripts_dmm = get_feature_vector(train_tagged_scripts, test_tagged_scripts, "dmm")

100%|██████████| 2640/2640 [00:00<00:00, 1653668.24it/s]
100%|██████████| 2640/2640 [00:00<00:00, 2047893.94it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1726643.16it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1884759.58it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1939902.34it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1578244.38it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1682565.35it/s]
100%|██████████| 2640/2640 [00:00<00:00, 2002344.04it/s]
100%|██████████| 2640/2640 [00:00<00:00, 2025418.43it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1903224.92it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1936509.72it/s]
100%|██████████| 2640/2640 [00:00<00:00, 2009247.43it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1759009.14it/s]
100%|██████████| 2640/2640 [00:00<00:00, 2020244.95it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1170874.76it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1241502.70it/s]
100%|██████████| 2640/2640 [00:00<00:00, 1998729.70it/s]
100%|██████████| 2640/2640 [00:

In [127]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load artefacts that were produced by a trusted run of this project.
    with open(path, "rb") as f:
        return pickle.load(f)


# Pre-computed doc2vec ('dmm') feature vectors and targets for both splits.
X_train_scripts_dmm = _load_pickle("X_train_dmm.pickle")
y_train_scripts_dmm = _load_pickle("y_train_dmm.pickle")
X_test_scripts_dmm = _load_pickle("X_test_dmm.pickle")
y_test_scripts_dmm = _load_pickle("y_test_dmm.pickle")

# The trained doc2vec model itself, used below to infer vectors for script parts.
dmm_model = _load_pickle('model_scripts_dmm.pickle')

Calculate topic change as the cosine distance between the doc2vec feature vectors of a movie's beginning (first third) and its ending (last third).

In [189]:
# Force every script to a plain string before it is sliced into parts.
data['Scripts'] = data['Scripts'].astype(str)

In [190]:
# Per-script metrics, indexed like `data`.
split_script = pd.DataFrame()

# Beginning and ending parts of every script.
split_script['beginning'] = data.Scripts.map(get_beginning)
split_script['ending'] = data.Scripts.map(get_ending)

# Cosine distance between the doc2vec vectors of the two parts, and its log.
split_script['cos_dist'] = split_script.apply(
    lambda row: get_cosine_dist(dmm_model, row.beginning, row.ending),
    axis=1,
)
split_script['log_cos_dist'] = np.log(split_script['cos_dist'])

Calculate sentiment change as difference of sentiment polarity.

In [191]:
# Polarity of the beginning minus polarity of the ending, per script.
split_script['sentiment_diff'] = split_script.apply(
    lambda row: get_sentiment_diff(row.beginning, row.ending),
    axis=1,
)

Merge the split-script metrics with the train and test feature vectors to form the final train and test feature sets.

In [192]:
# The raw text parts are no longer needed once the metrics are computed.
split_script = split_script.drop(columns=['beginning', 'ending'])

In [193]:
# Combine the loaded doc2vec vectors with the per-script metrics in split_script.
# NOTE(review): the indices come from the split made in this session while the
# vectors come from pickle files; this presumably relies on both originating
# from the same split -- confirm.
X_train = make_features_df(train_index, X_train_scripts_dmm)
X_test = make_features_df(test_index, X_test_scripts_dmm)

Assign targets for train and test.

In [194]:
# The targets are the labels that were loaded alongside the doc2vec vectors.
y_train = y_train_scripts_dmm
y_test = y_test_scripts_dmm

Pickle data for modeling.

In [None]:
# Writing the modelling inputs is opt-in, so a plain "Run All" never overwrites
# existing pickles. Set the flag to True to (re)generate the files.
SAVE_MODEL_INPUTS = False

if SAVE_MODEL_INPUTS:
    for pickle_path, payload in (('X_train.pickle', X_train),
                                 ('X_test.pickle', X_test),
                                 ('y_train.pickle', y_train),
                                 ('y_test.pickle', y_test)):
        with open(pickle_path, 'wb') as f:
            pickle.dump(payload, f)