In [1]:
import pandas
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

data = pandas.read_csv('wiki_plots.csv')
pandas.set_option('display.max_columns', 25)
data

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....


In [2]:
#Finding the most commonly occuring genre for use as a baseline
data['Genre'].value_counts()

unknown                                                          6083
drama                                                            5964
comedy                                                           4379
horror                                                           1167
action                                                           1098
thriller                                                          966
romance                                                           923
western                                                           865
crime                                                             568
adventure                                                         526
musical                                                           467
crime drama                                                       464
romantic comedy                                                   461
science fiction                                                   418
film noir           

In [3]:
#The function I will be using to clean all of the plot text
def cleaning(raw):
    rev_soup = BeautifulSoup(raw).get_text() 
    letters_only = re.sub("[^a-zA-Z]"," ", rev_soup)
    lower_case = letters_only.lower()
    words = lower_case.split()
    stops = stopwords.words("english")
    return(" ".join(words))

In [4]:
#Double checking how many movies there actually are to analyze
num_plots = data["Plot"].size
num_plots

34886

In [5]:
#Sending every movie plot in the dataset through the cleaning function for use in models
#and looking at one review to see if the function worked as intended
clean_plots = []

for i in range(0, num_plots):
    clean_plots.append(cleaning(data["Plot"][i]))
    
clean_plots[1000]

'loretta young plays the part of an orphan who has been raised by two thieves raymond hatton and george barraud and does not know that she has a twin sister who is now a wealthy socialite loretta young as margaret waring one day while she is dining at a chinese restaurant with her two guardians they notice the wealthy socialite and are taken aback at how closely she resembles young hatton and barraud convince young that she should impersonate the socialite so that they can enter her house and steal the contents of her safe young enters the house and meet jack mulhall who senses something different about waring and immediately falls in love with young when night falls young lets hatton and barraud into the house and they attempt to open the safe waring happens to enter the house and is shocked to find a woman that looks like her she is wounded by barraud and young tricks the police into thinking that waring is an imposter and thief even though mulhall knows the truth he keeps quiet beca

In [6]:
#Splitting the full dataset into training and testing sets
(train_plots, test_plots, train_target, test_target) = \
    train_test_split(clean_plots, data["Genre"], test_size = 0.2)

In [7]:
#Finding and displaying the 5000 most common words in the training set
vectorizer = CountVectorizer(analyzer = 'word', max_features = 5000)

vectorizer.fit(train_plots)

print(vectorizer.get_feature_names())



In [8]:
#Transforming training and testing set plot strings into word counts of the above results
train_word_columns = vectorizer.transform(train_plots).toarray()
test_word_columns = vectorizer.transform(test_plots).toarray()

print(train_word_columns)

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


The few cells above (Starting with the cleaning function) are structured to clean the text of the movie plots and prepare them for use within the algorithms below.

In [9]:
#A plain MultinomialNB algorithm
mnb = MultinomialNB()
mnb.fit(train_word_columns, train_target)
preds = mnb.predict(test_word_columns)

print(accuracy_score(preds,test_target))

0.3233018056749785


In [10]:
#A MultinomialNB with adjusted alpha level
mnb = MultinomialNB(alpha = 10)
mnb.fit(train_word_columns, train_target)
preds = mnb.predict(test_word_columns)

print(accuracy_score(preds,test_target))

0.3648609916881628


The poor accuracy of the MultinomialNBs is disappointing, but not completely unexpected. My cell above that contains "data['Genre'].value_counts()" shows that there are many genres that only appear once in the whole dataset, mainly on very old or less well known films. I have no doubt that these oddly specific genre types had a negative impact on my algorithm - if the dataset were to be cleaned in a way that removed these genres and replaced them with something more typical like "action" or "comedy", then it would have performed much better.

In [11]:
#Performing PCA for use with a Support Vector Machine
extractor = PCA(n_components = 2, whiten = True)
extractor.fit(train_word_columns)

print(extractor.explained_variance_ratio_)

train_transformed = extractor.transform(train_word_columns)
test_transformed = extractor.transform(test_word_columns)

[0.54797279 0.05178518]


In [13]:
#A plain Support Vector Machine algorithm
model = LinearSVC()
model.fit(train_transformed, train_target)
predictions = model.predict(test_transformed)

print(accuracy_score(test_target, predictions))

0.1878761822871883




In [14]:
#A Support Vector Machine with adjusted C value and maximum iterations
model = LinearSVC(C = 5.0, max_iter = 2000)
model.fit(train_transformed, train_target)
predictions = model.predict(test_transformed)

print(accuracy_score(test_target, predictions))

0.18586987675551733




These SVMs both have even worse accuracy than the naive bayes algorithms, and it doesn't seem like adjusting C affects the accuracy, nor does raising or lowering the maximum number of iterations. Both of these accuracies are at least slightly higher than the baseline, although not by much. Again with my MultinomialNB algorithms, I'd have to assume that the terrible accuracy is due to the strangely specific genres present in the dataset.

In [15]:
#Out of curiosity, I set up another training and testing set for use with the movie's country of origin as a target
#instead of the genre, since in most cases, the genre's availability in the dataset was related to where it was made
(train_plots2, test_plots2, train_target2, test_target2) = \
    train_test_split(clean_plots, data["Origin/Ethnicity"], test_size = 0.2)

train_word_columns2 = vectorizer.transform(train_plots2).toarray()
test_word_columns2 = vectorizer.transform(test_plots2).toarray()

print(train_word_columns2)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


In [16]:
#A plain MultinomialNB algorithm
mnb = MultinomialNB()
mnb.fit(train_word_columns2, train_target2)
preds = mnb.predict(test_word_columns2)

print(accuracy_score(preds, test_target2))

0.6142161077672685


In [17]:
#Performing PCA for use with a Support Vector Machine
extractor = PCA(n_components = 2, whiten = True)
extractor.fit(train_word_columns2)

print(extractor.explained_variance_ratio_)

train_transformed2 = extractor.transform(train_word_columns2)
test_transformed2 = extractor.transform(test_word_columns2)

[0.54848216 0.05188953]


In [18]:
#A plain Support Vector Machine algorithm
model = LinearSVC()
model.fit(train_transformed2, train_target2)
predictions = model.predict(test_transformed2)

print(accuracy_score(test_target2, predictions))

0.5058756090570364




These two algorithms have results that are roughly the same as the genre models to, at least as far as baseline goes. America accounts for roughly half of the movies' origins (~49%), and my results of 61.4% and 50.6% seem to reflect the improvements from my earlier algorithms raising the genre baseline of ~17% to 36% and 18%, respectively. I am a little bit surprised that the accuracy of these two models was even as good as it is - I couldn't imagine that the plot synopsis would be very telling as far as where the movie was made, but I guess it works well enough to improve upon the baseline.