In [1]:
!pip install nltk



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn, Row
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.clustering import KMeans
from pyspark.ml import feature, regression, Pipeline, pipeline, evaluation, tuning, clustering

import matplotlib.pyplot as plt
import numpy as np
import requests
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
movie_df = spark.read.csv('Filtered_movies_plots.csv', header = True)
movie_df.count()

90556

In [4]:
movie_df = movie_df.na.drop()
movie_df.count()

46767

In [5]:
movie_df = movie_df.filter(movie_df['Release Year'] > 1979)
movie_df.show()

+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+
|Release Year|               Title|Origin/Ethnicity|            Director|                Cast|          Genre|           Wiki Page|                Plot|
+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+
|        2009| (500) Days of Su...|        American|           Marc Webb|Joseph Gordon-Lev...|romantic comedy|https://en.wikipe...|The film is prese...|
|        2006|                 .45|        American|         Gary Lennon|Milla Jovovich, A...|    crime drama|https://en.wikipe...|Big Al (Angus Mac...|
|        2007|              10 MPH|        American|        Hunter Weeks|Josh Caldwell's t...|    documentary|https://en.wikipe...|10 MPH follows th...|
|        2008|           10,000 BC|        American|     Roland Emmerich|Steven St

In [6]:
tokenizer = Tokenizer().setInputCol('Plot').setOutputCol('words')

In [7]:
tokenizer.transform(movie_df).show()

+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+
|Release Year|               Title|Origin/Ethnicity|            Director|                Cast|          Genre|           Wiki Page|                Plot|               words|
+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+
|        2009| (500) Days of Su...|        American|           Marc Webb|Joseph Gordon-Lev...|romantic comedy|https://en.wikipe...|The film is prese...|[the, film, is, p...|
|        2006|                 .45|        American|         Gary Lennon|Milla Jovovich, A...|    crime drama|https://en.wikipe...|Big Al (Angus Mac...|[big, al, (angus,...|
|        2007|              10 MPH|        American|        Hunter Weeks|Josh Caldwell's t...|    documentary|https://en.wikipe...

In [8]:
count_vectorizer_estimator = CountVectorizer().setInputCol('words').setOutputCol('features')

In [9]:
count_vectorizer_transformer = count_vectorizer_estimator.fit(tokenizer.transform(movie_df))

In [10]:
count_vectorizer_transformer.transform(tokenizer.transform(movie_df)).show(truncate=False)

+------------+-----------------------------+----------------+---------------------------+-----------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
count_vectorizer_transformer.vocabulary

['the',
 'a',
 'to',
 'and',
 'of',
 'in',
 'is',
 'his',
 'with',
 'he',
 'her',
 'by',
 'for',
 'that',
 'who',
 'an',
 'on',
 'as',
 'she',
 'at',
 'from',
 'their',
 'has',
 'they',
 'are',
 'after',
 'him',
 'but',
 'when',
 'into',
 'new',
 'one',
 'be',
 'it',
 'while',
 'up',
 'out',
 'two',
 'have',
 'which',
 'was',
 'film',
 'young',
 'about',
 'not',
 'where',
 'been',
 'named',
 'them',
 'will',
 'during',
 'being',
 'man',
 'father',
 'family',
 'life',
 'only',
 'all',
 'school',
 'home',
 'years',
 'find',
 'before',
 'friend',
 'then',
 'first',
 'mother',
 'also',
 'wife',
 'over',
 'begins',
 'police',
 'other',
 'get',
 'takes',
 'this',
 'through',
 'finds',
 'off',
 'tells',
 'three',
 'had',
 'back',
 'take',
 'story',
 'local',
 'time',
 'york',
 'meets',
 'lives',
 'him.',
 'woman',
 'can',
 'high',
 'becomes',
 'former',
 'son',
 'group',
 'having',
 'however,',
 'living',
 'help',
 'car',
 'friends',
 'become',
 'dr.',
 'go',
 'old',
 'goes',
 'own',
 'so',
 

In [12]:
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()

In [13]:
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol('words')\
  .setOutputCol('filtered')

In [14]:
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
  .setInputCol('filtered')\
  .setOutputCol('tf')

In [15]:
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(movie_df)

In [16]:
cv_pipeline.transform(movie_df).show(5)

+------------+--------------------+----------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Release Year|               Title|Origin/Ethnicity|       Director|                Cast|          Genre|           Wiki Page|                Plot|               words|            filtered|                  tf|
+------------+--------------------+----------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        2009| (500) Days of Su...|        American|      Marc Webb|Joseph Gordon-Lev...|romantic comedy|https://en.wikipe...|The film is prese...|[the, film, is, p...|[film, presented,...|(11169,[1,110,141...|
|        2006|                 .45|        American|    Gary Lennon|Milla Jovovich, A...|    crime drama|https://en.wikipe...|Big Al (Angus Mac...|[big, al,

In [17]:
len(cv_pipeline.stages[-1].vocabulary)

11169

In [18]:
idf = IDF(inputCol='tf', outputCol='tfidf', minDocFreq=100)

In [19]:
idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(movie_df)

In [20]:
idf_pipeline.transform(movie_df).show(5)

+------------+--------------------+----------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Release Year|               Title|Origin/Ethnicity|       Director|                Cast|          Genre|           Wiki Page|                Plot|               words|            filtered|                  tf|               tfidf|
+------------+--------------------+----------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        2009| (500) Days of Su...|        American|      Marc Webb|Joseph Gordon-Lev...|romantic comedy|https://en.wikipe...|The film is prese...|[the, film, is, p...|[film, presented,...|(11169,[1,110,141...|(11169,[1,110,141...|
|        2006|                 .45|        American|    Gary Lennon|Mill

In [21]:
tfidf_df = idf_pipeline.transform(movie_df)
tfidf_df = tfidf_df.toPandas()
tfidf_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,words,filtered,tf,tfidf
0,2009,(500) Days of Summer,American,Marc Webb,"Joseph Gordon-Levitt, Zooey Deschanel",romantic comedy,https://en.wikipedia.org/wiki/(500)_Days_of_Su...,The film is presented in a nonlinear narrative...,"[the, film, is, presented, in, a, nonlinear, n...","[film, presented, nonlinear, narrative,, jumpi...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 2.1581263606613175, 0.0, 0.0, 0.0, 0.0, ..."
1,2006,.45,American,Gary Lennon,"Milla Jovovich, Angus Macfadyen, Aisha Tyler, ...",crime drama,https://en.wikipedia.org/wiki/.45_(film),"Big Al (Angus Macfadyen), and his girlfriend K...","[big, al, (angus, macfadyen),, and, his, girlf...","[big, al, (angus, macfadyen),, girlfriend, kat...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.8004596264600101, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2007,10 MPH,American,Hunter Weeks,Josh Caldwell's trip across the United States ...,documentary,https://en.wikipedia.org/wiki/10_MPH,10 MPH follows the progress of Caldwell as he ...,"[10 mph, follows, the, progress, of, caldwell,...","[10 mph, follows, progress, caldwell, rides, s...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 2.1581263606613175, 0.0, 0.0, 0.0, 0.0, ..."
3,2008,"10,000 BC",American,Roland Emmerich,"Steven Strait, Camilla Belle, Cliff Curtis",adventure,"https://en.wikipedia.org/wiki/10,000_BC_(film)","""At about 10,000 BC, a tribe of hunter-gathere...","[""at, about, 10,000, bc,, a, tribe, of, hunter...","[""at, 10,000, bc,, tribe, hunter-gatherers, ca...","(0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.17256303560294, 2.332322255089808..."
4,2000,102 Dalmatians,American,Kevin Lima,"Glenn Close, Gérard Depardieu, Alice Evans","comedy, family",https://en.wikipedia.org/wiki/102_Dalmatians,"After three years in prison, Cruella de Vil ha...","[after, three, years, in, prison,, cruella, de...","[years, prison,, cruella, vil, cured, desire, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
7844,1999,eXistenZ,American,David Cronenberg,"Jennifer Jason Leigh, Jude Law, Ian Holm",science fiction thriller,https://en.wikipedia.org/wiki/EXistenZ,"""In the near-future, biotechnological virtual ...","[""in, the, near-future,, biotechnological, vir...","[""in, near-future,, biotechnological, virtual,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7845,2017,iBoy,American,Adam Randall,"Adam Randall (director), Joe Barton (screenpla...","action, crime",https://en.wikipedia.org/wiki/IBoy,"Living in a gang ridden part of London, teenag...","[living, in, a, gang, ridden, part, of, london...","[living, gang, ridden, london,, teenage, boy, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.8004596264600101, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7846,2017,xXx: Return of Xander Cage,American,D. J. Caruso,"D. J. Caruso (director); F. Scott Frazier, Cha...","action, adventure",https://en.wikipedia.org/wiki/XXx:_Return_of_X...,"""NSA Agent Augustus Gibbons attempts to recrui...","[""nsa, agent, augustus, gibbons, attempts, to,...","[""nsa, agent, augustus, gibbons, attempts, rec...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.8004596264600101, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7847,2005,Æon Flux,American,Karyn Kusama,Charlize Theron,science fiction,https://en.wikipedia.org/wiki/%C3%86on_Flux_(f...,"In 2011, a deadly pathogenic virus has killed ...","[in, 2011,, a, deadly, pathogenic, virus, has,...","[2011,, deadly, pathogenic, virus, killed, 99%...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [22]:
analyzer = SentimentIntensityAnalyzer()

In [23]:
text = tfidf_df['Plot'].tolist()

In [24]:
plots = []
for plot in text:
    plots.append(str(plot))

In [25]:
textlower = [w.lower() for w in plots]

In [26]:
stopwords = nltk.corpus.stopwords.words('english')

In [27]:
stoppedwords = [w for w in textlower if w not in stopwords]

In [28]:
sentences = []
for sentence in stoppedwords:
    sentences.append(nltk.sent_tokenize(sentence))
sentences

[["the film is presented in a nonlinear narrative, jumping between various days within the 500 days of tom and summer's relationship.",
  'there is an on-screen timer showing the day.',
  'the following is a linear summary of the plot.'],
 ['big al (angus macfadyen), and his girlfriend kat (milla jovovich), are small-time crooks dealing in guns and stolen goods in queens, new york city.',
  'they seem to have a fun relationship full of sex and booze.',
  'living in a shabby apartment, they both traffic in illegal handguns, while also cleverly avoiding the nypd and atf.',
  "however, vic (sarah strange), kat's best friend (and ex-lover) still has a crush on her, hates big al and doesn't approve of the relationship."],
 ['10\xa0mph follows the progress of caldwell as he rides a segway scooter across the united states from seattle to boston, stopping at many places along the way to interact with people.',
  'the film focuses on showing the dynamic nature of the us countryside as well as d

In [29]:
vader = []
for plot in sentences:
    for sentence in plot:
        vader.append(analyzer.polarity_scores(sentence))
vader

[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.122, 'neu': 0.878, 'pos': 0.0, 'compound': -0.4939},
 {'neg': 0.0, 'neu': 0.752, 'pos': 0.248, 'compound': 0.5106},
 {'neg': 0.247, 'neu': 0.617, 'pos': 0.136, 'compound': -0.4019},
 {'neg': 0.146, 'neu': 0.615, 'pos': 0.239, 'compound': 0.5994},
 {'neg': 0.045, 'neu': 0.795, 'pos': 0.159, 'compound': 0.6124},
 {'neg': 0.0, 'neu': 0.842, 'pos': 0.158, 'compound': 0.5719},
 {'neg': 0.161, 'neu': 0.839, 'pos': 0.0, 'compound': -0.6597},
 {'neg': 0.172, 'neu': 0.703, 'pos': 0.125, 'compound': -0.296},
 {'neg': 0.0, 'neu': 0.776, 'pos': 0.224, 'compound': 0.6705},
 {'neg': 0.0, 'neu': 0.939, 'pos': 0.061, 'compound': 0.5106},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.156, 'neu': 0.807, 'pos': 0.038, 'compound': -0.8176},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.05

In [30]:
compound = []
for sentiment in vader:
    compound.append(sentiment['compound'])
compound

[0.0,
 0.0,
 0.0,
 -0.4939,
 0.5106,
 -0.4019,
 0.5994,
 0.6124,
 0.5719,
 -0.6597,
 -0.296,
 0.6705,
 0.5106,
 0.0,
 -0.8176,
 0.0,
 -0.0258,
 -0.1655,
 0.0,
 -0.8779,
 -0.9423,
 -0.4939,
 -0.7506,
 -0.886,
 -0.6486,
 0.2263,
 0.0,
 0.0,
 -0.128,
 0.4215,
 -0.5106,
 -0.8534,
 -0.7964,
 -0.6369,
 0.0,
 0.8126,
 0.0,
 0.0,
 0.0,
 0.1531,
 0.6808,
 -0.5423,
 0.0258,
 0.1779,
 -0.128,
 0.0,
 0.5994,
 0.0,
 -0.7096,
 0.7506,
 0.802,
 -0.765,
 -0.8731,
 -0.0772,
 -0.4019,
 -0.6652,
 0.34,
 0.0,
 0.6369,
 -0.4404,
 0.0,
 0.0,
 0.0,
 0.5859,
 0.3612,
 -0.4939,
 -0.9062,
 0.0,
 -0.8519,
 0.0,
 -0.4404,
 -0.4588,
 -0.5719,
 -0.3182,
 0.34,
 0.8047,
 -0.9571,
 -0.5106,
 -0.6249,
 0.0,
 0.0,
 0.1027,
 -0.25,
 -0.8402,
 -0.3182,
 -0.8126,
 0.4215,
 -0.5267,
 0.5994,
 0.25,
 -0.3182,
 0.9337,
 0.0,
 0.0,
 -0.9442,
 0.1531,
 0.5719,
 0.6369,
 0.34,
 -0.7096,
 0.1027,
 -0.34,
 0.4767,
 0.0,
 0.0,
 0.7086,
 0.0,
 0.2732,
 -0.0258,
 -0.802,
 0.0,
 -0.6486,
 0.4215,
 0.0516,
 0.8327,
 0.0772,
 0.0,
 0.5

In [31]:
vader_sentence = []
for plot in sentences:
    for sentence in plot:
        vader_sentence.append(sentence)
vader_sentence

["the film is presented in a nonlinear narrative, jumping between various days within the 500 days of tom and summer's relationship.",
 'there is an on-screen timer showing the day.',
 'the following is a linear summary of the plot.',
 'big al (angus macfadyen), and his girlfriend kat (milla jovovich), are small-time crooks dealing in guns and stolen goods in queens, new york city.',
 'they seem to have a fun relationship full of sex and booze.',
 'living in a shabby apartment, they both traffic in illegal handguns, while also cleverly avoiding the nypd and atf.',
 "however, vic (sarah strange), kat's best friend (and ex-lover) still has a crush on her, hates big al and doesn't approve of the relationship.",
 '10\xa0mph follows the progress of caldwell as he rides a segway scooter across the united states from seattle to boston, stopping at many places along the way to interact with people.',
 'the film focuses on showing the dynamic nature of the us countryside as well as documenting 

In [32]:
lengths = []
for plot in sentences:
    count = 0
    for sentence in plot:
        count += 1
    lengths.append(count)

In [33]:
lengths = np.array(lengths)
lengths2 = np.cumsum(lengths)
lengths2

array([    3,     7,     9, ..., 29325, 29326, 29333])

In [34]:
output = []
prev = 0

for index in lengths2:
    output.append(compound[prev:index])
    prev = index
output

[[0.0, 0.0, 0.0],
 [-0.4939, 0.5106, -0.4019, 0.5994],
 [0.6124, 0.5719],
 [-0.6597, -0.296, 0.6705, 0.5106, 0.0],
 [-0.8176, 0.0, -0.0258],
 [-0.1655],
 [0.0, -0.8779, -0.9423, -0.4939, -0.7506, -0.886, -0.6486],
 [0.2263],
 [0.0, 0.0, -0.128],
 [0.4215, -0.5106, -0.8534, -0.7964, -0.6369],
 [0.0, 0.8126, 0.0, 0.0, 0.0],
 [0.1531, 0.6808],
 [-0.5423, 0.0258, 0.1779, -0.128],
 [0.0],
 [0.5994, 0.0],
 [-0.7096, 0.7506, 0.802, -0.765, -0.8731, -0.0772],
 [-0.4019, -0.6652, 0.34, 0.0, 0.6369],
 [-0.4404],
 [0.0, 0.0],
 [0.0],
 [0.5859, 0.3612, -0.4939],
 [-0.9062, 0.0, -0.8519, 0.0, -0.4404],
 [-0.4588, -0.5719, -0.3182, 0.34, 0.8047],
 [-0.9571, -0.5106, -0.6249, 0.0, 0.0, 0.1027],
 [-0.25, -0.8402, -0.3182, -0.8126],
 [0.4215, -0.5267],
 [0.5994, 0.25, -0.3182, 0.9337, 0.0],
 [0.0, -0.9442],
 [0.1531, 0.5719, 0.6369, 0.34],
 [-0.7096],
 [0.1027, -0.34, 0.4767, 0.0],
 [0.0, 0.7086, 0.0, 0.2732, -0.0258, -0.802, 0.0, -0.6486, 0.4215],
 [0.0516, 0.8327, 0.0772, 0.0, 0.5574],
 [-0.0258, 0.0

In [35]:
average_sentiment = []
for value in output:
    average_sentiment.append(sum(value)/len(value))
tfidf_df['sentiment'] = average_sentiment

In [36]:
tfidf_df = tfidf_df.drop(columns=['Wiki Page', 'Plot', 'words', 'filtered', 'tf'])
tfidf_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,tfidf,sentiment
0,2009,(500) Days of Summer,American,Marc Webb,"Joseph Gordon-Levitt, Zooey Deschanel",romantic comedy,"(0.0, 2.1581263606613175, 0.0, 0.0, 0.0, 0.0, ...",0.000000
1,2006,.45,American,Gary Lennon,"Milla Jovovich, Angus Macfadyen, Aisha Tyler, ...",crime drama,"(1.8004596264600101, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.053550
2,2007,10 MPH,American,Hunter Weeks,Josh Caldwell's trip across the United States ...,documentary,"(0.0, 2.1581263606613175, 0.0, 0.0, 0.0, 0.0, ...",0.592150
3,2008,"10,000 BC",American,Roland Emmerich,"Steven Strait, Camilla Belle, Cliff Curtis",adventure,"(0.0, 0.0, 2.17256303560294, 2.332322255089808...",0.045080
4,2000,102 Dalmatians,American,Kevin Lima,"Glenn Close, Gérard Depardieu, Alice Evans","comedy, family","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-0.281133
...,...,...,...,...,...,...,...,...
7844,1999,eXistenZ,American,David Cronenberg,"Jennifer Jason Leigh, Jude Law, Ian Holm",science fiction thriller,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
7845,2017,iBoy,American,Adam Randall,"Adam Randall (director), Joe Barton (screenpla...","action, crime","(1.8004596264600101, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.238575
7846,2017,xXx: Return of Xander Cage,American,D. J. Caruso,"D. J. Caruso (director); F. Scott Frazier, Cha...","action, adventure","(1.8004596264600101, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-0.329850
7847,2005,Æon Flux,American,Karyn Kusama,Charlize Theron,science fiction,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-0.670500


In [37]:
new_df = spark.createDataFrame(tfidf_df) 
new_df.show()

+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+
|Release Year|               Title|Origin/Ethnicity|            Director|                Cast|          Genre|               tfidf|           sentiment|
+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+
|        2009| (500) Days of Su...|        American|           Marc Webb|Joseph Gordon-Lev...|romantic comedy|(11169,[1,110,141...|                 0.0|
|        2006|                 .45|        American|         Gary Lennon|Milla Jovovich, A...|    crime drama|(11169,[0,11,22,3...| 0.05355000000000003|
|        2007|              10 MPH|        American|        Hunter Weeks|Josh Caldwell's t...|    documentary|(11169,[1,74,91,9...|             0.59215|
|        2008|           10,000 BC|        American|     Roland Emmerich|Steven St

In [38]:
va = feature.VectorAssembler(inputCols=['tfidf', 'sentiment'], outputCol='features0')
ss = feature.StandardScaler(inputCol='features0', outputCol='features') 
pipe = Pipeline(stages=[va, ss])
plots_clustering = pipe.fit(new_df)

In [39]:
clustered = plots_clustering.transform(new_df)
clustered.show()

+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+
|Release Year|               Title|Origin/Ethnicity|            Director|                Cast|          Genre|               tfidf|           sentiment|           features0|            features|
+------------+--------------------+----------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+
|        2009| (500) Days of Su...|        American|           Marc Webb|Joseph Gordon-Lev...|romantic comedy|(11169,[1,110,141...|                 0.0|(11170,[1,110,141...|(11170,[1,110,141...|
|        2006|                 .45|        American|         Gary Lennon|Milla Jovovich, A...|    crime drama|(11169,[0,11,22,3...| 0.05355000000000003|(11170,[0,11,22,3...|(11170,[0,11,22,3...|
|        2007|           

In [40]:
kmeans = KMeans(k=100)

model = kmeans.fit(clustered)

results = model.transform(clustered)
results.cache()

predictions = results.groupBy('prediction').count()

In [41]:
predictions.orderBy(fn.col('count').desc()).take(100)

[Row(prediction=0, count=3070),
 Row(prediction=13, count=708),
 Row(prediction=94, count=232),
 Row(prediction=10, count=231),
 Row(prediction=26, count=152),
 Row(prediction=63, count=144),
 Row(prediction=33, count=144),
 Row(prediction=5, count=143),
 Row(prediction=35, count=130),
 Row(prediction=24, count=117),
 Row(prediction=95, count=115),
 Row(prediction=11, count=112),
 Row(prediction=98, count=112),
 Row(prediction=1, count=107),
 Row(prediction=29, count=107),
 Row(prediction=84, count=102),
 Row(prediction=68, count=101),
 Row(prediction=39, count=99),
 Row(prediction=18, count=98),
 Row(prediction=12, count=97),
 Row(prediction=27, count=91),
 Row(prediction=59, count=91),
 Row(prediction=91, count=89),
 Row(prediction=80, count=89),
 Row(prediction=90, count=88),
 Row(prediction=86, count=86),
 Row(prediction=23, count=84),
 Row(prediction=89, count=84),
 Row(prediction=50, count=83),
 Row(prediction=71, count=82),
 Row(prediction=73, count=80),
 Row(prediction=25, coun

In [42]:
test = results.where(fn.col('prediction') == 21)
test = test.select(fn.col('Title'), fn.col('Release Year'), fn.col('Genre'), fn.col('prediction'))
test.show()

+--------------------+------------+--------------------+----------+
|               Title|Release Year|               Genre|prediction|
+--------------------+------------+--------------------+----------+
|    The Tao of Steve|        2000|              comedy|        21|
|               Venom|        2005|              horror|        21|
|Adventures of Joh...|        2007|martial arts/horr...|        21|
|               Andre|        1994|       family, drama|        21|
|   Anywhere but Here|        1999|               drama|        21|
|             Clubbed|        2009|               drama|        21|
|                 Den|        2001|            thriller|        21|
|   Detroit Rock City|        1999|              comedy|        21|
|    Don Juan DeMarco|        1995|              comedy|        21|
|       Eat the Peach|        1986|              comedy|        21|
|    Extreme Measures|        1996|            thriller|        21|
|  Festival in Cannes|        2001|             