# Installations

In [1]:
!pip install num2words
!pip install vaderSentiment
!pip install streamlit

Collecting num2words
[?25l  Downloading https://files.pythonhosted.org/packages/eb/a2/ea800689730732e27711c41beed4b2a129b34974435bdc450377ec407738/num2words-0.5.10-py3-none-any.whl (101kB)
[K     |███▎                            | 10kB 10.0MB/s eta 0:00:01[K     |██████▌                         | 20kB 15.7MB/s eta 0:00:01[K     |█████████▊                      | 30kB 11.8MB/s eta 0:00:01[K     |█████████████                   | 40kB 8.7MB/s eta 0:00:01[K     |████████████████▏               | 51kB 5.5MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 5.3MB/s eta 0:00:01[K     |██████████████████████▋         | 71kB 5.9MB/s eta 0:00:01[K     |█████████████████████████▉      | 81kB 6.2MB/s eta 0:00:01[K     |█████████████████████████████   | 92kB 6.0MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 4.3MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10
Collecting vaderSentiment
[?25l  Downloading https://f

# Imports

In [2]:
# Imports

import gensim
import re
import pandas as pd
import numpy as np
import time
import random
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
import nltk
import string
from num2words import num2words
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

import os
SCRIPT_PATH = '/content/content/MyDrive/Star Wars Side Project/Scripts'

import gensim.downloader as api
model = api.load('glove-wiki-gigaword-50') #downloading a large pre-trained model

sw = set()
for word in STOPWORDS: #removing punctuation since we'll be filtering after removing punctuation
  sw.add(word.translate(str.maketrans('', '', string.punctuation)))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Functions

In [3]:
"""
Read in script and return dataframe with character and line for each line in the movie
"""

def read_script(script_name):
  with open(os.path.join(SCRIPT_PATH, script_name), 'r') as file: #SCRIPT_PATH defined above
    script_full = file.read()

  script_full = re.sub(r'\"[0-9]+\"', '', script_full) #reformatting script based on Star Wars styling
  script_full = script_full.replace('"character" "dialogue"\n ', '')

  script = re.findall(r'\"[A-Z]+\"\s\".*\"\n', script_full)
  clean_script = []
  for line in script:
    character, dialogue = line.split('" "')
    character = character.replace('"', '')
    dialogue = dialogue.replace('"', '')
    dialogue = dialogue.replace('\n', '')
    clean_script.append([character, dialogue])

  script_df = pd.DataFrame(clean_script, columns=['character', 'line']) #dataframe with character and line for each line in the movie
  return script_df

In [4]:
"""
Preprocessing script text
"""

def process_text(text):
  text = text.translate(str.maketrans('', '', string.punctuation)) #remove punctuation
  text = re.sub(r'\s\s+', ' ', text) #replace double spaces with single spaces
  text = text.lower() #set to lowercase
  text = text.strip() #strip any additional whitespace
  text = re.sub(r'\d+', lambda x: num2words(int(x.group(0))), text) #replace numbers with string literals

  return [word for word in nltk.word_tokenize(text) if word not in sw] #tokenize and remove stopwords

def process_script(script_df):
  processed_lines = [] #preprocess lines from script and add preprocessed lines back to dataframe
  for i in range(len(script_df)):
    processed_lines.append(process_text(script_df.loc[i, 'line']))
  script_df['processed_lines'] = np.array(processed_lines)
  script_df['line_count'] = script_df.groupby('character')['character'].transform('count')
  return script_df

In [5]:
"""
Create WordCloud
"""

#CODE HERE FOUND AT https://amueller.github.io/word_cloud/auto_examples/a_new_hope.html
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)

def masked_wordcloud(img_path, script_df):
  photo = np.array(Image.open(img_path)) #takes in an image path and dataframe, and generates a wordcloud for that image

  wordcloud_text = ''
  for i in range(len(script_df)):
    wordcloud_text += ' '.join(script_df.loc[i, 'processed_lines'])

  wc = WordCloud(background_color='black', mask=photo, stopwords=sw, margin=10, contour_color='grey', contour_width=2, repeat=True).generate(wordcloud_text)
  # store default colored image
  plt.figure(figsize=(20, 16))
  plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3), interpolation="bilinear")
  plt.axis("off")
  plt.show()

In [6]:
"""
Sentiment Analysis
"""

def sentiment_analysis(script_df):
  sid = SentimentIntensityAnalyzer()
  for i in range(len(script_df)): #sentiment analysis for every line in the script
    vader_scores = sid.polarity_scores(script_df.loc[i, 'line'])
    for key in vader_scores:
      script_df.loc[i, key] = vader_scores[key]
  
  return script_df

def plot_sentiment_time_series(script_df):
  df = script_df[script_df['line_count'] >= 15].reset_index()
  fig = px.line(df, x='index', y='compound', color='character')

  return fig

def sentiment_bar(script_df):
  bar_df = script_df[script_df['line_count'] >= 10] #filtering out characters with fewer than 10 lines
  bar_df = bar_df.groupby('character').mean('compound').reset_index()
  bar_df = bar_df[bar_df['compound'] != 0]
  fig = px.bar(bar_df, x='character', y='compound', color='line_count', color_continuous_scale='Plasma', labels={'character': 'Character', 'compound': 'Average Sentiment', 'line_count': 'Line Count'})

  return fig

In [7]:
"""
Embeddings
"""

def full_embeddings(script_df):
  avg_embeddings = [] #average embeddings of all recognized words in every line
  for i in range(len(script_df)):
    line = script_df.loc[i, 'processed_lines']
    embeddings = np.zeros(shape=(50,), dtype=np.float32)
    recognized_words = 0
    for word in line:
      try:
        embeddings += model[word]
        recognized_words += 1
      except KeyError:
        pass
    avg_embeddings.append(embeddings/recognized_words)
  embeddings = pd.DataFrame(avg_embeddings)
  embeddings = script_df[['character']].merge(embeddings, left_index=True, right_index=True)

  return embeddings

In [8]:
"""
Average Embeddings
"""

def average_embeddings(script_df):
  embeddings = full_embeddings(script_df)
  avg_embeddings = embeddings.groupby('character').mean().dropna() #getting average embeddings for each character for all their lines

  return avg_embeddings

In [9]:
"""
TSNE
"""

def tsne_df(avg_embeddings, script_df):
  explained_var = 0
  i = 2
  while explained_var < 0.975: #find the number of components with PCA which gives more than 97.5% explained variance, then conduct TSNE
    pca = PCA(n_components=i, random_state=6)
    pca.fit(avg_embeddings.to_numpy())
    explained_var = pca.explained_variance_ratio_.sum()
    i += 1
  tsne = TSNE(n_components=2, perplexity=10.0, random_state=6)
  pca = PCA(n_components=i).fit_transform(avg_embeddings.to_numpy(dtype=np.float32))

  values = tsne.fit_transform(pca)
  aggregated_embeddings = pd.DataFrame(values, columns=['x', 'y'], index=avg_embeddings.index)

  return aggregated_embeddings.reset_index().merge(script_df.groupby('character').mean('compound').reset_index(), left_on='character', right_on='character')

def plot_tsne(scatter_df):
  fig = px.scatter(scatter_df, x='x', y='y', text='character', color='compound', range_color=(scatter_df['compound'].quantile(0.15), scatter_df['compound'].quantile(0.85)), labels={'character': 'Character', 'compound': 'Average Sentiment'}, title='TSNE Character Representations in 2D, Sized by Line Count')
  fig.update_traces(marker = {'size': 10+scatter_df['line_count']}, textposition='top center')
  
  return fig

In [10]:
"""
Combining some functions
"""

def read_script_preprocess(script_name):
  script_df = read_script(script_name) #read text
  script_df = process_script(script_df) #process lines
  script_df = sentiment_analysis(script_df) #conduct sentiment analysis
  #goes from reading the script to being ready to get embeddings and/or make plots
  return script_df

# File Creation

In [12]:
from google.colab import drive
drive.mount('content')

Mounted at content


In [13]:
os.listdir(SCRIPT_PATH)

['SW_EpisodeVI.txt', 'SW_EpisodeV.txt', 'SW_EpisodeIV.txt']

In [14]:
# getting sentiment/base statistics, embeddings, aggregated embeddings, and TSNE coordinates from scripts

scripts = pd.DataFrame()
embeds = pd.DataFrame()
avg_embeds = pd.DataFrame()
tsne = pd.DataFrame()

for script in os.listdir(SCRIPT_PATH):
  movie = script.replace('SW_', '')
  movie = movie.replace('.txt', '')
  df = read_script_preprocess(script)
  embeddings = full_embeddings(df)
  avg_embeddings = average_embeddings(df)

  tsne_coords = tsne_df(avg_embeddings, df)
  df.insert(1, 'film', movie)
  embeddings.insert(1, 'film', movie)
  avg_embeddings.insert(1, 'film', movie)
  tsne_coords.insert(1, 'film', movie)

  scripts=pd.concat([scripts, df])
  embeds=pd.concat([embeds, embeddings])
  avg_embeds=pd.concat([avg_embeds, avg_embeddings])
  tsne=pd.concat([tsne, tsne_coords])

In [15]:
scripts

Unnamed: 0,character,film,line,processed_lines,line_count,neg,neu,pos,compound
0,OFFICER,EpisodeVI,Inform the commander that Lord Vader's shuttle...,"[inform, commander, lord, vaders, shuttle, arr...",1,0.0,1.000,0.000,0.0000
1,OPERATOR,EpisodeVI,"Yes, sir.","[yes, sir]",1,0.0,0.270,0.730,0.4019
2,JERJERROD,EpisodeVI,"Lord Vader, this is an unexpected pleasure. W...","[lord, vader, unexpected, pleasure, honored, p...",7,0.0,0.571,0.429,0.8176
3,VADER,EpisodeVI,"You may dispense with the pleasantries, Comman...","[may, dispense, pleasantries, commander, put, ...",43,0.0,0.859,0.141,0.3182
4,JERJERROD,EpisodeVI,"I assure you, Lord Vader, my men are working a...","[assure, lord, vader, men, working, fast]",7,0.0,0.844,0.156,0.3400
...,...,...,...,...,...,...,...,...,...
889,LUKE,EpisodeIV,"Oh, no!",[oh],254,0.0,1.000,0.000,0.0000
890,THREEPIO,EpisodeIV,"Oh, my! Artoo! Can you hear me? Say somethi...","[oh, artoo, hear, say, somethingyou, repair]",119,0.0,1.000,0.000,0.0000
891,TECHNICIAN,EpisodeIV,We'll get to work on him right away.,"[work, right, away]",1,0.0,1.000,0.000,0.0000
892,THREEPIO,EpisodeIV,"You must repair him! Sir, if any of my circui...","[must, repair, sir, circuits, gears, will, hel...",119,0.0,0.748,0.252,0.6588


In [16]:
# exporting CSVs

scripts.to_csv(os.path.join('/content/content/MyDrive/Star Wars Side Project', 'scripts.csv'), index=False)
embeds.to_csv(os.path.join('/content/content/MyDrive/Star Wars Side Project', 'embeds.csv'), index=False)
avg_embeds.to_csv(os.path.join('/content/content/MyDrive/Star Wars Side Project', 'avg_embeds.csv'), index=True)
tsne.to_csv(os.path.join('/content/content/MyDrive/Star Wars Side Project', 'tsne.csv'), index=False)

Line count information

In [17]:
from collections import Counter
total_count = Counter()
film_word_counts = dict()
for film in scripts['film'].unique():
  c = Counter()
  for line in scripts[scripts['film'] == film]['processed_lines']:
    c += Counter(line)
    total_count += Counter(line)
  film_word_counts[film] = pd.DataFrame.from_dict(c, orient='index').reset_index().rename(columns={'index': word, 0: 'count'}).sort_values(by='count', ascending=False)
film_word_counts['all_films'] = pd.DataFrame.from_dict(total_count, orient='index').reset_index().rename(columns={'index': word, 0: 'count'}).sort_values(by='count', ascending=False)

In [18]:
film_word_counts['all_films'].to_csv(os.path.join(SCRIPT_PATH, '..', 'Outputs', 'word_counts.csv'), index=False)

In [19]:
df = pd.DataFrame()

for film in scripts['film'].unique():
  counts = film_word_counts[film]
  counts.insert(1, 'film', film)
  df = pd.concat([df, counts])

In [20]:
df.rename(columns={'no': 'word'}).sort_values(by='count', ascending=False).to_csv('word_counts_by_movie.csv', index=False)