In [1]:
# Deep Neural Nets and Text
## load packages
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# IMPORTANT NOTE
# THE ASSOCIATED GOOGLE COLAB FILE LIVES HERE:
# https://github.com/lrnbeard/Content-Analysis-2020/blob/master/final_deepLearning_170320.ipynb

In [None]:
# Start by loading the dataset and pre-processing it.
# Pre-processing follows steps similar to those used in the past,
# relying on the pre-written modules offered by the transformers package.

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import os
# Select the Keras backend through an environment variable. It is set before the
# keras import below; the cell output reports "Using TensorFlow backend.".
os.environ['KERAS_BACKEND'] = 'tensorflow'
# NOTE(review): pad_sequences is not referenced in any cell visible here --
# confirm it is still needed.
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
import lucem_illud_2020

In [5]:
from transformers import pipeline

In [6]:
# Build the sentiment-analysis pipeline once; the scoring cells further down
# all call this same object.
nlp_sentiment = pipeline(task='sentiment-analysis')

HBox(children=(IntProgress(value=0, description='Downloading', max=230, style=ProgressStyle(description_width=…




In [7]:
# Load the full movie dataframe from CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine with this exact directory layout.
movie_df_path = "/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df.csv"
movie_df = pd.read_csv(movie_df_path)

In [8]:
# Load the two period-specific dataframes (1940-1960 and 2000-2020).
# NOTE(review): both paths are absolute and machine-specific.
movie_df_1940_1960_n75_path = "/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_1940_1960_n75.csv"
movie_df_2000_2020_n75_path = "/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_2000_2020_n75.csv"

movie_df_1940_1960_n75 = pd.read_csv(movie_df_1940_1960_n75_path)
movie_df_2000_2020_n75 = pd.read_csv(movie_df_2000_2020_n75_path)

In [9]:
# Text generation with GPT-2 (using the model's built-in generate function).
from transformers import AutoModelWithLMHead, AutoTokenizer

# One checkpoint name supplies both the tokenizer and the model weights.
gpt2_checkpoint = "gpt2"
tokenizer_gpt = AutoTokenizer.from_pretrained(gpt2_checkpoint)
model_gpt = AutoModelWithLMHead.from_pretrained(gpt2_checkpoint)

In [10]:
# Example generation with the base GPT-2 model: encode a prompt, generate a
# continuation of at most 50 tokens, then decode and print it.
sequence = "Nothing that we like to do more than analyse data all day long and"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed for the
# rest of the kernel session.
input_ids = tokenizer_gpt.encode(sequence, return_tensors="pt")
generated = model_gpt.generate(input_ids, max_length=50)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
print(resulting_string)

Nothing that we like to do more than analyse data all day long and then try to figure out what's going on.

"We're not going to be able to do that. We're not going to be able to do that.!


In [11]:
### 1940-1960 ###
### in the next sections we will train two models
### one to talk like 1940-1960 and the other to talk like 2000-2020
### we can then compare how violence is understood between the two
### and compare sentiment scores

In [12]:
# Hold out 20% of the 1940-1960 texts as a test set.
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it each execution produced a different split.
# (train_test_split is already imported in the notebook's first cell.)
train_text_1940_1960_n75, test_text_1940_1960_n75 = train_test_split(
    movie_df_1940_1960_n75['Text'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Peek at the first five training texts of the 1940-1960 split.
train_text_1940_1960_n75.head(n=5)

64    @@4046500 This is Sans Souci . It means " With...
67    @@6333252 Oh , dear , oh , dear . - Good night...
61    @@3611996 [?] How many times am I going to tel...
63    @@5263092 My name is Allison MacKenzie . Where...
25    @@6458656 [Male_Chorus] [Fades] Drums , BugleF...
Name: Text, dtype: object

In [14]:
# Save the 1940-1960 training texts as a headerless, index-free, space-separated file.
# mode='w' overwrites any existing file. With mode='a', every re-run of this cell
# appended another copy of the rows, so the file no longer matched the current split.
# NOTE(review): absolute, machine-specific output path.
train_text_1940_1960_n75.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/train_text_1940_1960_n75', header=False, index=False, sep=' ', mode='w')

In [15]:
# Save the 1940-1960 test texts as a headerless, index-free, space-separated file.
# mode='w' overwrites any existing file. With mode='a', every re-run of this cell
# appended another copy of the rows, so the file no longer matched the current split.
# NOTE(review): absolute, machine-specific output path.
test_text_1940_1960_n75.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/test_text_1940_1960_n75', header=False, index=False, sep=' ', mode='w')

In [25]:
# reference: https://github.com/lrnbeard/Content-Analysis-2020/blob/master/final_deepLearning_170320.ipynb
#tokenizer_1940_1960_n75 = AutoTokenizer.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_gpt_1940_1960_n75/")
#model_1940_1960_n75 = AutoModelWithLMHead.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_gpt_1940_1960_n75/")

In [21]:
# Load the tokenizer and LM-head model saved under output_1940_FINAL
# (presumably the checkpoint fine-tuned on the 1940-1960 texts -- confirm).
# reference: https://github.com/lrnbeard/Content-Analysis-2020/blob/master/final_deepLearning_170320.ipynb
# NOTE(review): absolute, machine-specific checkpoint path.
checkpoint_dir_1940_1960 = "/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_1940_FINAL/"
tokenizer_1940_1960_n75 = AutoTokenizer.from_pretrained(checkpoint_dir_1940_1960)
model_1940_1960_n75 = AutoModelWithLMHead.from_pretrained(checkpoint_dir_1940_1960)

In [67]:
# Text generation with the 1940-1960 model starts here -- the continuations say
# violence is bad, guns are not allowed, etc.
sequence = "The definition of violence is"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed.
input_ids = tokenizer_1940_1960_n75.encode(sequence, return_tensors="pt")
# NOTE(review): bos/pad/eos are all hard-coded to token id 1 -- confirm that this
# matches the tokenizer's special tokens. `eos_token_ids` is the spelling this call
# was written against; presumably newer transformers releases expect
# `eos_token_id` -- verify against the installed version.
generated = model_1940_1960_n75.generate(input_ids, max_length=41, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_1940_1960_n75.decode(generated.tolist()[0])
print(resulting_string)

The definition of violence is to kill. - What's the use of killing? - I don't know. I don't know what it means to kill. - What's the use of killing?"


In [71]:
# Prompt the 1940-1960 model about guns and print its continuation.
sequence = "Guns are"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed.
input_ids = tokenizer_1940_1960_n75.encode(sequence, return_tensors="pt")
# NOTE(review): bos/pad/eos are all hard-coded to token id 1 -- confirm that this
# matches the tokenizer's special tokens. `eos_token_ids` is the spelling this call
# was written against; presumably newer transformers releases expect
# `eos_token_id` -- verify against the installed version.
generated = model_1940_1960_n75.generate(input_ids, max_length=17, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_1940_1960_n75.decode(generated.tolist()[0])
print(resulting_string)

Guns are not allowed in the house. - No, no, no."


In [73]:
# Prompt the 1940-1960 model with a profane question and print its continuation.
sequence = "Fuck who?"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed.
input_ids = tokenizer_1940_1960_n75.encode(sequence, return_tensors="pt")
# NOTE(review): bos/pad/eos are all hard-coded to token id 1 -- confirm that this
# matches the tokenizer's special tokens. `eos_token_ids` is the spelling this call
# was written against; presumably newer transformers releases expect
# `eos_token_id` -- verify against the installed version.
generated = model_1940_1960_n75.generate(input_ids, max_length=9, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_1940_1960_n75.decode(generated.tolist()[0])
print(resulting_string)

Fuck who? - I'm sorry."


In [None]:
# note that all of these sentiment scores are very negative

In [65]:
# Score the 1940-1960 "violence" continuation. The text is pasted in by hand
# (a trimmed copy of the printed generation), not read from `resulting_string`.
violence_text_1940_1960 = "The definition of violence is to kill. - What's the use of killing? - I don't know. I don't know what it means to kill."
nlp_sentiment(violence_text_1940_1960)

[{'label': 'NEGATIVE', 'score': 0.9986436}]

In [68]:
# Score the 1940-1960 "guns" continuation. The text is pasted in by hand,
# not read from `resulting_string`.
guns_text_1940_1960 = "Guns are not allowed in the house. - No, no, no."
nlp_sentiment(guns_text_1940_1960)

[{'label': 'NEGATIVE', 'score': 0.99901944}]

In [69]:
# Score the 1940-1960 profanity continuation. The text is pasted in by hand,
# not read from `resulting_string`.
profanity_text_1940_1960 = "Fuck who? - I'm sorry."
nlp_sentiment(profanity_text_1940_1960)

[{'label': 'NEGATIVE', 'score': 0.999546}]

In [None]:
### 2000-2020 ###

In [16]:
# Hold out 20% of the 2000-2020 texts as a test set.
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it each execution produced a different split.
# (train_test_split is already imported in the notebook's first cell.)
train_text_2000_2020_n75, test_text_2000_2020_n75 = train_test_split(
    movie_df_2000_2020_n75['Text'],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Peek at the first five training texts of the 2000-2020 split.
train_text_2000_2020_n75.head(n=5)

29    @@6093769 What's his name ? - Bear . - Are you...
7     @@3313049 Good for you , Pete . Good for you ....
17    @@5867452 ( @baby_cries ) ( @baby_cries ) ( CA...
9     @@6759672 Do you think you can know something ...
48    @@4553869 Turn over to the right . You're squi...
Name: Text, dtype: object

In [18]:
# Save the 2000-2020 training texts as a headerless, index-free, space-separated file.
# mode='w' overwrites any existing file. With mode='a', every re-run of this cell
# appended another copy of the rows, so the file no longer matched the current split.
# NOTE(review): absolute, machine-specific output path.
train_text_2000_2020_n75.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/train_text_2000_2020_n75', header=False, index=False, sep=' ', mode='w')

In [19]:
# Save the 2000-2020 test texts as a headerless, index-free, space-separated file.
# mode='w' overwrites any existing file. With mode='a', every re-run of this cell
# appended another copy of the rows, so the file no longer matched the current split.
# NOTE(review): absolute, machine-specific output path.
test_text_2000_2020_n75.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/test_text_2000_2020_n75', header=False, index=False, sep=' ', mode='w')

In [34]:
# reference: https://github.com/lrnbeard/Content-Analysis-2020/blob/master/final_deepLearning_170320.ipynb
#tokenizer_2000_2020_n75 = AutoTokenizer.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_2000_2020_n75/")
#model_2000_2020_n75 = AutoModelWithLMHead.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_2000_2020_n75")

In [22]:
# Load the tokenizer and LM-head model saved under output_2000_FINAL
# (presumably the checkpoint fine-tuned on the 2000-2020 texts -- confirm).
# reference: https://github.com/lrnbeard/Content-Analysis-2020/blob/master/final_deepLearning_170320.ipynb
# Both loaders now receive the same directory string; previously the tokenizer
# path had a trailing slash and the model path did not.
# NOTE(review): absolute, machine-specific checkpoint path.
checkpoint_dir_2000_2020 = "/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_2000_FINAL/"
tokenizer_2000_2020_n75 = AutoTokenizer.from_pretrained(checkpoint_dir_2000_2020)
model_2000_2020_n75 = AutoModelWithLMHead.from_pretrained(checkpoint_dir_2000_2020)

In [55]:
# violence is much more liberally spoken about here!
sequence = "The definition of violence is"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed.
input_ids = tokenizer_2000_2020_n75.encode(sequence, return_tensors="pt")
# NOTE(review): bos/pad/eos are all hard-coded to token id 1 -- confirm that this
# matches the tokenizer's special tokens. `eos_token_ids` is the spelling this call
# was written against; presumably newer transformers releases expect
# `eos_token_id` -- verify against the installed version.
generated = model_2000_2020_n75.generate(input_ids, max_length=13, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_2000_2020_n75.decode(generated.tolist()[0])
print(resulting_string)

The definition of violence is that it's a way of life"


In [64]:
# Prompt the 2000-2020 model about guns and print its continuation.
sequence = "Guns are"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed.
input_ids = tokenizer_2000_2020_n75.encode(sequence, return_tensors="pt")
# NOTE(review): bos/pad/eos are all hard-coded to token id 1 -- confirm that this
# matches the tokenizer's special tokens. `eos_token_ids` is the spelling this call
# was written against; presumably newer transformers releases expect
# `eos_token_id` -- verify against the installed version.
generated = model_2000_2020_n75.generate(input_ids, max_length=14, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_2000_2020_n75.decode(generated.tolist()[0])
print(resulting_string)

Guns are not the only thing that makes you feel good."


In [61]:
# Prompt the 2000-2020 model with a profane question and print its continuation.
sequence = "Fuck who?"

# Named `input_ids` (not `input`) so the builtin input() is not shadowed.
input_ids = tokenizer_2000_2020_n75.encode(sequence, return_tensors="pt")
# NOTE(review): bos/pad/eos are all hard-coded to token id 1 -- confirm that this
# matches the tokenizer's special tokens. `eos_token_ids` is the spelling this call
# was written against; presumably newer transformers releases expect
# `eos_token_id` -- verify against the installed version.
generated = model_2000_2020_n75.generate(input_ids, max_length=16, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

# generate() returns a batch; decode the first (only) sequence.
resulting_string = tokenizer_2000_2020_n75.decode(generated.tolist()[0])
print(resulting_string)

Fuck who? - I'm not gon na be able to find you."


In [None]:
# note that not only was violence spoken about more liberally,
# but the following sentiment scores are also more positive (except for the last),
# and the context is still quite different from the 1940-1960 example

In [52]:
# Score the 2000-2020 "violence" continuation. The text is pasted in by hand
# (an edited copy of the printed generation), not read from `resulting_string`.
violence_text_2000_2020 = "The definition of violence is that it's a way of life."
nlp_sentiment(violence_text_2000_2020)

[{'label': 'POSITIVE', 'score': 0.70927584}]

In [56]:
# Score the 2000-2020 "guns" continuation. The text is pasted in by hand,
# not read from `resulting_string`.
guns_text_2000_2020 = "Guns are not the only thing that makes you feel good."
nlp_sentiment(guns_text_2000_2020)

[{'label': 'POSITIVE', 'score': 0.99196535}]

In [57]:
# Score the 2000-2020 profanity continuation. The text is pasted in by hand,
# not read from `resulting_string`.
profanity_text_2000_2020 = "Fuck who? - I'm not gon na be able to find you."
nlp_sentiment(profanity_text_2000_2020)

[{'label': 'NEGATIVE', 'score': 0.99931544}]