In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [7]:
%matplotlib inline

In [8]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.preprocessing.sequence import pad_sequences

In [9]:
import lucem_illud_2020

In [24]:
from transformers import pipeline

In [25]:
# Allocate a pipeline for sentiment-analysis
nlp_sentiment = pipeline('sentiment-analysis')

HBox(children=(IntProgress(value=0, description='Downloading', max=230, style=ProgressStyle(description_width=…




In [10]:
# read in overall dataframe
movie_df = pd.read_csv("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df.csv") 

In [11]:
# read in dataframes separated by year
movie_df_1940_2020 = pd.read_csv("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_1940_2020.csv") 
movie_df_1940_1960 = pd.read_csv("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_1940_1960.csv")
movie_df_1960_1980 = pd.read_csv("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_1960_1980.csv")
movie_df_1980_2000 = pd.read_csv("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_1980_2000.csv")
movie_df_2000_2020 = pd.read_csv("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/dataframes/movie_df_2000_2020.csv")

In [12]:
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")
model_gpt = AutoModelWithLMHead.from_pretrained("gpt2")

In [13]:
sequence = "Nothing that we like to do more than analyse data all day long and"

input = tokenizer_gpt.encode(sequence, return_tensors="pt")
generated = model_gpt.generate(input, max_length=50)

resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
print(resulting_string)

Nothing that we like to do more than analyse data all day long and then try to figure out what's going on.

"We're not going to be able to do that. We're not going to be able to do that.!


In [14]:
### 1940-1960 ###

In [15]:
from sklearn.model_selection import train_test_split
train_text_1940_1960, test_text_1940_1960 = train_test_split(movie_df_1940_1960['Text'], test_size=0.2)

In [16]:
train_text_1940_1960.head()

74    @@4423653 At 4 .. 00 a.m . on Friday , May 10 ...
76    @@3611996 [?] How many times am I going to tel...
59    @@6520352 Subtitles by Seglora How do you do M...
0     @@6850720 You must do your best tonight Be on ...
5     @@5089097 Extra ! Read all about it ! Again a ...
Name: Text, dtype: object

In [17]:
train_text_1940_1960.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/train_text_1940_1960', header=None, index=None, sep=' ', mode='a')

In [18]:
test_text_1940_1960.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/test_text_1940_1960', header=None, index=None, sep=' ', mode='a')

In [28]:
tokenizer_1940_1960 = AutoTokenizer.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_gpt_1940_1960/")
model_1940_1960 = AutoModelWithLMHead.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_gpt_1940_1960/")

In [38]:
sequence = "The definition of violence is"

input = tokenizer_1940_1960.encode(sequence, return_tensors="pt")
generated = model_1940_1960.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_1940_1960.decode(generated.tolist()[0])
print(resulting_string)

The definition of violence is the act of violence against another person. It is the act of violence against another person. It is the act of violence against another person. It is the act of violence against another person. It is the act of violence"


In [43]:
nlp_sentiment("The definition of violence is the act of violence against another person.")

[{'label': 'NEGATIVE', 'score': 0.99191463}]

In [39]:
sequence = "The definition of violence is"

input = tokenizer_gpt.encode(sequence, return_tensors="pt")
generated = model_gpt.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
print(resulting_string)

The definition of violence is not a matter of violence, but of violence that is not directed at the victim.

The definition of violence is not a matter of violence, but of violence that is not directed at the victim. The definition of"


In [42]:
nlp_sentiment("The definition of violence is not a matter of violence, but of violence that is not directed at the victim.")

[{'label': 'POSITIVE', 'score': 0.9665815}]

In [48]:
sequence = "Weapons are"

input = tokenizer_1940_1960.encode(sequence, return_tensors="pt")
generated = model_1940_1960.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_1940_1960.decode(generated.tolist()[0])
print(resulting_string)

Weapons are not allowed in the barracks. - I'll be back. - I'll be back. - I'll be back. - I'll be back. - I'll be back. - I'll be back. - I'll be back"


In [49]:
nlp_sentiment("Weapons are not allowed in the barracks..")

[{'label': 'NEGATIVE', 'score': 0.9893867}]

In [46]:
sequence = "Weapons are"

input = tokenizer_gpt.encode(sequence, return_tensors="pt")
generated = model_gpt.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
print(resulting_string)

Weapons are not available.

The following weapons are not available.

Weapon Name Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type"


In [47]:
nlp_sentiment("Weapons are not available.")

[{'label': 'NEGATIVE', 'score': 0.99960166}]

In [19]:
### 2000-2020 ###

In [20]:
from sklearn.model_selection import train_test_split
train_text_2000_2020, test_text_2000_2020 = train_test_split(movie_df_2000_2020['Text'], test_size=0.2)

In [21]:
train_text_2000_2020.head()

414    @@6822898 ? ? ? [INAUDIBLE_DIALOGUE] One of my...
312    @@6317249 A century ago , these fields were th...
382    @@6127842 Ana , take my car . I gave you the r...
237    @@3737502 RODRICK : ( WHISPERING ) Greg ? GREG...
82     @@4239336 Spartacus ( 2004 ) - - 25 fps - - - ...
Name: Text, dtype: object

In [22]:
train_text_2000_2020.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/train_text_2000_2020', header=None, index=None, sep=' ', mode='a')

In [23]:
test_text_2000_2020.to_frame().to_csv(r'/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/test_text_2000_2020', header=None, index=None, sep=' ', mode='a')

In [32]:
tokenizer_2000_2020 = AutoTokenizer.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_gpt_2000_2020/")
model_2000_2020 = AutoModelWithLMHead.from_pretrained("/Users/laurenbeard/Desktop/compContentAnalysis/Content-Analysis-2020/final/output/output_gpt_2000_2020")

In [36]:
sequence = "The definition of violence is"

input = tokenizer_2000_2020.encode(sequence, return_tensors="pt")
generated = model_2000_2020.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_2000_2020.decode(generated.tolist()[0])
print(resulting_string)

The definition of violence is that it's a violent act. It's a violent act. It's a violent act. It's a violent act. It's a violent act. It's a violent act. It's a violent act. It"


In [40]:
nlp_sentiment("The definition of violence is that it's a violent act.")

[{'label': 'NEGATIVE', 'score': 0.99274975}]

In [37]:
sequence = "The definition of violence is"

input = tokenizer_gpt.encode(sequence, return_tensors="pt")
generated = model_gpt.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
print(resulting_string)

The definition of violence is not a matter of violence, but of violence that is not directed at the victim.

The definition of violence is not a matter of violence, but of violence that is not directed at the victim. The definition of"


In [41]:
nlp_sentiment("The definition of violence is not a matter of violence, but of violence that is not directed at the victim.")

[{'label': 'POSITIVE', 'score': 0.9665815}]

In [50]:
sequence = "Weapons are"

input = tokenizer_2000_2020.encode(sequence, return_tensors="pt")
generated = model_2000_2020.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_2000_2020.decode(generated.tolist()[0])
print(resulting_string)

Weapons are not allowed. - You're not allowed to use them. - No. - No. - No. - No. - No. - No. - No. - No. - No. - No. - No. - No"


In [54]:
nlp_sentiment("Weapons are not allowed. - You're not allowed to use them.")

[{'label': 'NEGATIVE', 'score': 0.9979433}]

In [52]:
sequence = "Weapons are"

input = tokenizer_gpt.encode(sequence, return_tensors="pt")
generated = model_gpt.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
print(resulting_string)

Weapons are not available.

The following weapons are not available.

Weapon Name Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type Damage Type"


In [55]:
nlp_sentiment("Weapons are not available.")

[{'label': 'NEGATIVE', 'score': 0.99960166}]