# D. CORPUS

In [1]:
import pandas as pd
import numpy as np
import matplotlib
from glob import glob
import re
import nltk

In [2]:
# Where the raw screenplays live and where derived outputs are read/written.
data_home = 'C:/Users/user/Desktop/24Spring_ETA_Project/Dataset'
data_prefix = 'Star Wars- Episode '
output_dir = 'C:/Users/user/Desktop/24Spring_ETA_Project/Output'
path_prefix = 'starwars-combo'

# Full paths of the six screenplay .rtf files, in episode order.
ep1, ep2, ep3, ep4, ep5, ep6 = (
    f"{data_home}/{data_prefix}{title}.rtf"
    for title in (
        "I - The Phantom Menace (1999)",
        "II - Attack of the Clones (2002)",
        "III - Revenge of the Sith (2005)",
        "IV - A New Hope (1977)",
        "V - The Empire Strikes Back (1980)",
        "VI - Return of the Jedi (1983)",
    )
)

# One token table per episode, read from `<output_dir>/basic/ep<N>-TOKEN.csv`.
ep1_token, ep2_token, ep3_token, ep4_token, ep5_token, ep6_token = (
    pd.read_csv(f"{output_dir}/basic/ep{number}-TOKEN.csv")
    for number in range(1, 7)
)

In [3]:
# ep1_token

In [4]:
# Stamp each episode's token table with its episode number (1-6); movie_id is
# the outermost level of the corpus index built later from these tables.
for episode_number, token_table in enumerate(
    [ep1_token, ep2_token, ep3_token, ep4_token, ep5_token, ep6_token],
    start=1,
):
    token_table['movie_id'] = episode_number

In [5]:
# Show the episode-1 token table, which now carries the movie_id column.
ep1_token

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,term_str,movie_id
0,1.0,0,0,0,A,a,1
1,1.0,0,0,1,vast,vast,1
2,1.0,0,0,2,sea,sea,1
3,1.0,0,0,3,of,of,1
4,1.0,0,0,4,stars,stars,1
...,...,...,...,...,...,...,...
29429,196.0,4,0,0,IRIS,iris,1
29430,196.0,4,0,1,OUT,out,1
29431,196.0,5,0,0,END,end,1
29432,196.0,5,0,1,TITLES,titles,1


In [6]:
# Column names, in order, that are used as the MultiIndex levels of CORPUS.
OHCO = ['movie_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [7]:
# Stack the six episode token tables, discard every row that has a missing
# value in any column, and index the result by the OHCO columns.
CORPUS = (
    pd.concat(
        [ep1_token, ep2_token, ep3_token, ep4_token, ep5_token, ep6_token]
    )
    .dropna()
    .set_index(OHCO)
)

In [8]:
# Rebuild term_str from token_str (overwriting the term_str column loaded from
# the CSVs): lower-case each token, then delete every non-word character and
# underscore. A token made up only of such characters becomes the empty string.
#
# No .dropna() on the right-hand side: column assignment aligns on the index,
# so dropping entries from the Series cannot remove rows from CORPUS (they
# would simply come back as NaN). Rows with missing values were already
# removed when CORPUS was built.
CORPUS['term_str'] = (
    CORPUS['token_str']
    .str.lower()
    .str.replace(r'[\W_]', '', regex=True)
)
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
movie_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0,0,0,A,a
1,1.0,0,0,1,vast,vast
1,1.0,0,0,2,sea,sea
1,1.0,0,0,3,of,of
1,1.0,0,0,4,stars,stars
...,...,...,...,...,...,...
6,135.0,6,0,2,OVER,over
6,135.0,6,0,3,STAR,star
6,135.0,6,0,4,FIELD,field
6,135.0,7,0,0,THE,the


## Manually changing POS for character names
The default POS tagger's vocabulary does not cover the character names, so it does not tag them reliably.\
Therefore, I manually tag a set of character names as NNP: the top 5 characters of each movie (explored in A. Data Exploration) plus some additional character names.

In [9]:
from nltk import PerceptronTagger

# Name fragments that must always be tagged as proper nouns (NNP).
# They are written in lower case because the terms being tagged (term_str) are
# lower-cased. Multi-part names are listed piece by piece ('qui', 'gon', ...).
# To cover another character, add the lower-cased name here.
# NOTE(review): 'don' is kept from the original list -- confirm it is meant as
# a character name and is not a fragment of "don't".
CHARACTER_NAME_TERMS = [
    'luke', 'qui', 'gon', 'anakin', 'jar', 'obi', 'wan', 'padme', 'yoda',
    'mace', 'windu', 'palpatine', 'threepio', '3po', 'han', 'ben', 'leia',
    'lando', 'vader', 'darth', 'skywalker', 'chewbacca', 'kenobi', 'boba',
    'fett', 'dooku', 'jabba', 'artoo', 'don', 'chewie', 'biggs', 'wedge',
]

# Build a tagger and force the NNP tag for every listed term by writing the
# overrides into this tagger object's `tagdict` lookup table.
tagger = PerceptronTagger()
tagger.tagdict.update(dict.fromkeys(CHARACTER_NAME_TERMS, 'NNP'))

## Tokenize & Pos_tag

In [10]:
def tokenize_and_postag(term):
    """Tag a single term and return its ``(term, tag)`` pair.

    The term is tagged on its own, as a one-token sequence, so the tagger has
    no neighbouring words to use as context. Despite the name, no tokenizing
    happens here; the input is already a single term.

    Tagging goes through the notebook's customised ``tagger`` object instead
    of ``nltk.pos_tag``. ``nltk.pos_tag`` obtains a tagger of its own
    internally, so the manual NNP entries added to ``tagger.tagdict`` are only
    guaranteed to be honoured when ``tagger`` itself does the tagging. It also
    avoids constructing a new tagger for every term.

    Parameters
    ----------
    term : str
        The term to tag.

    Returns
    -------
    tuple
        ``(term, pos_tag)`` as produced by ``tagger.tag``.
    """
    return tagger.tag([term])[0]

# Tag every normalised term (the 'term_str' column) and keep the resulting
# (term, tag) tuple in a new 'pos_tags' column.
CORPUS['pos_tags'] = CORPUS['term_str'].map(tokenize_and_postag)

CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos_tags
movie_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.0,0,0,0,A,a,"(a, DT)"
1,1.0,0,0,1,vast,vast,"(vast, NN)"
1,1.0,0,0,2,sea,sea,"(sea, NN)"
1,1.0,0,0,3,of,of,"(of, IN)"
1,1.0,0,0,4,stars,stars,"(stars, NNS)"
...,...,...,...,...,...,...,...
6,135.0,6,0,2,OVER,over,"(over, IN)"
6,135.0,6,0,3,STAR,star,"(star, NN)"
6,135.0,6,0,4,FIELD,field,"(field, NN)"
6,135.0,7,0,0,THE,the,"(the, DT)"


In [11]:
# Unpack the (term, tag) tuples held in pos_tags into two flat columns:
# element 1 is the POS tag, element 0 is the term that was tagged.
CORPUS['pos'] = CORPUS['pos_tags'].str.get(1)
CORPUS['term_str'] = CORPUS['pos_tags'].str.get(0)

In [12]:
# Coarse POS class: the first two characters of the tag (e.g. NNS -> NN).
CORPUS['pos_group'] = CORPUS['pos'].str.slice(stop=2)

In [13]:
# Show the corpus with the new pos and pos_group columns.
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos_tags,pos,pos_group
movie_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,0,0,0,A,a,"(a, DT)",DT,DT
1,1.0,0,0,1,vast,vast,"(vast, NN)",NN,NN
1,1.0,0,0,2,sea,sea,"(sea, NN)",NN,NN
1,1.0,0,0,3,of,of,"(of, IN)",IN,IN
1,1.0,0,0,4,stars,stars,"(stars, NNS)",NNS,NN
...,...,...,...,...,...,...,...,...,...
6,135.0,6,0,2,OVER,over,"(over, IN)",IN,IN
6,135.0,6,0,3,STAR,star,"(star, NN)",NN,NN
6,135.0,6,0,4,FIELD,field,"(field, NN)",NN,NN
6,135.0,7,0,0,THE,the,"(the, DT)",DT,DT


In [14]:
# Spot check: how often does 'jedi' fall into each coarse POS group?
(
    CORPUS
    .value_counts(subset=['term_str', 'pos_group'])
    .sort_index()
    .loc['jedi']
)

pos_group
NN    580
Name: count, dtype: int64

In [15]:
# Spot check: which POS tags does 'force' receive, and how often?
(
    CORPUS
    .value_counts(subset=['term_str', 'pos'])
    .sort_index()
    .loc['force']
)

pos
NN    163
Name: count, dtype: int64

In [16]:
# Spot check: 'luke' is one of the manually overridden names, so every
# occurrence should be counted under NNP.
(
    CORPUS
    .value_counts(subset=['term_str', 'pos'])
    .sort_index()
    .loc['luke']
)

pos
NNP    1528
Name: count, dtype: int64

# Save

In [17]:
# Write the tagged corpus to `<output_dir>/<path_prefix>-CORPUS.csv`; the
# index levels are written out as the leading columns of the file.
CORPUS.to_csv(
    path_or_buf=f"{output_dir}/{path_prefix}-CORPUS.csv",
    index=True,
)