# B. Basic Processing

In [1]:
import os

import matplotlib
import numpy as np
import pandas as pd

In [2]:
# Project locations and naming prefixes.
# NOTE(review): these are absolute, machine-specific paths; they need
# adjusting before the notebook can run on another computer.
data_home = 'C:/Users/user/Desktop/24Spring_ETA_Project/Dataset'
data_prefix = 'Star Wars- Episode '
output_dir = 'C:/Users/user/Desktop/24Spring_ETA_Project/Output'
path_prefix = 'starwars-combo'

# One .rtf script path per episode: <data_home>/<data_prefix><title>.rtf
ep1, ep2, ep3, ep4, ep5, ep6 = (
    f"{data_home}/{data_prefix}{title}.rtf"
    for title in (
        "I - The Phantom Menace (1999)",
        "II - Attack of the Clones (2002)",
        "III - Revenge of the Sith (2005)",
        "IV - A New Hope (1977)",
        "V - The Empire Strikes Back (1980)",
        "VI - Return of the Jedi (1983)",
    )
)

In [3]:
# Index level names of the text hierarchy, outermost first. The chunking
# code slices this list (OHCO[:1], OHCO[:2], ...) to name each table's index.
OHCO = [
    'chap_num',   # chapter level
    'para_num',   # paragraph level
    'sent_num',   # sentence level
    'token_num',  # token level
]

In [4]:
class chunks:
    """Break one screenplay file into chapter/paragraph/sentence/token tables.

    Each stage takes the table produced by the previous one and returns a
    DataFrame whose index levels are named after the leading entries of the
    module-level ``OHCO`` list. Per-episode settings (first content line and
    scene-heading pattern) are chosen by comparing ``self.episode`` with the
    module-level paths ``ep1`` .. ``ep6``.
    """

    def __init__(self, episode):
        # Path of the script file; also used to select per-episode settings.
        self.episode = episode

    def getlines(self):
        """Read the script file into a one-column table of cleaned lines.

        Returns:
            DataFrame indexed by ``line_num`` with a ``line_str`` column. For
            the six known episode paths the leading lines before the actual
            content are dropped; any other path keeps every line.
        """
        # Context manager so the file handle is closed once the lines are read.
        with open(self.episode, 'r', encoding='utf-8-sig') as script_file:
            raw_lines = script_file.readlines()

        LINES = pd.DataFrame(raw_lines, columns=['line_str'])
        LINES.index.name = 'line_num'
        # Replace a backslash followed by newline(s) with a space, then trim.
        LINES['line_str'] = LINES.line_str.str.replace(r'\\\n+', ' ', regex=True).str.strip()

        # Line label at which the actual content starts, per episode.
        content_starts = (
            (ep1, 11), (ep2, 10), (ep3, 7),
            (ep4, 12), (ep5, 11), (ep6, 11),
        )
        for episode_path, first_line in content_starts:
            if self.episode == episode_path:
                LINES = LINES.loc[first_line:]
                break

        # Return an independent frame: chap() adds a column to it, and doing
        # that on a .loc slice is the chained-assignment pattern pandas warns about.
        return LINES.copy()

    def chap(self, LINES):
        """Group lines into chapters delimited by scene-heading lines.

        A new chapter starts at every line matching the episode's heading
        pattern (case-insensitive). Lines before the first heading are kept
        and counted as part of chapter 1.

        Args:
            LINES: table with a ``line_str`` column. A ``chap_num`` column is
                added to it in place.

        Returns:
            DataFrame indexed by ``chap_num`` with a ``chap_str`` column
            holding the chapter's lines joined by newlines and stripped.
        """
        if self.episode == ep4:
            chap_pat = r"^\s*(INTERIOR\:|EXTERIOR\:)"
        elif self.episode == ep3:
            chap_pat = r"^\s*\d+\s+(INT\.|EXT\.)"
        elif self.episode == ep6:
            # e.g. 11(tab)INT JABBA'S THRONE ROOM(tab)(tab)(tab)(tab)11
            chap_pat = r"^\s*\d+\s+(INT|EXT)"
        else:
            chap_pat = r"^\s*(INT\.|EXT\.)"

        is_heading = LINES.line_str.str.match(chap_pat, case=False, na=False).astype(bool)

        # Running count of headings = chapter number of each line. The count is
        # 0 before the first heading; clip folds those lines into chapter 1
        # instead of discarding them. Kept as float, as the column was before.
        LINES['chap_num'] = is_heading.cumsum().clip(lower=1).astype(float)

        CHAPS = (
            LINES.groupby(OHCO[:1])
            .line_str.agg('\n'.join)
            .to_frame('chap_str')
        )
        CHAPS['chap_str'] = CHAPS.chap_str.str.strip()
        return CHAPS

    def par(self, CHAPS):
        """Split each chapter into paragraphs at runs of two or more newlines.

        Args:
            CHAPS: table with a ``chap_str`` column.

        Returns:
            DataFrame indexed by ``OHCO[:2]`` with a ``para_str`` column;
            single newlines inside a paragraph become spaces and empty
            paragraphs are removed.
        """
        para_pat = r'\n\n+'

        PARAS = (
            CHAPS['chap_str'].str.split(para_pat, expand=True)
            .stack()
            # expand=True pads short rows with missing values; drop them
            # explicitly rather than relying on stack() to do it.
            .dropna()
            .to_frame('para_str')
            .sort_index()
        )
        PARAS.index.names = OHCO[:2]

        PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
        PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]  # Remove empty paragraphs
        return PARAS

    def sent(self, PARAS):
        """Split each paragraph into sentences at runs of ``. ? ! ; :``.

        Args:
            PARAS: table with a ``para_str`` column.

        Returns:
            DataFrame indexed by ``OHCO[:3]`` with a stripped ``sent_str``
            column; empty sentences are removed.
        """
        sent_pat = r'[.?!;:]+'

        SENTS = (
            PARAS['para_str'].str.split(sent_pat, expand=True)
            .stack()
            .dropna()  # padding from expand=True
            .to_frame('sent_str')
        )
        SENTS.index.names = OHCO[:3]

        # Strip first, then filter: avoids assigning onto a filtered slice and
        # leaves no whitespace-only sentences to become blank tokens later.
        SENTS['sent_str'] = SENTS.sent_str.str.strip()
        SENTS = SENTS[SENTS.sent_str.ne('')]  # Remove empty sentences

        return SENTS

    def token(self, SENTS):
        """Split each sentence into tokens and derive a normalized term.

        Sentences are split at runs of whitespace, apostrophes, commas and
        hyphens. ``term_str`` is the token with non-word characters and
        underscores removed, lower-cased.

        Args:
            SENTS: table with a ``sent_str`` column.

        Returns:
            DataFrame indexed by ``OHCO[:4]`` with ``token_str`` and
            ``term_str`` columns. Rows whose term is empty or purely numeric
            are removed.
        """
        token_pat = r"[\s',-]+"

        TOKENS = (
            SENTS['sent_str'].str.split(token_pat, expand=True)
            .stack()
            .dropna()  # padding from expand=True
            .to_frame('token_str')
        )
        TOKENS.index.names = OHCO[:4]
        TOKENS['term_str'] = TOKENS.token_str.str.replace(r'[\W_]+', '', regex=True).str.lower()

        # A sentence that starts or ends with a separator yields an empty
        # token, and a punctuation-only token yields an empty term; without
        # this filter the vocabulary gets a '' entry.
        has_term = TOKENS.term_str.ne('')
        is_number = TOKENS.term_str.str.isnumeric()
        return TOKENS[has_term & ~is_number]

    def vocab(self, TOKENS):
        """Count how often each term occurs.

        Args:
            TOKENS: table with a ``term_str`` column.

        Returns:
            DataFrame indexed by ``term_id`` with columns ``term_str`` and
            ``n`` (occurrence count), in ``value_counts`` order.
        """
        # rename_axis + reset_index(name=...) names both columns explicitly,
        # so the result does not depend on what value_counts calls its index.
        VOCAB = (
            TOKENS.term_str.value_counts()
            .rename_axis('term_str')
            .reset_index(name='n')
        )
        VOCAB.index.name = 'term_id'
        return VOCAB

    def process(self):
        """Run every stage for this episode.

        Returns:
            dict with keys ``'lines'``, ``'chapters'``, ``'paragraphs'``,
            ``'sentences'``, ``'tokens'`` and ``'vocabulary'``. The lines
            table includes the ``chap_num`` column added by ``chap``.
        """
        ep_lines = self.getlines()
        ep_chaps = self.chap(ep_lines)
        ep_pars = self.par(ep_chaps)
        ep_sents = self.sent(ep_pars)
        ep_tokens = self.token(ep_sents)
        ep_vocab = self.vocab(ep_tokens)
        return {
            'lines': ep_lines,
            'chapters': ep_chaps,
            'paragraphs': ep_pars,
            'sentences': ep_sents,
            'tokens': ep_tokens,
            'vocabulary': ep_vocab
        }

In [5]:
# Script paths in episode order.
episode_files = [ep1, ep2, ep3, ep4, ep5, ep6]

# episode_data['epN'] maps each processing stage name to that episode's table.
episode_data = {}

for i, episode_file in enumerate(episode_files, start=1):
    episode_chunk = chunks(episode_file)
    processed = episode_chunk.process()  # dict with one table per stage

    # Store a fresh dict holding the six stage tables under the episode key.
    episode_data[f'ep{i}'] = {
        stage: processed[stage]
        for stage in ('lines', 'chapters', 'paragraphs',
                      'sentences', 'tokens', 'vocabulary')
    }

In [6]:
# NOTE(review): this cell used to repeat the "Save tokens as outputs" cell
# below line for line, so every token CSV was written twice per run. The
# export is left to that cell; this one only binds the per-episode names.
ep1_token = episode_data['ep1']['tokens']
ep2_token = episode_data['ep2']['tokens']
ep3_token = episode_data['ep3']['tokens']
ep4_token = episode_data['ep4']['tokens']
ep5_token = episode_data['ep5']['tokens']
ep6_token = episode_data['ep6']['tokens']

## Save tokens as outputs

In [7]:
# Write each episode's token table to <output_dir>/basic/<episode>-TOKEN.csv.
basic_dir = f"{output_dir}/basic"
os.makedirs(basic_dir, exist_ok=True)  # to_csv does not create a missing folder

for ep_key, ep_tables in episode_data.items():
    ep_tables['tokens'].to_csv(f"{basic_dir}/{ep_key}-TOKEN.csv")

# Per-episode names, kept in case later cells refer to them.
ep1_token, ep2_token, ep3_token, ep4_token, ep5_token, ep6_token = (
    episode_data[f'ep{n}']['tokens'] for n in range(1, 7)
)

## Save vocabularies as outputs

In [8]:
# Write each episode's vocabulary table to <output_dir>/basic/<episode>-VOCAB.csv.
basic_dir = f"{output_dir}/basic"
os.makedirs(basic_dir, exist_ok=True)  # to_csv does not create a missing folder

for ep_key, ep_tables in episode_data.items():
    ep_tables['vocabulary'].to_csv(f"{basic_dir}/{ep_key}-VOCAB.csv")

# Per-episode names, kept in case later cells refer to them.
ep1_vocab, ep2_vocab, ep3_vocab, ep4_vocab, ep5_vocab, ep6_vocab = (
    episode_data[f'ep{n}']['vocabulary'] for n in range(1, 7)
)

## Save sentences as outputs

In [9]:
# Write each episode's sentence table to <output_dir>/basic/<episode>-SENT.csv.
basic_dir = f"{output_dir}/basic"
os.makedirs(basic_dir, exist_ok=True)  # to_csv does not create a missing folder

for ep_key, ep_tables in episode_data.items():
    ep_tables['sentences'].to_csv(f"{basic_dir}/{ep_key}-SENT.csv")

# Per-episode names, kept in case later cells refer to them.
ep1_sent, ep2_sent, ep3_sent, ep4_sent, ep5_sent, ep6_sent = (
    episode_data[f'ep{n}']['sentences'] for n in range(1, 7)
)