In [1]:
import pandas as pd
import re
import string 

## Load up my data from both scripts

In [2]:
# Both scripts are pipe-delimited text files, one row per spoken line.
end_game_data = pd.read_csv("endgame_script.txt", sep="|")
inf_war_data = pd.read_csv("infinity_war_script.txt", sep="|")

## Clean up the data: merge lines spoken by the same character
e.g. "Bruce" is the same character as "Bruce Banner"  

#### Cleaning END GAME Data

In [3]:
# Preview every speaker label containing the literal text "TONY".
end_game_data["character"].loc[lambda labels: labels.str.contains("TONY", regex=False)]

24                  TONY STARK
26                  TONY STARK
27                  TONY STARK
28                  TONY STARK
29                  TONY STARK
                 ...          
1139                TONY STARK
1149                TONY STARK
1153                TONY STARK
1157    TONY STARK (voiceover)
1158     TONY STARK (hologram)
Name: character, Length: 183, dtype: object

As shown above, the same character, Tony Stark, appears under several different labels (e.g. "TONY STARK (voiceover)" and "TONY STARK (hologram)"), so these labels need to be merged into a single name.

In [278]:
""" change the name of characters"""
def change_names(in_df, init_names, new_names):
    """Rename speakers whose label contains a given substring.

    For each position ``i``, every row whose upper-cased ``"character"`` value
    contains ``init_names[i]`` (literal substring, not a regex) gets its
    ``"character"`` set to ``new_names[i]``. Pairs are applied in order, so a
    later pair can overwrite the result of an earlier one.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame with a ``"character"`` column. It is modified in place.
    init_names : sequence of str
        Substrings to search for. Labels are upper-cased before matching, so
        these must be upper-case to match anything.
    new_names : sequence of str
        Replacement labels; ``new_names[i]`` pairs with ``init_names[i]``.

    Returns
    -------
    pandas.DataFrame
        The same ``in_df`` object that was passed in.

    Raises
    ------
    IndexError
        If ``new_names`` is shorter than ``init_names`` (raised when the
        first unpaired pattern is reached; earlier pairs are already applied).
    """
    for position, initial_name in enumerate(init_names):
        new_name = new_names[position]
        # na=False treats rows with a missing label as "no match". Without it
        # the mask contains NaN and .loc refuses to index with it.
        matches = in_df["character"].str.upper().str.contains(
            initial_name, regex=False, na=False
        )
        in_df.loc[matches, "character"] = new_name
    return in_df

In [5]:
# Substrings to look for in the (upper-cased) End Game speaker labels.
# end_game_char_names[i] is replaced by MCU_NAMES[i], so the two lists must
# stay in the same order and have the same length.
# NOTE(review): "ALEXANDER PEIRCE" looks like a misspelling of "PIERCE"; left
# unchanged because the spelling used in the script file is not visible here.
end_game_char_names = ["TONY", "STEVE", "NATASHA", "NEBULA", "SCOTT LANG", 
                            "BRUCE BANNER", "CLINT", "ALEXANDER PEIRCE",
                           "F.R.I.D.A.Y", "RHODEY", "THANOS", "THOR", "GAMORA"]

MCU_NAMES = ["TONY STARK", "STEVE ROGERS", "NATASHA ROMANOFF", "NEBULA", "SCOTT LANG",
             "BRUCE BANNER", "CLINT BARTON", "ALEXANDER PEIRCE", "F.R.I.D.A.Y.", "RHODEY", 
            "THANOS", "THOR", "GAMORA"]

# A single pass through change_names is enough. The hand-written loop that
# used to precede this call did exactly the same renaming a first time.
end_game_data = change_names(end_game_data, end_game_char_names, MCU_NAMES)




In [6]:
# Remove every digit and every parenthesis character from the speaker labels.
# NOTE: the pattern is a character class, so it deletes "(", ")" and digits
# one character at a time. "(voiceover)" becomes "voiceover", and any space
# that preceded a removed "(2014)"-style suffix is left behind.
#
# The result is assigned back to the column. The previous
# `end_game_data.character.replace(..., inplace=True)` modified a Series
# obtained via attribute access, which does not reliably write through to the
# DataFrame (it is chained assignment under pandas Copy-on-Write).
end_game_data["character"] = end_game_data["character"].replace(
    to_replace=r'[(\d\d\d\d)]', value=r'', regex=True
)


In [7]:
#SPECIAL CASES FOR BRUCE BANNER
# Any label containing "MEM" (after upper-casing) and any label that is
# exactly "BRUCE" is folded into "BRUCE BANNER".
# NOTE(review): "MEM" is a broad substring; confirm it only hits the intended
# Bruce Banner labels in the script.
# na=False keeps rows with a missing label out of the mask; a NaN in the mask
# would make .loc raise.
end_game_data.loc[end_game_data["character"].str.upper().str.contains('MEM', regex = False, na = False), 'character'] = "BRUCE BANNER"
end_game_data.loc[end_game_data["character"].str.upper().str.contains('^BRUCE$', regex = True, na = False), 'character'] = "BRUCE BANNER"


#### Cleaning Infinity War Data

In [8]:
# Upper-case the speaker labels before renaming.
inf_war_data['character'] = inf_war_data['character'].str.upper()

# Substring to search for -> replacement label, paired by position.
inf_war_char = [
    "F.R.I.D.A.Y",
    "STRANGE",
]
MCU_NAMES_INF_WAR = [
    "F.R.I.D.A.Y.",
    "DOCTOR STRANGE",
]

inf_war_data = change_names(
    in_df=inf_war_data,
    init_names=inf_war_char,
    new_names=MCU_NAMES_INF_WAR,
)


## Data Table of characters and number of lines they have

In [9]:
# Number of non-null "line" entries per character, as a one-column frame
# indexed by character. The index is written to the file as the first column.
# (The former bare `end_line_count.reset_index()` call discarded its result,
# so it had no effect and has been removed.)
end_line_count = end_game_data.groupby("character")["line"].count().to_frame()
end_line_count.to_csv("end_game_line_count.txt", index = True, header = True)


In [10]:
# Number of non-null "line" entries per character, as a one-column frame
# indexed by character. The index is written to the file as the first column.
# (The former bare `inf_line_count.reset_index()` call discarded its result,
# so it had no effect and has been removed.)
inf_line_count = inf_war_data.groupby("character")["line"].count().to_frame()
inf_line_count.to_csv("infinity_war_line_count.txt", index = True, header = True)

## Outputting cleaned_script

In [11]:
# Persist the cleaned scripts, keeping the row index and the header row.
end_game_data.to_csv(
    "end_game_script_clean.txt",
    header=True,
    index=True,
)
inf_war_data.to_csv(
    "infinity_war_script_clean.txt",
    header=True,
    index=True,
)

## Data table of characters and the words they said

In [214]:
""" Given data frame, return another dataframe with just character names and the words they use and word counts"""
def char_names_word_count(df):
    """Count how often each character says each word.

    Each ``"line"`` is cleaned by replacing runs of the punctuation characters
    ``, . ; @ # ? ! & $`` (plus any spaces that follow) with a single space,
    then lower-cased, stripped and split on whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``"character"`` and ``"line"`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``character``, ``word``, ``word_count``, one row per distinct
        (character, word) pair, sorted by character and then word. Rows with a
        missing character or an empty/missing line contribute nothing.
    """
    # Work on a fresh, uniquely indexed copy so the caller's frame is never
    # written to and duplicate index labels cannot misalign the explode below.
    lines = df[["character", "line"]].reset_index(drop=True)

    words_per_line = (
        lines["line"]
        .str.replace(r"[,.;@#?!&$]+\ *", " ", regex = True)
        .str.lower()
        .str.strip()
        .str.split()
    )

    # One row per spoken word. Empty or missing lines explode to NaN and are
    # dropped.
    tokens = (
        pd.DataFrame({"character": lines["character"], "word": words_per_line})
        .explode("word")
        .dropna(subset=["word"])
    )

    word_counts = (
        tokens.groupby(["character", "word"])
        .size()
        .reset_index(name="word_count")
    )
    return word_counts

In [270]:
from nltk.corpus import stopwords
# English stop-word list from NLTK, read as a module-level global by
# char_names_word_count_no_stop below.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally; confirm the setup step.
stop = stopwords.words('english')

""" Given data frame, return another dataframe with just character names and the words they use and word counts
THIS VERSION OF THE FUNCTION REMOVES STOP WORDS FROM DIALOGUE"""
def char_names_word_count_no_stop(df, stop_words=None):
    """Count how often each character says each word, ignoring stop words.

    Each ``"line"`` is cleaned by replacing runs of the punctuation characters
    ``, . ; @ # ? ! & $`` (plus any spaces that follow) with a single space,
    then lower-cased, stripped and split on whitespace. Words found in the
    stop-word collection are discarded before counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``"character"`` and ``"line"`` columns. It is not modified.
    stop_words : iterable of str, optional
        Lower-case words to exclude. Defaults to the module-level ``stop``
        list, which is what this function always used before the parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``character``, ``word``, ``word_count``, one row per distinct
        (character, word) pair, sorted by character and then word. Rows with a
        missing character or an empty/missing line contribute nothing.
    """
    if stop_words is None:
        stop_words = stop

    # Work on a fresh, uniquely indexed copy so the caller's frame is never
    # written to and duplicate index labels cannot misalign the explode below.
    lines = df[["character", "line"]].reset_index(drop=True)

    words_per_line = (
        lines["line"]
        .str.replace(r"[,.;@#?!&$]+\ *", " ", regex = True)
        .str.lower()
        .str.strip()
        .str.split()
    )

    # One row per spoken word. Empty or missing lines explode to NaN and are
    # dropped.
    tokens = (
        pd.DataFrame({"character": lines["character"], "word": words_per_line})
        .explode("word")
        .dropna(subset=["word"])
    )

    # Vectorised membership test instead of scanning the stop list per word.
    tokens = tokens[~tokens["word"].isin(list(stop_words))]

    word_counts = (
        tokens.groupby(["character", "word"])
        .size()
        .reset_index(name="word_count")
    )
    return word_counts

## Get the word count table into a csv file
## AND store data table locally

In [281]:
# Infinity War: per-character word counts with stop words removed, saved as CSV.
inf_word_count = char_names_word_count_no_stop(inf_war_data)
inf_word_count.to_csv(
    "infinity_war_word_count.csv",
    header=True,
)

# End Game: same table for the second script.
end_word_count = char_names_word_count_no_stop(end_game_data)
end_word_count.to_csv(
    "end_game_word_count.csv",
    header=True,
)

# Top 5 most frequent words for Tony Stark
### Avengers: Infinity War

In [297]:
# Five highest word counts for TONY STARK in Infinity War.
(
    inf_word_count
    .query('character == "TONY STARK"')
    .sort_values(by=['word_count'], ascending=False)
    .head(5)
)

Unnamed: 0,character,word,word_count
2647,TONY STARK,i'm,17
2864,TONY STARK,yeah,11
2657,TONY STARK,know,9
2609,TONY STARK,get,9
2633,TONY STARK,he's,9


### Avengers: Endgame

In [292]:
# Five highest word counts for TONY STARK in End Game.
(
    end_word_count
    .query('character == "TONY STARK"')
    .sort_values(by=['word_count'], ascending=False)
    .head(5)
)

Unnamed: 0,character,word,word_count
3409,TONY STARK,right,19
3175,TONY STARK,got,18
3251,TONY STARK,know,17
3583,TONY STARK,yeah,16
3502,TONY STARK,that's,15


# Top 5 most frequent words for Steve Rogers
### Avengers: Infinity War

In [293]:
# Five highest word counts for STEVE ROGERS in Infinity War.
(
    inf_word_count
    .query('character == "STEVE ROGERS"')
    .sort_values(by=['word_count'], ascending=False)
    .head(5)
)

Unnamed: 0,character,word,word_count
1884,STEVE ROGERS,go,4
1883,STEVE ROGERS,get,3
1893,STEVE ROGERS,i'm,3
1930,STEVE ROGERS,vision,2
1880,STEVE ROGERS,fight,2


### Avengers: Endgame

In [294]:
# Five highest word counts for STEVE ROGERS in End Game.
(
    end_word_count
    .query('character == "STEVE ROGERS"')
    .sort_values(by=['word_count'], ascending=False)
    .head(5)
)

Unnamed: 0,character,word,word_count
2276,STEVE ROGERS,know,15
2238,STEVE ROGERS,get,12
2396,STEVE ROGERS,tony,11
2433,STEVE ROGERS,yeah,8
2371,STEVE ROGERS,stones,8


# Top 5 most frequent words for Thanos
### Avengers: Infinity War

In [295]:
# Five highest word counts for THANOS in Infinity War.
(
    inf_word_count
    .query('character == "THANOS"')
    .sort_values(by=['word_count'], ascending=False)
    .head(5)
)

Unnamed: 0,character,word,word_count
2105,THANOS,one,8
2163,THANOS,stone,5
2073,THANOS,know,4
2008,THANOS,daughter,4
2202,THANOS,what's,4


### Avengers: Endgame

In [296]:
# Five highest word counts for THANOS in End Game.
(
    end_word_count
    .query('character == "THANOS"')
    .sort_values(by=['word_count'], ascending=False)
    .head(5)
)

Unnamed: 0,character,word,word_count
2545,THANOS,stones,7
2492,THANOS,i'm,5
2470,THANOS,done,4
2500,THANOS,know,4
2496,THANOS,inevitable,3
