In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# URL of the IMDb full credits page
url = 'https://www.imdb.com/title/tt0386676/fullcredits/?ref_=tt_cl_sm'

# Send a GET request to the IMDb page. The timeout makes a stalled connection
# raise a requests exception instead of blocking the cell indefinitely.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code != 200:
    print("Failed to retrieve the IMDb page")
else:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the 'Series Cast' section
    cast_list = soup.find('table', class_='cast_list')

    if cast_list is None:
        # soup.find() returns None when nothing matches; report that instead of
        # failing later with AttributeError on cast_list.find_all().
        print("Could not find the cast table (class 'cast_list') on the IMDb page")
    else:
        # Lists to store the scraped data
        actors, full_info = [], []

        # Only rows with at least four cells are used: cell 1 is read as the
        # actor name and cell 3 as the full character info.
        for row in cast_list.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) > 3:
                actors.append(cols[1].get_text(strip=True))
                # NOTE(review): get_text(strip=True) joins the cell's text nodes
                # without any separator, so a character name ending in a digit
                # may run straight into the episode count — verify against the
                # page and the regex below.
                full_info.append(cols[3].get_text(strip=True))

        # Create a DataFrame with the initial data
        df = pd.DataFrame({
            'Actor': actors,
            'FullInfo': full_info
        })

        # Regular expression with the named groups Character, Episodes_Count and
        # Years; each named group becomes one column of the extracted frame.
        pattern = r'(?P<Character>[^\d/]+?)(?:/ ...)?\s*(?P<Episodes_Count>\d+)?\s*episodes?,?\s*(?P<Years>\d{4}(?:-\d{4})?)?(?:\(uncredited\))?'

        # Extract the pattern into new columns (rows without a match become NaN)
        df_extracted = df['FullInfo'].str.extract(pattern)

        # Concatenate the new columns to the actor column
        df_final = pd.concat([df['Actor'], df_extracted], axis=1)

        # Define the path for the CSV file
        csv_file_path = 'C:\\Users\\Lado\\Google Drive\\SocialGraphs\\Untitled Folder\\cast_details.csv'

        # Save the DataFrame to a CSV file
        df_final.to_csv(csv_file_path, index=False)
        print(f"CSV file has been saved successfully to {csv_file_path}.")

CSV file has been saved successfully to C:\Users\Lado\Google Drive\SocialGraphs\Untitled Folder\cast_details.csv.


In [2]:
# Read the cast details CSV and, in the same step, coerce 'Episodes_Count' to
# a numeric column; values that cannot be parsed become NaN (errors='coerce').
df = pd.read_csv(
    'C:/Users/Lado/Google Drive/SocialGraphs/Untitled Folder/cast_details.csv'
).assign(
    Episodes_Count=lambda frame: pd.to_numeric(frame['Episodes_Count'], errors='coerce')
)

# Define a function to extract the first name and last name
def split_name(name):
    """Split a full name into ``(first_name, last_name)``.

    The first whitespace-separated word is the first name; all remaining
    words, joined by single spaces, form the last name.

    Args:
        name: The full name. Anything that is not a string (for example the
            float NaN that pandas uses for a missing value, or None) is
            treated as an empty name instead of raising AttributeError.

    Returns:
        tuple[str, str]: ``(first_name, last_name)``. Either element is ``''``
        when that part of the name is absent.
    """
    if not isinstance(name, str):
        return '', ''

    parts = name.split()
    if not parts:
        return '', ''

    # ' '.join() gives '' for no remaining words and the word itself for a
    # single one, so one expression covers every case.
    return parts[0], ' '.join(parts[1:])

# Apply split_name to the 'Character' column and store the two parts as
# separate columns. The pairs are loaded into a two-column frame rather than
# unpacked with zip(*...), which raises ValueError when the frame has no rows.
name_parts = pd.DataFrame(
    df['Character'].apply(split_name).tolist(),
    index=df.index,
    columns=['Character_Name', 'Character_Surname'],
)
df['Character_Name'] = name_parts['Character_Name']
df['Character_Surname'] = name_parts['Character_Surname']

# Keep only the rows whose 'Episodes_Count' is at least 3. Rows with a NaN
# count are dropped too, because NaN >= 3 evaluates to False.
df_filtered = df[df['Episodes_Count'] >= 3]
#df_filtered.to_csv('path_to_save_filtered_csv.csv', index=False)
df_filtered

Unnamed: 0,Actor,Character,Episodes_Count,Years,Character_Name,Character_Surname
0,Rainn Wilson,Dwight Schrute,188.0,2005-2013,Dwight,Schrute
1,John Krasinski,Jim Halpert,188.0,2005-2013,Jim,Halpert
2,Jenna Fischer,Pam Beesly,188.0,2005-2013,Pam,Beesly
3,Leslie David Baker,Stanley Hudson,188.0,2005-2013,Stanley,Hudson
4,Brian Baumgartner,Kevin Malone,188.0,2005-2013,Kevin,Malone
...,...,...,...,...,...,...
430,Julia Cho,Asian Woman #,11.0,2013,Asian,Woman #
440,Michael Kaiser,Underage Kid #,11.0,2007,Underage,Kid #
503,Matt Prokop,Underage Kid #,31.0,2007,Underage,Kid #
581,Erica Mer,Blue Shirted Kid #,21.0,2010,Blue,Shirted Kid #


In [3]:
# First names of the filtered characters, in their original order.
# Repeated first names are collapsed to one entry so that each name is looked
# up only once by the cells that consume this list, and empty / non-string
# values are skipped because they cannot be used as a search term.
character_names_list = [
    first_name
    for first_name in dict.fromkeys(df_filtered['Character_Name'].tolist())
    if isinstance(first_name, str) and first_name
]
#character_names_list

In [4]:
import os
import re
from collections import defaultdict

def _split_transcript_by_speaker(content):
    """Split raw transcript text into ``(speaker, speech)`` pairs, in order.

    A speaker label is the last word before a ':' (optionally separated from
    the colon by whitespace). A colon whose preceding word is purely numeric
    (e.g. the one in "10:30"), or that has no preceding word at all, is not a
    label and stays part of the current speech. Text that appears before the
    first label has no speaker and is discarded.
    """
    segments = []
    speaker = None
    pending = []
    parts = content.split(':')
    last_index = len(parts) - 1

    for index, part in enumerate(parts):
        label = None
        text = part
        if index < last_index:
            # Every part except the last one was followed by a colon.
            match = re.search(r'\b(\w+)\s*$', part)
            if match and not match.group(1).isdigit():
                label = match.group(1)
                text = part[:match.start()]
            else:
                # Not a speaker label: put the colon back into the speech.
                text = part + ':'
        pending.append(text)

        if label is not None or index == last_index:
            # A new label (or the end of the text) closes the current speech.
            if speaker is not None:
                segments.append((speaker, ''.join(pending).strip()))
            speaker = label
            pending = []

    return segments


def extract_character_dialogues(folder_path):
    """Split every ``.txt`` transcript in a folder into per-speaker text files.

    Each speech is attributed to the label that precedes it, i.e. the text
    between ``Name:`` and the next speaker label (or the end of the file).
    All speeches of one speaker are joined with single spaces and written to
    ``<folder_path>/character_text/<Name>.txt`` (UTF-8); the folder is created
    when missing. Transcripts are processed in sorted file-name order so the
    output is reproducible.

    Args:
        folder_path: Directory containing the transcript ``.txt`` files.

    Returns:
        A ``dict_keys`` view of the speaker names that were found.

    Raises:
        OSError: If the folder or one of the files cannot be read or written.
    """
    # Dictionary to store dialogues for each character
    character_dialogues = defaultdict(list)

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        for speaker, speech in _split_transcript_by_speaker(content):
            character_dialogues[speaker].append(speech)

    # Create the output folder if it doesn't exist
    character_text_folder = os.path.join(folder_path, "character_text")
    os.makedirs(character_text_folder, exist_ok=True)

    # Write each character's dialogues to separate text files
    for character, dialogues in character_dialogues.items():
        character_file_path = os.path.join(character_text_folder, f"{character}.txt")
        with open(character_file_path, 'w', encoding='utf-8') as file:
            file.write(' '.join(dialogues))

    return character_dialogues.keys()

# Example usage: run the extraction on the transcript folder and print the
# speaker names the function returns.
folder_path = "C:/Users/Lado/Google Drive/SocialGraphs/Untitled Folder/episode_transcripts_pt1"  # Replace with your folder path
character_names = extract_character_dialogues(folder_path)
print(character_names)

dict_keys(['Michael', 'Erin', 'Jim', 'Dwight', 'Oscar', 'Angela', 'Pam', 'Donna', 'Gabe', 'Darryl', 'Kelly', 'Kevin', 'Phyllis', 'Creed', 'Ryan', 'Andy', 'Mihael', 'Stanley', 'Hide', 'All', 'Meredith', 'GodMichael', 'lady', 'Glen', 'MichaelMichael', 'hmmAndy', 'Instructor', 'instructor', 'Toby', 'Everyone', 'ingMichael', 'Lawyer', 'custodyLawyer', 'bumMichael', 'Shane', 'Reporter', 'Packer', 'Jo', 'Nick', 'Realtor', 'Wallace', 'Hank', 'wrote', 'this', 'together', 'Luke', 'Both', '25Dwight', 'Mose', 'OkJim', 'guy', 'Angel', 'Salesman', '10Erin', 'Usher', 'Shelby', 'Todd', '10Dwight', 'Man', 'Son', 'Nate', 'Michel', 'SDonna', 'Holly', 'Jan', 'Astrid', 'Woman', 'Helene', 'Carroll', 'relationshipsCarroll', 'Carrol', '29Michael', 'Danny', 'Receptionist', 'Steve', '45Andy', 'tomorrow', '15Pam', 'congregation', 'Pastor', 'member', 'Lady', 'Doug', 'Girl', 'Maw', 'Students', 'Guy', 'MeeMaw', 'Carla', 'Dad', 'driver', 'Radio', '20Erin', 'Cece', 'Daryl', 'TV', '51Dwight', 'Meridith', 'RightKevin'

In [6]:
import os
import re
from collections import defaultdict

def extract_dialogues_by_names(folder_path, names):
    """Collect the text following ``<name>:`` for every given name.

    For each ``.txt`` file in ``folder_path`` (processed in sorted order) and
    each name, every occurrence of ``<name>:`` is located and the text up to
    the next ``word:`` (or the end of the file) is captured. Newlines are
    replaced by spaces. The text of each name is written to
    ``<folder_path>/name_dialogs/<name>_dialog.txt`` (UTF-8); a name without
    any match still gets an (empty) file.

    Args:
        folder_path: Directory containing the transcript ``.txt`` files.
        names: Iterable of speaker names. Duplicates are processed once, and
            empty or non-string entries are ignored.

    Returns:
        list[str]: The names that were processed, in first-seen order. The
        list is empty when the folder contains no ``.txt`` file.

    Raises:
        OSError: If the folder or one of the files cannot be read or written.
    """
    # A repeated name would otherwise have its text appended once per repeat.
    unique_names = [
        name for name in dict.fromkeys(names) if isinstance(name, str) and name
    ]

    # Compile one pattern per name, once, instead of once per file.
    patterns = {}
    for name in unique_names:
        # When another known name ends with this one (e.g. "Troy" / "Roy"),
        # a negative lookbehind keeps "Troy:" from also counting as "Roy:".
        guards = ''.join(
            f"(?<!{re.escape(other[:-len(name)])})"
            for other in unique_names
            if other != name and other.endswith(name)
        )
        patterns[name] = re.compile(
            rf"{guards}{re.escape(name)}:(.*?)(?=\w+:|$)", re.DOTALL
        )

    # name -> list of non-empty text chunks, one chunk per file
    collected = defaultdict(list)

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        for name, pattern in patterns.items():
            speech = ' '.join(pattern.findall(content)).replace('\n', ' ')
            chunks = collected[name]  # registers the name even without a match
            if speech:
                chunks.append(speech)

    # Create a new folder for the dialogues if it doesn't exist
    dialogues_folder = os.path.join(folder_path, "name_dialogs")
    os.makedirs(dialogues_folder, exist_ok=True)

    # Write each name's dialogues to a separate text file; chunks from
    # different files are separated by a space so words are not glued together.
    for name, chunks in collected.items():
        dialogue_file_path = os.path.join(dialogues_folder, f"{name}_dialog.txt")
        with open(dialogue_file_path, 'w', encoding='utf-8') as file:
            file.write(' '.join(chunks))

    return list(collected.keys())

# Example usage: collect the dialogue of every name in character_names_list
# (the first names taken from the filtered cast table) and print the names
# the function returns.
folder_path = "C:/Users/Lado/Google Drive/SocialGraphs/Untitled Folder/episode_transcripts_pt1"  # Replace with the actual folder path
#names_list = ["Michael", "Oscar", "Pam"]  # Replace with your list of names
extracted_names = extract_dialogues_by_names(folder_path, character_names_list)
print(extracted_names)

['Dwight', 'Jim', 'Pam', 'Stanley', 'Kevin', 'Angela', 'Meredith', 'Phyllis', 'Creed', 'Oscar', 'Ryan', 'Kelly', 'Andy', 'Michael', 'Toby', 'Darryl', 'Erin', 'Gabe', 'Jan', 'David', 'Nellie', 'Roy', 'Karen', 'Robert', 'Bob', 'Hank', 'Pete', 'Clark', 'Nate', 'Holly', 'Todd', 'Calvin', 'Val', 'State', 'Mose', 'Cathy', 'Helene', 'Jo', 'Hidetoshi', 'Josh', 'Charles', 'Carol', 'Madge', 'Hannah', 'Colin,', 'Brian', 'Donna', 'Nick', 'Gino', 'Esther', 'Teri', 'Cecelia', 'Billy', 'Deangelo', 'Gil', 'Stephanie', 'Mr.', 'Jessica', 'Leo', 'Athlead', 'Rolf', 'Isaac', 'The', 'Lynn', 'Isabel', 'Troy', 'Cynthia', 'Justin', 'Katy', 'Lonny', 'A.', 'Matt', 'Dan', 'Hunter', 'Kenny', 'Jordan', 'Tony', 'Irene', 'Jake', 'Ravi', 'Zeke', 'Sasha', 'Elizabeth', 'Gerald', 'Jada', 'Tom', 'Policeman', 'Tall', 'Cop', 'Flower', 'Orderly', 'Asian', 'Underage', 'Blue']


In [7]:
import os

def remove_empty_files(folder_path):
    """Delete every zero-byte ``.txt`` file directly inside ``folder_path``.

    Emptiness is decided from the file size on disk; one line is printed for
    each file that gets removed. Entries without the ``.txt`` suffix are left
    alone.
    """
    txt_entries = (entry for entry in os.listdir(folder_path) if entry.endswith(".txt"))

    for entry in txt_entries:
        target = os.path.join(folder_path, entry)

        # Anything with content is kept
        if os.path.getsize(target) != 0:
            continue

        os.remove(target)
        print(f"Removed empty file: {entry}")

# Example usage: delete the zero-byte dialogue files from the name_dialogs
# folder (one line is printed per removed file).
dialogues_folder = "C:/Users/Lado/Google Drive/SocialGraphs/Untitled Folder/episode_transcripts_pt1/name_dialogs"  # Replace with the actual folder path
remove_empty_files(dialogues_folder)

Removed empty file: A._dialog.txt
Removed empty file: Asian_dialog.txt
Removed empty file: Athlead_dialog.txt
Removed empty file: Blue_dialog.txt
Removed empty file: Cecelia_dialog.txt
Removed empty file: Charles_dialog.txt
Removed empty file: Colin,_dialog.txt
Removed empty file: Cop_dialog.txt
Removed empty file: Cynthia_dialog.txt
Removed empty file: Dan_dialog.txt
Removed empty file: Elizabeth_dialog.txt
Removed empty file: Flower_dialog.txt
Removed empty file: Gerald_dialog.txt
Removed empty file: Gil_dialog.txt
Removed empty file: Gino_dialog.txt
Removed empty file: Hannah_dialog.txt
Removed empty file: Hidetoshi_dialog.txt
Removed empty file: Hunter_dialog.txt
Removed empty file: Isabel_dialog.txt
Removed empty file: Josh_dialog.txt
Removed empty file: Justin_dialog.txt
Removed empty file: Katy_dialog.txt
Removed empty file: Leo_dialog.txt
Removed empty file: Lonny_dialog.txt
Removed empty file: Lynn_dialog.txt
Removed empty file: Madge_dialog.txt
Removed empty file: Matt_dialog

In [8]:
######################################################################################################################

In [9]:
import re
from collections import defaultdict

# character_names_list (built in an earlier cell) is the list of character names you're interested in
#names_list = ["Michael", "Oscar", "Pam", ...]  # Replace with your actual list of names

# Function to find connections
def find_connections(folder_path, character_names_list, window=3, max_connections=7):
    """Link each name to the names mentioned shortly after it.

    Every ``*_dialog.txt`` file in ``folder_path`` (processed in sorted order)
    is scanned for whole-word occurrences of the given names. For each
    occurrence, the next ``window`` occurrences that are a different name are
    recorded as connections of that name.

    Args:
        folder_path: Directory containing the ``*_dialog.txt`` files.
        character_names_list: Names to look for. Duplicates are collapsed and
            empty or non-string entries are ignored.
        window: How many following name occurrences to consider (default 3).
        max_connections: Maximum number of distinct connections kept per name,
            in first-seen order (default 7). ``None`` keeps all of them.

    Returns:
        dict[str, list[str]]: One entry per name; names that never occur, or
        that have no neighbours, map to an empty list.

    Raises:
        ValueError: If ``window`` is negative.
        OSError: If the folder or one of the files cannot be read.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    names = [
        name
        for name in dict.fromkeys(character_names_list)
        if isinstance(name, str) and name
    ]

    # Dictionary to store names and their connections
    connections = {name: [] for name in names}
    if not names:
        # An empty alternation would match at every word boundary.
        return connections

    # One pattern that matches any of the names as a whole word
    names_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, names)) + r')\b')

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith("_dialog.txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # The pattern only ever yields names from the list, so no further
        # membership check is needed.
        found_names = names_pattern.findall(content)

        for i, name in enumerate(found_names):
            for neighbour in found_names[i + 1:i + 1 + window]:
                if neighbour != name:
                    connections[name].append(neighbour)

    # Remove duplicates (keeping first-seen order) and apply the per-name limit
    return {
        name: list(dict.fromkeys(conns))[:max_connections]
        for name, conns in connections.items()
    }

# Example usage: folder_path is rebound here to the name_dialogs subfolder
# that holds the *_dialog.txt files; the resulting connections are printed.
folder_path = "C:/Users/Lado/Google Drive/SocialGraphs/Untitled Folder/episode_transcripts_pt1/name_dialogs" # Replace with your folder path
connections_dict = find_connections(folder_path, character_names_list)
print(connections_dict)

{'Dwight': ['Andy', 'Todd', 'The', 'Gabe', 'Stanley', 'Jim', 'Kevin'], 'Jim': ['Gabe', 'Phyllis', 'Toby', 'Pam', 'Dwight', 'Todd', 'Stanley'], 'Pam': ['Dwight', 'Todd', 'Erin', 'Gabe', 'Michael', 'The', 'Andy'], 'Stanley': ['Gabe', 'Andy', 'Creed', 'Oscar', 'Meredith', 'The', 'Jim'], 'Kevin': ['The', 'Erin', 'Stanley', 'Jim', 'Ryan', 'Creed', 'Andy'], 'Angela': ['Kevin', 'Robert', 'Dwight', 'Oscar', 'Creed', 'Meredith', 'Val'], 'Meredith': ['The', 'Dan', 'Gabe', 'Pam', 'Kevin', 'Oscar', 'Angela'], 'Phyllis': ['Toby', 'Gabe', 'Andy', 'Dwight', 'Creed', 'Michael', 'Holly'], 'Creed': ['Michael', 'Darryl', 'Andy', 'Jim', 'Pam', 'Erin', 'The'], 'Oscar': ['Meredith', 'The', 'Kevin', 'Jim', 'Stanley', 'Robert', 'Erin'], 'Ryan': ['Kevin', 'Creed', 'Andy', 'The', 'Darryl', 'Dwight', 'Gabe'], 'Kelly': ['Jessica', 'The', 'Erin', 'Dwight', 'Kevin', 'Oscar', 'Pam'], 'Andy': ['Creed', 'Michael', 'Darryl', 'State', 'The', 'Dwight', 'Todd'], 'Michael': ['Darryl', 'Andy', 'State', 'The', 'Jim', 'Phylli

In [10]:
# Keep only the characters whose connection list is non-empty, then show the
# pruned dictionary.
connections_dict = dict(
    (character, linked)
    for character, linked in connections_dict.items()
    if linked
)
print(connections_dict)

{'Dwight': ['Andy', 'Todd', 'The', 'Gabe', 'Stanley', 'Jim', 'Kevin'], 'Jim': ['Gabe', 'Phyllis', 'Toby', 'Pam', 'Dwight', 'Todd', 'Stanley'], 'Pam': ['Dwight', 'Todd', 'Erin', 'Gabe', 'Michael', 'The', 'Andy'], 'Stanley': ['Gabe', 'Andy', 'Creed', 'Oscar', 'Meredith', 'The', 'Jim'], 'Kevin': ['The', 'Erin', 'Stanley', 'Jim', 'Ryan', 'Creed', 'Andy'], 'Angela': ['Kevin', 'Robert', 'Dwight', 'Oscar', 'Creed', 'Meredith', 'Val'], 'Meredith': ['The', 'Dan', 'Gabe', 'Pam', 'Kevin', 'Oscar', 'Angela'], 'Phyllis': ['Toby', 'Gabe', 'Andy', 'Dwight', 'Creed', 'Michael', 'Holly'], 'Creed': ['Michael', 'Darryl', 'Andy', 'Jim', 'Pam', 'Erin', 'The'], 'Oscar': ['Meredith', 'The', 'Kevin', 'Jim', 'Stanley', 'Robert', 'Erin'], 'Ryan': ['Kevin', 'Creed', 'Andy', 'The', 'Darryl', 'Dwight', 'Gabe'], 'Kelly': ['Jessica', 'The', 'Erin', 'Dwight', 'Kevin', 'Oscar', 'Pam'], 'Andy': ['Creed', 'Michael', 'Darryl', 'State', 'The', 'Dwight', 'Todd'], 'Michael': ['Darryl', 'Andy', 'State', 'The', 'Jim', 'Phylli