In [None]:
# Official data processing file used for the project. Reads from the "imdb_movies_subset.csv" dataset. 
# references used:
# CPSC 572 D2L shell Jupyter notebook files 
# https://www.cambridgespark.com/info/data-processing-with-pandas-dataframe
# https://www.geeksforgeeks.org/data-processing-with-pandas/

In [121]:
import pandas as pd
from itertools import combinations

# Reset display options to default values
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

In [122]:
#to adjust the auto scroll in console

%%javascript

window.scroll_flag = true
window.scroll_exit = false
window.scroll_delay = 100

$(".output_scroll").each(function() {
    $(this)[0].scrollTop = $(this)[0].scrollHeight;
});

function callScrollToBottom() {
    setTimeout(scrollToBottom, window.scroll_delay);
}

function scrollToBottom() {
    if (window.scroll_exit) {
        return;
    }
    if (!window.scroll_flag) {
        callScrollToBottom();
        return;
    };
    
    $(".output_scroll").each(function() {
        if (!$(this).attr('scroll_checkbox')){
            window.scroll_flag = true;
            $(this).attr('scroll_checkbox',true);
            var div = document.createElement('div');
            var checkbox = document.createElement('input');
            checkbox.type = "checkbox";
            checkbox.onclick = function(){window.scroll_flag = checkbox.checked}
            checkbox.checked = "checked"
            div.append("Auto-Scroll-To-Bottom: ");
            div.append(checkbox);
            $(this).parent().before(div);
        }
        
        $(this)[0].scrollTop = $(this)[0].scrollHeight;
    });
    callScrollToBottom();
}
scrollToBottom();

SyntaxError: invalid syntax (2335578460.py, line 9)

In [123]:
#TURN EACH MOVIE INTO AN ARRAY OF CREW NAMES

# Read the CSV file into DataFrame
movies = pd.read_csv("imdb_movies_subset.csv")

# turn each crew string into array of separate names
movies['crew'] = [str(crew).split(', ') if isinstance(crew, (list, str)) and not pd.isnull(crew) and crew.strip() != "" else [] for crew in movies['crew']]

# Extract only the even-numbered indexes for each array
movies['crew'] = [[names[i] for i in range(len(names)) if i % 2 == 0] for names in movies['crew']]

print(len(movies))
print(movies['crew'])

3000
0       [Isabella Castillo, Alan Estrada, Cecilia Tous...
1       [Leonardo DiCaprio, Jesse Plemons, Lily Gladst...
2       [Taraji P. Henson, Kristen Bell, Christian Con...
3       [Jason Statham, Dolph Lundgren, 50 Cent, Sylve...
4       [Taissa Farmiga, Bonnie Aarons, Storm Reid, Jo...
5       [Denzel Washington, Dakota Fanning, Gaia Scode...
6       [Xolo Mariduena, Susan Sarandon, Bruna Marquez...
7       [Jason Statham, Shuya Sophia Cai, Cliff Curtis...
8       [Nicolas Cantu, Shamon Brown Jr., Micah Abbey,...
9       [Aida Roa, Antonia Olivares, Nailea Norvind, F...
10      [Margot Robbie, Ryan Gosling, America Ferrera,...
11      [Cillian Murphy, Emily Blunt, Matt Damon, Robe...
12      [Tom Cruise, Rebecca Ferguson, Simon Pegg, Vin...
13      [Patrick Wilson, Ty Simpkins, Rose Byrne, Lin ...
14      [Anouck Hautbois, Benjamin Bollen, Antoine Tom...
15      [Harrison Ford, Phoebe Waller-Bridge, Mads Mik...
16      [Lana Condor, Toni Collette, Annie Murphy, Sam...
17      [

In [124]:
# CREATE EDGES INTO A DF FOR FIRST 700 MOVIES

# Initialize a list to store the edges
edges = []

# Iterate over each array in the names_array. first 700 movies
for names_list in movies['crew'][0:700]:
    # Generate all possible combinations of pairs of names
    name_pairs = list(combinations(names_list, 2))
    
    # Append each pair as a dictionary to the edges list
    for source, target in name_pairs:
        edges.append({'source': source, 'target': target, 'weight': 1})

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(edges)

# Print the resulting DataFrame
print(len(df))
print(df)


20674
                                    source  \
0                        Isabella Castillo   
1                        Isabella Castillo   
2                        Isabella Castillo   
3                        Isabella Castillo   
4                        Isabella Castillo   
5                        Isabella Castillo   
6                        Isabella Castillo   
7                        Isabella Castillo   
8                             Alan Estrada   
9                             Alan Estrada   
10                            Alan Estrada   
11                            Alan Estrada   
12                            Alan Estrada   
13                            Alan Estrada   
14                            Alan Estrada   
15                       Cecilia Toussaint   
16                       Cecilia Toussaint   
17                       Cecilia Toussaint   
18                       Cecilia Toussaint   
19                       Cecilia Toussaint   
20                       Cec

In [125]:
# looking at names that appear many times
source_counts = df['source'].value_counts()

# Filter the DataFrame to include only entries where the source appears more than 40 times
filtered_df = df[df['source'].isin(source_counts.index[source_counts > 40])]
print(filtered_df)

filtered_df = df[(df['source'] == 'Bruce Willis') | (df['target'] == 'Bruce Willis')]

# Print the filtered DataFrame
print(filtered_df)

             source               target  weight
4974   Bruce Willis       Mustafa Shakir       1
4975   Bruce Willis      Dominic Purcell       1
4976   Bruce Willis     Fernanda Andrade       1
4977   Bruce Willis     Barry Jay Minoff       1
4978   Bruce Willis      Eugenia Kuzmina       1
4979   Bruce Willis     Hannah Quinlivan       1
10193  Bruce Willis          Jack Kilmer       1
10194  Bruce Willis        Lochlyn Munro       1
10195  Bruce Willis     Jimmy Jean-Louis       1
10196  Bruce Willis       Willow Shields       1
10197  Bruce Willis           Dina Meyer       1
10198  Bruce Willis    Timothy V. Murphy       1
10199  Bruce Willis    Lorenzo Antonucci       1
10200  Bruce Willis           Joe Munroe       1
11234  Bruce Willis         Blake Jenner       1
11235  Bruce Willis        Stephen Dorff       1
11236  Bruce Willis          Corey Large       1
11237  Bruce Willis  Branscombe Richmond       1
11238  Bruce Willis    Lorenzo Antonucci       1
11239  Bruce Willis 

In [167]:
# Group the DataFrame by 'source' and 'target' columns and filter for groups with more than one occurrence
duplicate_groups = df.groupby(['source', 'target']).filter(lambda x: len(x) >= 2)

# Sort the duplicate groups by 'source' and 'target' columns
duplicate_groups = duplicate_groups.sort_values(by=['source', 'target'])

# Print the duplicate entries
print(duplicate_groups)
print(f"______________________")
print(df)


                           source                      target  weight
1487                                                                1
1488                                                                1
1489                                                                1
18401                 Aldis Hodge               Noah Centineo       1
18938                 Aldis Hodge               Noah Centineo       1
18407                 Aldis Hodge              Pierce Brosnan       1
18940                 Aldis Hodge              Pierce Brosnan       1
18403                 Aldis Hodge          Quintessa Swindell       1
18939                 Aldis Hodge          Quintessa Swindell       1
6118                America Young                   Greg Chun       1
17837               America Young                   Greg Chun       1
6121                America Young                  Lisa Fuson       1
17841               America Young                  Lisa Fuson       1
4925                

                                    source  \
0                        Isabella Castillo   
1                        Isabella Castillo   
2                        Isabella Castillo   
3                        Isabella Castillo   
4                        Isabella Castillo   
5                        Isabella Castillo   
6                        Isabella Castillo   
7                        Isabella Castillo   
8                             Alan Estrada   
9                             Alan Estrada   
10                            Alan Estrada   
11                            Alan Estrada   
12                            Alan Estrada   
13                            Alan Estrada   
14                            Alan Estrada   
15                       Cecilia Toussaint   
16                       Cecilia Toussaint   
17                       Cecilia Toussaint   
18                       Cecilia Toussaint   
19                       Cecilia Toussaint   
20                       Cecilia T

In [168]:
# Create copy
edges_weighted = df.copy()

# Identify duplicate rows
duplicates = edges_weighted.duplicated(subset=['source', 'target'], keep=False)

# Update weights for duplicate rows in the copy
edges_weighted['weight'] = edges_weighted.groupby(['source', 'target'])['weight'].transform('sum')

# Keep only one instance of each duplicate in the copy
edges_weighted = edges_weighted[~duplicates | edges_weighted.duplicated(subset=['source', 'target'], keep='first')]

# Reset index in the copy
edges_weighted = edges_weighted.reset_index(drop=True)

print("\nModified DataFrame (edges_weighted):")
print(len(edges_weighted))


Modified DataFrame (edges_weighted):
20496


In [172]:
# looking at edges with weight more than 1
filtered_df = edges_weighted[edges_weighted['weight'] > 1]
print(filtered_df)

                           source                      target  weight
1422                    Wang Yibo                                   3
1423                    Wang Yibo                                   3
1430                           Yu                                   3
1431                           Yu                                   3
1437                        Zhang                                   3
1438                        Zhang                                   3
1443                  Zhou Dongyu                                   3
1444                  Zhou Dongyu                                   3
1448                  Xu Kaicheng                                   3
1449                  Xu Kaicheng                                   3
1452                        Bu Yu                                   3
1453                        Bu Yu                                   3
1455                      Yosh Yu                                   3
1456                

In [174]:
edges_weighted[0:50]
edges_weighted.to_csv('edges_weighted2.csv', index=False)

In [204]:
#making nodes list
movies['crew'][0:700]

node_df = movies['crew'][0:700].astype(str).str.split(', ')

# Using list comprehension to flatten the array of arrays into a single list
flattened_array = [name.replace('[', '').replace(']', '').replace("'", "") for inner_arr in node_df for name in inner_arr]

#print(flattened_array)

# Initialize an empty list to store unique values
unique_values = []

# Iterate over each element in the flattened array
for element in flattened_array:
    # If the element is not already in the unique_values list, add it
    if element not in unique_values:
        unique_values.append(element)

# Print the unique values
print(len(unique_values))


4958


In [207]:
nodes = pd.DataFrame(unique_values)
nodes.to_csv('nodes2.csv', index=True)