<a href="https://colab.research.google.com/github/jordanco-bgu/social_network_movies/blob/main/Social_network_attributes_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Social Network Attributes Dataset

This notebook parses and defines attributes of the social networks created by Dima Kagan, Thomas Chesney, and Michael Fire in their study "Using data science to understand the film industry's gender gap." The data was obtained from Kaggle as .json files, where each file represents the social network of a single movie.

Notes on how the social networks were constructed:
* Links between characters were created if they appeared in the movie within a time interval of less than 60 seconds.
* For each such co-appearance, the weight of the edge between the two characters was increased by 1.
* All edges with a weight lower than 3 were filtered out to account for possible mistakes.

In [None]:
import json
import os

!pip install kaggle

api_token = {"username":"","key":""}

!mkdir /root/.kaggle
with open('/root/.kaggle/kaggle.json', 'w') as file:
  json.dump(api_token, file)
!chmod 600 /root/.kaggle/kaggle.json



In [None]:
!mkdir ./datasets
!mkdir ./datasets/movie-dynamics-networks

!kaggle datasets download michaelfire/movie-dynamics-over-15000-movie-social-networks -p ./datasets/movie-dynamics-networks
!unzip ./datasets/movie-dynamics-networks/*.zip  -d ./datasets/movie-dynamics-networks/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cub.actors.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cub.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cuban_Fury.actors.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cuban_Fury.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cut_Bank.actors.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cut_Bank.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cut_Snake.actors.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cut_Snake.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cymbeline.actors.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Cymbeline.json  
  inflating: ./datasets/movie-dynamics-networks/moviedynamics/2014_Da_Sw

In [None]:
def get_network_strength(edges):
  """Return the sum of the 'weight' values over all edges.

  Args:
    edges: iterable of mappings, each providing a 'weight' entry.

  Returns:
    The accumulated weight; 0 when `edges` is empty.
  """
  return sum(link['weight'] for link in edges)

def get_edge_with_max_weight(edges):
  """Find the heaviest edge in a list of edges.

  Args:
    edges: iterable of mappings, each providing 'weight', 'source' and
      'target' entries.

  Returns:
    A tuple (weight, source, target) describing the first edge that carries
    the largest weight. Only weights greater than 0 are considered; when no
    such edge exists (e.g. `edges` is empty) the result is (0, None, None).
  """
  max_weight = 0
  max_edge_source = None
  # Use None, like the source, as the "no edge found" marker. A placeholder
  # of 0 would look like a real node id and mix ints into a name column.
  max_edge_target = None

  for edge in edges:
    weight = edge['weight']
    # Strict comparison: among equally heavy edges the first one wins.
    if weight > max_weight:
      max_weight = weight
      max_edge_source = edge['source']
      max_edge_target = edge['target']

  return max_weight, max_edge_source, max_edge_target

In [None]:
def create_edges_list(network_file):
  """Turn the links of a parsed network file into (source, target) pairs.

  Args:
    network_file: mapping with a 'links' entry; each link provides 'source'
      and 'target' entries.

  Returns:
    A list of (source, target) tuples, one per link, in link order.
  """
  return [(link['source'], link['target']) for link in network_file['links']]

In [None]:
import networkx as nx

def get_connected_components(graph):
  """Return the number of connected components of `graph`, as computed by networkx."""
  component_count = nx.number_connected_components(graph)
  return component_count

def get_main_characters(graph, centrality_threshold=0.6):
  """Count the characters whose degree centrality exceeds a threshold.

  Args:
    graph: networkx graph whose nodes are the characters.
    centrality_threshold: a character counts as a main character when its
      degree centrality is strictly greater than this value. Defaults to 0.6.

  Returns:
    The number of nodes above the threshold (0 for an empty graph).
  """
  centrality_by_character = nx.degree_centrality(graph)
  # Only the count is needed, so there is no reason to sort the scores or to
  # collect the matching names.
  return sum(
      1 for score in centrality_by_character.values()
      if score > centrality_threshold
  )

def get_node_with_max_degree(graph):
  """Find the node with the highest degree.

  Args:
    graph: object whose `degree()` yields (node, degree) pairs.

  Returns:
    A tuple (degree, node) for the first node holding the largest degree.
    Only degrees greater than 0 are considered; if there is none the result
    is (0, None).
  """
  best_degree, best_node = 0, None

  for node, degree in graph.degree():
    # Strict comparison: on ties the node seen first is kept.
    if degree > best_degree:
      best_degree, best_node = degree, node

  return best_degree, best_node

def run_graph_algos(movie_network_file):
  """Build the movie's graph and compute its graph-level metrics.

  The graph is built from the (source, target) pairs of the file's links
  only, so a character that appears in no link is not a node of the graph.

  Args:
    movie_network_file: parsed network file (mapping with a 'links' entry).

  Returns:
    A tuple (connected components, number of main characters,
    highest degree, node with the highest degree).
  """
  movie_graph = nx.Graph(create_edges_list(movie_network_file))

  component_count = get_connected_components(movie_graph)
  main_character_count = get_main_characters(movie_graph)
  top_degree, top_degree_node = get_node_with_max_degree(movie_graph)

  return component_count, main_character_count, top_degree, top_degree_node

In [None]:
def get_attributes_from_json(filename):
  """Load one movie network file and derive its attribute record.

  Args:
    filename: path of a JSON file with 'graph' (holding 'movie_name' and
      'movie_year'), 'nodes' and 'links' entries.

  Returns:
    A dict with the movie's name and year, the number of characters and
    connections, the graph metrics from run_graph_algos, the total edge
    weight and the heaviest edge. Keys are inserted in a fixed order, which
    becomes the column order when the records are put in a DataFrame.
  """
  # Read as UTF-8 explicitly so character names decode the same way on every
  # platform instead of depending on the locale's default encoding.
  with open(filename, 'r', encoding='utf-8') as json_file:
    movie_network_file = json.load(json_file)

  graph_info = movie_network_file['graph']
  links = movie_network_file['links']

  attributes = {
    'movie_name': graph_info['movie_name'],
    'year': graph_info['movie_year'],
    'num_characters': len(movie_network_file['nodes']),
    'num_connections': len(links),
  }

  (attributes['num_connected_components'],
   attributes['num_main_characters'],
   attributes['max_degree_node_value'],
   attributes['max_degree_node']) = run_graph_algos(movie_network_file)

  attributes['network_strength'] = get_network_strength(links)
  (attributes['max_weight_edge_value'],
   attributes['max_weight_edge_source'],
   attributes['max_weight_edge_target']) = get_edge_with_max_weight(links)

  return attributes

In [None]:
import glob
import pandas as pd

# Every path in the extracted folder except the '.actors.' companion files is
# treated as a movie network. glob returns paths in arbitrary order, so sort
# them to make the row order of the resulting table reproducible.
files_list = sorted(
    path
    for path in glob.glob('./datasets/movie-dynamics-networks/moviedynamics/*')
    if '.actors.' not in path
)

# One attribute record per movie, turned into a DataFrame in a single step.
social_network_data = [get_attributes_from_json(json_file) for json_file in files_list]

social_network_df = pd.DataFrame(social_network_data)
social_network_df

Unnamed: 0,movie_name,year,num_characters,num_connections,num_connected_components,num_main_characters,max_degree_node_value,max_degree_node,network_strength,max_weight_edge_value,max_weight_edge_source,max_weight_edge_target
0,Inherent Vice,2014,35,154,1,3,32,"Larry ""Doc"" Sportello",2363,220,"Larry ""Doc"" Sportello",Sortilège
1,The Ladies Man,1961,8,12,1,1,7,Herbert H. Heebert - Mama Heebert,173,54,Herbert H. Heebert - Mama Heebert,Katie
2,Salmon Fishing in the Yemen,2011,10,27,1,6,9,Dr. Alfred Jones,267,39,Dr. Alfred Jones,Bernard Sugden
3,American Samurai,1992,5,6,1,2,4,Andrew 'Drew' Collins,50,17,Kenjiro Sanga,Andrew 'Drew' Collins
4,A Prayer for the Dying,1987,15,27,1,2,11,Jack Meehan,224,42,Jack Meehan,Martin Fallon
...,...,...,...,...,...,...,...,...,...,...,...,...
15533,Bottle Shock,2008,18,47,1,1,14,Bo Barrett,410,46,Bo Barrett,Upscale Man
15534,Mankatha,2011,8,14,1,3,5,Ganesh,87,15,Ganesh,Mahat
15535,Righteous Kill,2008,17,46,1,1,15,Turk,570,53,Detective Simon Perez,Turk
15536,Angoor,1982,9,19,1,3,8,Bahadur,177,38,Bahadur,Ashok R. Rilak


In [None]:
# Write the attribute table (including its index column) to a CSV file.

social_network_df.to_csv(path_or_buf='./datasets/social_network_attributes_dataset.csv')