#### Assignment Prompt

Identify a large 2-node network dataset—you can start with a dataset in a repository.  Your data should meet the criteria that it consists of ties between and not within two (or more) distinct groups.
Reduce the size of the network using a method such as the island method described in chapter 4 of social network analysis.
What can you infer about each of the distinct groups?

In [14]:
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import pandas as pd
import time

#### Loading Ratings and Movies Dataset
* Originally exported from TMDB API and uploaded to [Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)
* It is a listing of movies within the TMDB database along with relevant metadata including basic informational items on the films as well as an average score

In [2]:
ratings = pd.read_csv('/Users/JoshForster/Desktop/Masters_Data_Sci/Data620/archive-2/ratings_small.csv')
print(ratings.shape)
ratings.head()

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
links = pd.read_csv('/Users/JoshForster/Desktop/Masters_Data_Sci/Data620/archive-2/links.csv')
links['tmdbId'] = links['tmdbId'].apply(lambda x: str(x).split('.')[0]).astype(str)
print(links.shape)
links.head()

(45843, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [4]:
links.dtypes

movieId     int64
imdbId      int64
tmdbId     object
dtype: object

In [5]:
movies = pd.read_csv('/Users/JoshForster/Desktop/Masters_Data_Sci/Data620/archive-2/movies_metadata.csv')
print(movies.shape)
movies.head()

(45466, 24)


  movies = pd.read_csv('/Users/JoshForster/Desktop/Masters_Data_Sci/Data620/archive-2/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [7]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
test = pd.merge(ratings,links,how='inner',left_on='movieId',right_on='movieId')
test.shape

In [8]:
subset_movies = movies[['id','imdb_id','original_title','revenue','runtime','vote_count','vote_average']]
movie_ratings = pd.merge(pd.merge(ratings,links,how='inner',left_on='movieId',right_on='movieId'),\
                         subset_movies,how='inner',left_on='tmdbId',right_on='id')
print(movie_ratings.shape)
movie_ratings.head()

(99850, 13)


Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,id,imdb_id,original_title,revenue,runtime,vote_count,vote_average
0,1,31,2.5,1260759144,112792,9909,9909,tt0112792,Dangerous Minds,180000000.0,99.0,249.0,6.4
1,7,31,3.0,851868750,112792,9909,9909,tt0112792,Dangerous Minds,180000000.0,99.0,249.0,6.4
2,31,31,4.0,1273541953,112792,9909,9909,tt0112792,Dangerous Minds,180000000.0,99.0,249.0,6.4
3,32,31,4.0,834828440,112792,9909,9909,tt0112792,Dangerous Minds,180000000.0,99.0,249.0,6.4
4,36,31,3.0,847057202,112792,9909,9909,tt0112792,Dangerous Minds,180000000.0,99.0,249.0,6.4


In [9]:
movie_ratings.isnull().sum()

userId            0
movieId           0
rating            0
timestamp         0
imdbId            0
tmdbId            0
id                0
imdb_id           0
original_title    0
revenue           0
runtime           0
vote_count        0
vote_average      0
dtype: int64

In [None]:
small = pd.merge(movie_ratings,movie_ratings.groupby(['original_title'])['id'].count().sort_values(ascending=False).head(3).reset_index(),how='inner',on='original_title',suffixes=['','_right'])
small.head()

In [None]:
small.shape

Only the movies/ratings in common are included for purposes of this analysis given the focus on bipartite data with ties between groups

In [10]:
pos_mov = [(1,idx) for idx,x in enumerate(movie_ratings['id'].values)]
pos_rating = [(2,idx) for idx,x in enumerate(movie_ratings['movieId'].values)]

In [11]:
edges_prep = list(zip(movie_ratings.userId,movie_ratings.original_title,movie_ratings.rating))

In [12]:
#https://stackoverflow.com/questions/35472402/how-do-display-bipartite-graphs-with-python-networkx-package
B = nx.Graph()
B.add_nodes_from(movie_ratings['original_title'].values, bipartite=0)
B.add_nodes_from(movie_ratings['userId'].values, bipartite=1)
B.add_weighted_edges_from(edges_prep)

top,bottom = nx.bipartite.sets(B)
#pos = nx.bipartite_layout(B, top)
#nx.draw(B, with_labels=True)
#plt.show()

In [13]:
nx.is_connected(B)

True

In [None]:
#takes long time to run
nx.draw_networkx(B,with_labels=True)

As we can see from this graph there is a massive hairball visible with a dense cluster of connections that exist for this dataset and incorporating some type of filtering is a must to be able to do any real analysis or identify discernable patterns.

In [None]:
name_nodes, event_nodes = bipartite.sets(B)
movie_n = set(n for n,d in B.nodes(data=True) if d['bipartite']==0)
name_nodes = set(B) - event_nodes
name_graph = bipartite.weighted_projected_graph(B, name_nodes)

In [None]:
#cc_sub = nx.connected_component_subgraphs(B)
check_cc = [len(c) for c in nx.connected_component_subgraphs(B)]
check_cc

In [None]:
bipartite.weighted_projected_graph()

In [20]:
print('test')

test


In [15]:
#cannot get this to run with weights but that is needed for standard island method
start = time.time()
weighted_graph=bipartite.weighted_projected_graph(B,list(movie_ratings['userId'].values))
total_time = time.time() - start

In [23]:
print('Nodes:',weighted_graph.number_of_nodes(),r'; Edges:',weighted_graph.number_of_edges())

Nodes: 8786 ; Edges: 10602894


In [None]:
bottom_nodes, top_nodes = bipartite.sets(B)
G = bipartite.projected_graph(B, bottom_nodes)

In [None]:
site_graph = bi.weighted_projected_graph(s, list(sites_clean.Site.values), ratio=False)
In [125]:
site_graph = list(nx.connected_component_subgraphs(site_graph))[0]

In [None]:
cc_sub = nx.connected_component_subgraphs(G)
check_cc = [len(c) for c in nx.connected_component_subgraphs(G)]
check_cc

In [None]:
for idx,node in enumerate(G.nodes()):
    if idx<=4:
        print(G.nodes[node])

In [None]:
for idx,f, to, edata in enumerate(B.edges(data=True)):
    if print(edata)

In [26]:
def trim_edges(g, weight = 1):
    edges = []
    g2 = nx.Graph()
    for f, to, edata in g.edges(data=True):
        if edata['weight']> weight:
            x = float(edata['weight'])
            edges.append((f,to,x))
    g2.add_weighted_edges_from(edges)
    return g2


def island_method(g, iterations=5):
    '''
    Social Network Analysis Textbook Method for isolating 
    biggest components of a network  by using centrality criteria to show most dense clusters'''
    weights = [edata['weight'] for f, to, edata in g.edges(data=True)]
    
    #size of step
    mn=int(min(weights))
    mx=int(max(weights))
    step = int((mx-mn)/iterations)
    
    return [[threshold, trim_edges(g, threshold)] for threshold in range(mn, mx, step)]

In [None]:
nx.c

In [18]:
#missing weight component at the moment which is why weighted projected graph is needed
movie_connect_comp = nx.connected_component(weighted_graph)
movie_islands = island_method(movie_connect_comp)

AttributeError: module networkx has no attribute connected_component_subgraphs

In [20]:
#https://stackoverflow.com/questions/61154740/attributeerror-module-networkx-has-no-attribute-connected-component-subgraph
cc = (weighted_graph.subgraph(c) for c in nx.connected_components(weighted_graph))

In [27]:
movie_islands = island_method(weighted_graph)

In [30]:
for i in movie_islands:
    print(i[0], nx.number_of_nodes(i[1]), nx.number_of_edges(i[1]), nx.number_connected_components(i[1]))

1 671 174039 1
163 124 1204 1
325 37 166 1
487 17 37 1
649 7 7 1
811 2 1 1
