In [1]:
# Import necessary packages
import os
from urllib.parse import quote_plus

import gradio as gr
import networkx as nx
import pandas as pd
import plotly.graph_objs as go
from sqlalchemy import create_engine

In [6]:
# Connect to RDS
# Connection settings are read from the environment so that no credential is
# stored in the notebook. Non-secret settings fall back to this project's defaults.
# NOTE(review): the password that used to be committed here should be rotated.
db_username = os.environ.get('DB_USERNAME', 'admin')
db_password = os.environ.get('DB_PASSWORD')
db_host = os.environ.get('DB_HOST', 'groupdb.clvyneobhqj2.us-east-1.rds.amazonaws.com')
db_name = os.environ.get('DB_NAME', 'IMDB_NORMALIZED')

if not db_password:
    # Fail fast with an actionable message instead of a driver-level auth error.
    raise RuntimeError("Set the DB_PASSWORD environment variable before connecting to RDS.")

# Percent-encode the credentials so characters such as '@', ':' or '/' cannot
# be misread as URL delimiters.
engine = create_engine(
    f"mysql+mysqlconnector://{quote_plus(db_username)}:{quote_plus(db_password)}@{db_host}/{db_name}"
)

In [11]:
# Read data from RDS
# Pull every row of the RelationshipFact table into a DataFrame and preview it.
query = "SELECT * FROM RelationshipFact"
RelationshipFact = pd.read_sql(sql=query, con=engine)
RelationshipFact.head()

Unnamed: 0,relationshipId,person1_id,person1_professionId,person2_id,person2_professionId,movieCount,averageRating,averageNumVotes
0,1,nm0000001,18,nm0000328,18,1,7.0,47189.0
1,2,nm0000001,18,nm0000328,28,1,7.0,47189.0
2,3,nm0000001,18,nm0000328,33,1,7.0,47189.0
3,4,nm0000001,18,nm0000740,3,1,7.0,47189.0
4,5,nm0000001,18,nm0000740,25,1,7.0,47189.0


In [13]:
# Pull every row of the Profession lookup table (professionId -> profession name)
# into a DataFrame and preview it.
query = "SELECT * FROM Profession"
profession = pd.read_sql(sql=query, con=engine)
profession.head()

Unnamed: 0,professionId,profession
0,0,editor
1,1,choreographer
2,2,production_department
3,3,writer
4,4,manager


In [14]:
# Pull every row of the Person table (personId, primaryName, birth/death year)
# into a DataFrame and preview it.
query = "SELECT * FROM Person"
Person = pd.read_sql(sql=query, con=engine)
Person.head()

Unnamed: 0,personId,primaryName,birthYear,deathYear
0,nm0000001,Fred Astaire,1899,1987
1,nm0000002,Lauren Bacall,1924,2014
2,nm0000003,Brigitte Bardot,1934,\N
3,nm0000004,John Belushi,1949,1982
4,nm0000005,Ingmar Bergman,1918,2007


In [15]:
# Pull every row of the PersonProfession link table (personId <-> professionId)
# into a DataFrame and preview it.
query = "SELECT * FROM PersonProfession"
PersonProfession = pd.read_sql(sql=query, con=engine)
PersonProfession.head()

Unnamed: 0,personId,professionId
0,nm0000083,0
1,nm0000180,0
2,nm0000419,0
3,nm0000425,0
4,nm0000774,0


In [25]:
def collaboration(Name, Profession_1, Profession_2):
    """
    Purpose: Find collaborations between a specific person and others in the
    IMDb dataset based on professions.

    Inputs:
    - Name: Name of the primary person (string).
    - Profession_1: Profession the primary person must hold (string).
    - Profession_2: Profession the collaborators must hold (string).

    Process: Looks up both profession IDs in ``profession``, resolves the
    primary person's ID from ``Person``/``PersonProfession`` (the first match is
    used when several people share the name), then collects that person's rows
    from ``RelationshipFact`` whose second person holds ``Profession_2``.

    Output: DataFrame with the columns ``person2_name``, ``movieCount``,
    ``averageRating`` and ``averageNumVotes``; one row per collaborator name,
    each metric being the mean over that collaborator's matching rows. The
    frame is empty (same columns) when a profession or the person is unknown,
    or when there are no matching collaborations.
    """
    result_columns = ['person2_name', 'movieCount', 'averageRating', 'averageNumVotes']

    def _profession_id(label):
        # Return the professionId for a profession name, or None if it is unknown.
        matches = profession.loc[profession['profession'] == label, 'professionId']
        return None if matches.empty else matches.iloc[0]

    primary_profession_id = _profession_id(Profession_1)
    secondary_profession_id = _profession_id(Profession_2)
    if primary_profession_id is None or secondary_profession_id is None:
        # Professions are free-text user input; an unknown one means "no data",
        # not an IndexError.
        return pd.DataFrame(columns=result_columns)

    # Filter both sides before merging so the join only touches candidate rows
    # instead of the full Person x PersonProfession product.
    named_people = Person[Person['primaryName'] == Name]
    primary_holders = PersonProfession[PersonProfession['professionId'] == primary_profession_id]
    candidates = named_people.merge(primary_holders, on='personId')
    if candidates.empty:
        return pd.DataFrame(columns=result_columns)
    primary_person_id = candidates['personId'].iloc[0]

    # NOTE(review): the secondary profession is checked against PersonProfession
    # (does person2 hold it at all), not against RelationshipFact's own
    # person2_professionId column - confirm this is the intended semantics.
    own_relationships = RelationshipFact[RelationshipFact['person1_id'] == primary_person_id]
    secondary_holders = PersonProfession[PersonProfession['professionId'] == secondary_profession_id]
    collaborations = own_relationships.merge(
        secondary_holders, left_on='person2_id', right_on='personId'
    )[['person2_id', 'movieCount', 'averageRating', 'averageNumVotes']]

    # Attach the collaborator's display name.
    collaborations = collaborations.merge(
        Person, left_on='person2_id', right_on='personId', how='left'
    ).rename(columns={'primaryName': 'person2_name'})[result_columns]

    # Collaborators are grouped by name, so distinct people who share a name
    # are averaged together.
    aggregated_collaborations = collaborations.groupby('person2_name').agg({
        'movieCount': 'mean',
        'averageRating': 'mean',
        'averageNumVotes': 'mean'
    }).reset_index()
    return aggregated_collaborations


def rank_values_ascending(data, interest):
    """Replace column ``interest`` of ``data`` with dense ascending ranks.

    The smallest distinct value becomes 1, the next distinct value 2, and so
    on; equal values share a rank. The column is overwritten in place and the
    same ``data`` object is returned.
    """
    column = data[interest]
    # Map each distinct value to its 1-based position in sorted order.
    rank_of = {}
    for position, value in enumerate(sorted(set(column)), start=1):
        rank_of[value] = position
    data[interest] = [rank_of[value] for value in column]
    return data

def create_interactive_network(Name, Profession_1, Profession_2, interest):
    """
    Purpose: Create an interactive network graph using Plotly based on
    collaboration data.

    The person given by ``Name`` is drawn as a red central node; every
    collaborator returned by ``collaboration`` is drawn as a blue node joined
    to the centre by one edge. A collaborator's marker size is ten times the
    dense ascending rank of its ``interest`` value, so larger values give
    larger nodes.

    Inputs:
    - Name: Central node name (string).
    - Profession_1: First profession (string).
    - Profession_2: Second profession (string).
    - interest: Column to determine node size (string).

    Output: Plotly figure of the network graph, or the string
    "No data found for the given inputs." when there is no collaboration data.
    """
    data = collaboration(Name, Profession_1, Profession_2)
    if data.empty:
        return "No data found for the given inputs."
    data = rank_values_ascending(data, interest)

    G = nx.Graph()
    G.add_node(Name, size=20, color='red')  # Central node color
    for collaborator, rank in zip(data['person2_name'], data[interest]):
        G.add_node(collaborator, size=rank * 10, color='blue')
        G.add_edge(Name, collaborator)

    # A fixed seed makes the layout reproducible: the same query always
    # produces the same picture.
    pos = nx.spring_layout(G, seed=42)

    # Collect coordinates in plain lists and build each trace once, rather than
    # repeatedly concatenating tuples onto validated trace properties.
    edge_x, edge_y = [], []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        # The trailing None breaks the line so edges are not chained together.
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'),
                            hoverinfo='none', mode='lines')

    node_x, node_y, node_text, node_size, node_color = [], [], [], [], []
    for node, attrs in G.nodes(data=True):
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_text.append(f'{node}')
        node_size.append(attrs['size'])
        node_color.append(attrs['color'])

    node_trace = go.Scatter(x=node_x, y=node_y, text=node_text, mode='markers+text', hoverinfo='text',
                            marker=dict(showscale=False, colorscale='YlGnBu', size=node_size,
                                        color=node_color, opacity=0.8, line_width=2))

    # The title font is set through layout.title.font; the old `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(title=dict(text=f'Network graph with {Name} as the central node',
                                                font=dict(size=16)),
                                     showlegend=False, hovermode='closest',
                                     margin=dict(b=20, l=5, r=5, t=40),
                                     xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                     yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
    return fig
# Columns of the collaboration data that may be chosen as the 'interest'
# parameter (the metric that drives node size).
interest_options = [
    'movieCount',
    'averageRating',
    'averageNumVotes',
]

# Create a Gradio interface
# Interface for generating a network graph based on IMDb collaborations.
def gradio_network_graph(Name, Profession_1, Profession_2, interest):
    """Gradio callback: build the collaboration network figure for the form inputs.

    Inputs mirror ``create_interactive_network``. Returns the Plotly figure to
    render in the Plot output.

    Raises:
        gr.Error: when no interest column has been selected, or when
            ``create_interactive_network`` reports that no data was found. The
            Plot output cannot render a plain string, so the message is shown
            to the user as an error instead.
    """
    if interest not in interest_options:
        # The dropdown passes None until the user picks a value.
        raise gr.Error("Please select an interest column.")

    fig = create_interactive_network(Name, Profession_1, Profession_2, interest)
    if isinstance(fig, str):
        # create_interactive_network signals "no data" with a message string.
        raise gr.Error(fig)
    return fig

# Assemble the Gradio form: three free-text fields and one dropdown are passed,
# in this order, to gradio_network_graph; its result is shown in a Plot component.
iface = gr.Interface(
    title="Network Graph Generator",
    description="Enter the central node name, two professions, and select an interest column to generate a network graph.",
    fn=gradio_network_graph,
    inputs=[
        gr.Textbox(label="Central Node Name"),  # -> Name
        gr.Textbox(label="Profession 1"),  # -> Profession_1
        gr.Textbox(label="Profession 2"),  # -> Profession_2
        gr.Dropdown(label="Interest", choices=interest_options),  # -> interest
    ],
    outputs=gr.Plot(label="Network Graph"),
)

# Run the interface
# share=True requests a temporary public URL in addition to the local one.
iface.launch(share=True)


Running on local URL:  http://127.0.0.1:7864
Running on public URL: https://eac19e5ff9245fc94b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


