# LinkedIn Connection Discovery
---

### Import Libraries

In [None]:
import pandas as pd 
import numpy as np
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from xhtml2pdf import pisa
from bs4 import BeautifulSoup
from pyvis import network as net
import networkx as nx

### Define functions

In [None]:
def convert_nonenglish_text(text):
    """Return the English name for a known non-English company name.

    Only the two bank names listed below are recognised; any other value is
    handed back unchanged.
    """
    known_names = (
        ('香港匯豐銀行', 'HSBC'),
        ('中國銀行（香港）有限公司', 'Bank of China'),
    )
    for native_name, english_name in known_names:
        if text == native_name:
            return english_name
    return text

def get_lang_detector(nlp, name):
    """Return a new ``LanguageDetector`` instance; ``nlp`` and ``name`` are ignored.

    NOTE(review): ``LanguageDetector`` is not imported or defined in the
    visible part of this file, so calling this raises ``NameError`` unless the
    name is provided elsewhere. The ``(nlp, name)`` signature looks like a
    spaCy component factory - TODO confirm.
    """
    return LanguageDetector()

def strip_empty_lines(text):
    """Return ``text`` with empty and whitespace-only lines removed.

    The text is split on newline characters. Every line that is kept is
    written back followed by exactly one newline, so a non-empty result
    always ends with a newline even when the input did not.
    """
    # A single join avoids rebuilding the result string once per kept line.
    return ''.join(line + '\n' for line in text.split('\n') if line.strip())

def transcribe_html(in_html_file, out_html_file, mode, bs_code, bs_elem, new_html_file, *args):
    """Copy a generated chart or network graph into the output HTML page.

    Args:
        in_html_file: Path of the generated HTML file to extract markup from.
        out_html_file: Path of the page to write. In ``'chart'`` mode it must
            already exist, because its placeholders are replaced in place.
        mode: ``'chart'`` or ``'network'``. Any other value writes nothing.
        bs_code: Text spliced into the extracted ``<div>`` tag. In ``'chart'``
            mode it replaces the last character of the tag's second
            whitespace-separated token (only when ``bs_elem == 'class'``); in
            ``'network'`` mode it is inserted as a new token right after the
            tag name.
        bs_elem: ``'class'`` enables the ``bs_code`` splice in ``'chart'``
            mode; any other value leaves the extracted tag unmodified.
        new_html_file: Path of the template page whose ``(network_*)`` and
            ``(network1)`` placeholders are filled in ``'network'`` mode.
            May be ``None`` in ``'chart'`` mode.
        *args: ``'chart'`` mode only. ``args[0]`` is the placeholder replaced
            by the chart ``<div>``; ``args[1]`` is the placeholder replaced by
            the chart's first ``<script>`` element.

    Raises:
        ValueError: If ``mode`` is ``'network'`` and ``new_html_file`` is
            ``None`` (there would be no template to fill in).
    """
    if mode == 'network' and new_html_file is None:
        raise ValueError("mode 'network' requires new_html_file (the HTML template)")

    # BeautifulSoup reads the whole handle while constructing the tree, so the
    # file can be closed as soon as parsing is done.
    with open(in_html_file, 'r', encoding='utf-8') as chart_html_file:
        soup = BeautifulSoup(chart_html_file, 'html.parser')

    html_string = None
    if new_html_file is not None:
        with open(new_html_file, 'r', encoding='utf-8') as f:
            html_string = str(BeautifulSoup(f.read(), 'html.parser'))

    if mode == 'chart':
        # The second <div> in the document is taken as the chart container.
        chart_tokens = str(soup.find_all('div')[1]).split()
        if bs_elem == 'class':
            chart_tokens[1] = chart_tokens[1][:-1] + bs_code
        # Always re-join so that str.replace below receives a string, whatever
        # value bs_elem has.
        chart_div = ' '.join(chart_tokens)

        with open(out_html_file, 'r', encoding='utf-8') as f:
            html_string = f.read()
        html_string = html_string.replace(args[0], chart_div)
        html_string = html_string.replace(args[1], str(soup.script))

        with open(out_html_file, 'w', encoding='utf-8') as f:
            f.write(html_string)
    elif mode == 'network':
        network_style_internal = strip_empty_lines(str(soup.style))
        # The container id is read from the first '#<id> ' selector that
        # appears in the page's <style> element.
        find_hash = network_style_internal.find('#')
        find_space = network_style_internal.find(' ', find_hash)
        network_id = network_style_internal[find_hash + 1:find_space]

        network_style_cdn = str(soup.link)
        scripts = soup.find_all('script')
        network_script_cdn = strip_empty_lines(str(scripts[0]))
        network_script_internal = strip_empty_lines(str(scripts[1]))
        network_tokens = str(soup.find(id=network_id)).split()
        network_tokens.insert(1, bs_code)
        network_div = ' '.join(network_tokens)

        html_string = html_string.replace('(network_style_cdn)', network_style_cdn)
        html_string = html_string.replace('(network_script_cdn)', network_script_cdn)
        html_string = html_string.replace('(network_style_internal)', network_style_internal)
        html_string = html_string.replace('(network_script_internal)', network_script_internal)
        html_string = html_string.replace('(network1)', network_div)

        with open(out_html_file, 'w', encoding='utf-8') as f:
            f.write(html_string)
    
def convert_html_to_pdf(source_html, output_filename):
    """Render ``source_html`` to a PDF file using xhtml2pdf (``pisa``).

    Args:
        source_html: HTML markup to convert.
        output_filename: Path of the PDF file to create or overwrite.

    Returns:
        A status message: ``'PDF generated!'`` when ``pisa`` reports no error,
        otherwise ``'Could not generate PDF.  Please try again.'``.
    """
    # The context manager closes the output file even if CreatePDF raises.
    with open(output_filename, "w+b") as result_file:
        pisa_status = pisa.CreatePDF(source_html, dest=result_file)

    if pisa_status.err:
        return 'Could not generate PDF.  Please try again.'
    return 'PDF generated!'

### Settings

In [None]:
# Template page; passed to transcribe_html as the file whose network
# placeholders are filled in.
html_template_fname = 'index.html'
# Page produced by the export step (written first, then edited in place).
output_html_fname = 'index_revised.html'

### pdf inputs (currently not functional) ###
# Output path handed to convert_html_to_pdf.
pdf_fname = 'stats.pdf'
# The PDF export cell only runs when this flag is True.
bool_to_pdf = False

### Load and process connections data

In [None]:
# Read the connections export, skipping the first three lines of the file,
# and drop every row that has a missing value in any column.
# NOTE(review): dropna() without `subset` also discards rows whose only gap is
# in a column this notebook never reads - confirm that is intended.
network_data = pd.read_csv('./Connections.csv', skiprows=3).dropna().reset_index(drop=True)

# Normalise company names and parse dates BEFORE building the count tables.
# The graph cell filters network_data by the names in network_data_count_comp,
# so both must use the same (translated) spelling.
network_data['Company'] = network_data['Company'].apply(convert_nonenglish_text)
network_data['Connected On'] = pd.to_datetime(network_data['Connected On'])

# Connections per company; companies with more than two connections are kept
# for the network graph.
network_data_count = network_data['Company'].value_counts().reset_index()
network_data_count.columns = ['Company', 'Count']
network_data_count = network_data_count.sort_values(by="Count", ascending=False)
network_data_count_comp = network_data_count.loc[network_data_count['Count']>2]

# Connections per job position; positions with more than two connections are
# kept for the pie chart.
network_data_count_pos = network_data['Position'].value_counts().reset_index()
network_data_count_pos.columns = ['Position', 'Count']
network_data_count_pos = network_data_count_pos.loc[network_data_count_pos['Count']>2]

### Set up Plotly figure layout

In [None]:
# One row, two columns. The specs follow the order in which the traces are
# added further down: the treemap goes in column 1 and the pie in column 2.
fig = go.Figure().set_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'treemap'}, {'type': 'pie'}]]
)

# fig2 holds the connection timeline; fig3 is created but not used in the
# visible cells.
fig2, fig3 = go.Figure(), go.Figure()

# Shared config passed to every write_html call: settings for the mode-bar
# "download image" button plus responsive resizing.
general_config = {
    'toImageButtonOptions': {
        'format': 'jpeg',
        'filename': 'custom_image',
        'height': 700,
        'width': 700,
        'scale': 1
    },
    'responsive': True
}

### Generate treemap of connections

In [None]:
top_n = 10

# Per-company row counts, largest first, limited to the top_n companies.
treemap_data = network_data.groupby('Company').count()
treemap_data = treemap_data.sort_values('Connected On', ascending=False)
treemap_data = treemap_data.reset_index().head(top_n)

labels = treemap_data['Company']
values = treemap_data['First Name']
# Every company tile hangs off the same root label.
parents = ['(click to collapse)'] * len(labels)

fig.add_trace(
    go.Treemap(
        labels=labels,
        values=values,
        parents=parents,
        textinfo='label+value+percent parent',
        texttemplate='%{label}<br># of connections: %{value}',
        hoverinfo='none',
    ),
    row=1,
    col=1,
)

# Title annotation; its text is currently blank
# (disabled title: '<b>Top Connections</b>').
annotations = [
    {
        'x': 0.0,
        'y': 1.05,
        'xanchor': 'left',
        'yanchor': 'middle',
        'text': '',
        'font': {'size': 15, 'color': 'rgb(37,37,37)'},
        'showarrow': False,
    },
]

fig.update_layout(annotations=annotations);

### Identify the proportions of job positions among connections

In [None]:
# Donut chart of the positions kept in network_data_count_pos, placed in the
# second subplot column.
fig.add_trace(
    go.Pie(
        labels=network_data_count_pos['Position'],
        values=network_data_count_pos['Count'],
        pull=0.015,
        name='',
        marker_colors=px.colors.qualitative.Set3,
        hole=0.4,
        hovertemplate='Role: %{label}<br>Count: %{value}<br>% of total: %{percent}',
    ),
    row=1,
    col=2,
)

# Replaces the figure's annotation list with a single blank-text title
# (disabled title: '<b>Top {} connected positions</b>').
annotations = [
    {
        'x': 0.0,
        'y': 1,
        'xanchor': 'auto',
        'yanchor': 'middle',
        'text': '',
        'font': {'size': 15, 'color': 'rgb(37,37,37)'},
        'showarrow': False,
    },
]

fig.update_layout(annotations=annotations)

# Written as an HTML fragment without the bundled plotly.js.
fig.write_html('chart1.html', include_plotlyjs=False, full_html=False, config=general_config)

### Generate timeline of established connections

In [None]:
# Rows per connection date; the per-date count is read from 'First Name'.
line_data = network_data.groupby('Connected On').count().reset_index()

# First row, last row and the row with the highest count.
last_row = len(line_data) - 1
start_x = line_data.loc[0, 'Connected On']
start_y = line_data.loc[0, 'First Name']
last_x = line_data.loc[last_row, 'Connected On']
last_y = line_data.loc[last_row, 'First Name']
peak_row = np.argmax(line_data['First Name'])
max_y = line_data.loc[peak_row, 'First Name']
max_x = line_data.loc[peak_row, 'Connected On']

# Hover settings shared by both traces.
hover_template = 'Date: %{x}<br>Count: %{y}'
hover_label = {'bgcolor': 'white', 'font': {'color': 'black'}}

# Thin line over every date.
fig2.add_trace(
    go.Scatter(
        x=line_data['Connected On'],
        y=line_data['First Name'],
        hovertemplate=hover_template,
        hoverlabel=hover_label,
        opacity=0.5,
        mode='lines',
        name='',
        line={'width': 1},
    )
)

# Marker on the first data point.
fig2.add_trace(
    go.Scatter(
        x=[start_x],
        y=[start_y],
        mode='markers',
        hovertemplate=hover_template,
        hoverlabel=hover_label,
        name='',
        marker={'opacity': 0.85},
    )
)

annotations = [
    # Blank title slot (disabled title: '<b>LinkedIn Connections</b>').
    {
        'xref': 'paper',
        'yref': 'paper',
        'x': 0.0,
        'y': 1.05,
        'xanchor': 'left',
        'yanchor': 'bottom',
        'text': '',
        'font': {'size': 15, 'color': 'rgb(37,37,37)'},
        'showarrow': False,
    },
    # Arrow pointing at the date with the highest count.
    {
        'x': max_x,
        'y': max_y,
        'text': f'Top connections: {max_y}',
        'font': {'size': 10, 'color': 'rgb(37,37,37)'},
        'showarrow': True,
        'arrowhead': 1,
        'yshift': 5,
    },
    # Label to the left of the first point.
    {
        'x': start_x,
        'y': start_y,
        'xanchor': 'right',
        'yanchor': 'middle',
        'text': f"{start_x.strftime('%Y-%m-%d')}: {start_y}",
        'xshift': -5,
        'font': {'size': 10},
        'showarrow': False,
    },
    # Label to the right of the last point.
    {
        'x': last_x,
        'y': last_y,
        'xanchor': 'left',
        'yanchor': 'middle',
        'text': f"{last_x.strftime('%Y-%m-%d')}: {last_y}",
        'xshift': 5,
        'font': {'size': 10},
        'showarrow': False,
    },
]

# Minimal styling: visible x axis with outside ticks, hidden y axis,
# white background, no legend.
fig2.update_layout(
    xaxis={
        'showline': True,
        'showgrid': False,
        'showticklabels': True,
        'linecolor': 'rgb(204, 204, 204)',
        'linewidth': 1,
        'ticks': 'outside',
        'tickcolor': 'rgb(204, 204, 204)',
        'tickfont': {
            'size': 10,
            'color': 'rgb(82, 82, 82)',
        },
    },
    yaxis={
        'showline': False,
        'showgrid': False,
        'zeroline': False,
        'showticklabels': False,
    },
    plot_bgcolor='white',
    showlegend=False,
    annotations=annotations,
);

fig2.write_html('chart2.html', include_plotlyjs=False, full_html=False, config=general_config)

### Create a simple network graph of the top connected companies

In [None]:
G = nx.Graph()

# CSS hex colours need the leading '#'; without it the value is not a valid
# colour string.
node_color = '#F5F5EB'

# One node per company, each linked to a single unnamed hub node ('').
for company in network_data_count_comp['Company']:
    # Distinct positions held at this company, in order of first appearance,
    # so the hover list is the same on every run.
    company_positions = network_data.loc[network_data['Company'] == company, 'Position'].unique()
    count = len(company_positions)
    positions = ''.join('<li>{}</li>'.format(x) for x in company_positions)
    title = f"<b>{company}</b> – {count}"

    position_list = f"<ul>{positions}</ul>"
    hover_info = title + position_list

    G.add_edge(company, '', color=node_color)
    # Node size scales with the number of distinct positions.
    G.add_node(company, size=count*4.5, title=hover_info, color=node_color)

##### add to pyvis
nt = net.Network(height='700px', width='100%', bgcolor='#222222', font_color='white')
nt.from_nx(G)
nt.hrepulsion()
nt.set_edge_smooth('cubicBezier')
nt.write_html('company_graph.html')

### Export to HTML and PDF

In [None]:
# The network call must run first: it reads the template, fills the network
# placeholders and writes output_html_fname.
transcribe_html('company_graph.html', output_html_fname, 'network', '', None, html_template_fname)
# The chart calls then edit output_html_fname in place, replacing the named
# placeholders with each chart's <div> and <script>.
transcribe_html('chart1.html', output_html_fname, 'chart', '"', 'class', None, '(chart1)', '(chart1_script)')
transcribe_html('chart2.html', output_html_fname, 'chart', '"', 'class', None, '(chart2)', '(chart2_script)')

In [None]:
# Optional PDF export of the finished page; skipped unless bool_to_pdf is True.
if bool_to_pdf:
    with open(output_html_fname, 'r', encoding='utf-8') as f:
        html_file = f.read()
    # Round-trip through BeautifulSoup before handing the markup to the converter.
    html_string = str(BeautifulSoup(html_file, 'html.parser'))
    # The status message returned by convert_html_to_pdf is not used here.
    convert_html_to_pdf(html_string, pdf_fname)