In [54]:
import plotly.figure_factory as ff
import plotly.graph_objs as go
import pandas as pd
import numpy as np

In [55]:
data = pd.read_csv('../data/imdb_all_v2.csv')

In [56]:
data["genre"] = data["genre"].str.split()

In [57]:
data2 = data.explode('genre')

In [58]:
def move_particle(x):
    if x[-1] == ',':
        return x[:-1]
    else:
        return x

In [59]:
data2['genre'] = data2['genre'].apply(move_particle)

In [60]:
data2['genre'].unique()

array(['Action', 'Adventure', 'Fantasy', 'Drama', 'Comedy', 'Sci-Fi',
       'Mystery', 'Thriller', 'Crime', 'Animation', 'Family', 'Horror',
       'Romance', 'Western', 'History', 'Biography', 'Music', 'Musical',
       'Sport', 'Documentary', 'War', 'Short', 'News', 'Reality-TV',
       'Game-Show', 'Talk-Show', 'Film-Noir'], dtype=object)

In [61]:
s = pd.crosstab(data2['title'],data2['genre'])
s = s.T.dot(s).astype(float)
s.values[np.triu_indices(len(s))]=np.nan
films_intersection = s.stack()

In [67]:
data2.groupby('genre')['title'].count().sort_values()

genre
Reality-TV        1
Game-Show         3
Talk-Show         4
News             16
Documentary     123
Sport           140
Music           243
Film-Noir       311
Musical         313
Western         347
History         383
War             389
Biography       432
Family          638
Short           659
Animation       696
Fantasy         736
Sci-Fi          742
Mystery         843
Horror         1058
Thriller       1065
Crime          1736
Adventure      1829
Romance        1843
Action         1844
Comedy         3588
Drama          5252
Name: title, dtype: int64

In [71]:
genre_list = data2.groupby('genre')['title'].count().sort_values(ascending=False)[:15].index.tolist()

In [72]:
data3 = data2[data2['genre'].isin(genre_list)]


In [73]:
data3.groupby('genre')['title'].count().sort_values()

genre
Biography     432
Family        638
Short         659
Animation     696
Fantasy       736
Sci-Fi        742
Mystery       843
Horror       1058
Thriller     1065
Crime        1736
Adventure    1829
Romance      1843
Action       1844
Comedy       3588
Drama        5252
Name: title, dtype: int64

In [74]:
s = pd.crosstab(data3['title'], data3['genre'])
s = s.T.dot(s).astype(float)
s.values[np.triu_indices(len(s))] = np.nan
films_intersection = s.stack()

In [76]:
for i, v in films_intersection.iteritems():
    print('index: ', i, 'value: ', v)


index:  ('Adventure', 'Action') value:  2264.0
index:  ('Animation', 'Action') value:  122.0
index:  ('Animation', 'Adventure') value:  351.0
index:  ('Biography', 'Action') value:  40.0
index:  ('Biography', 'Adventure') value:  40.0
index:  ('Biography', 'Animation') value:  3.0
index:  ('Comedy', 'Action') value:  400.0
index:  ('Comedy', 'Adventure') value:  583.0
index:  ('Comedy', 'Animation') value:  447.0
index:  ('Comedy', 'Biography') value:  41.0
index:  ('Crime', 'Action') value:  518.0
index:  ('Crime', 'Adventure') value:  124.0
index:  ('Crime', 'Animation') value:  8.0
index:  ('Crime', 'Biography') value:  66.0
index:  ('Crime', 'Comedy') value:  434.0
index:  ('Drama', 'Action') value:  761.0
index:  ('Drama', 'Adventure') value:  702.0
index:  ('Drama', 'Animation') value:  74.0
index:  ('Drama', 'Biography') value:  409.0
index:  ('Drama', 'Comedy') value:  1127.0
index:  ('Drama', 'Crime') value:  1195.0
index:  ('Family', 'Action') value:  67.0
index:  ('Family', 

In [83]:
number_of_genres= len(data3['genre'].unique())
genre_matrix = np.zeros((number_of_genres,number_of_genres))
genre_list = data3['genre'].unique().tolist()

In [84]:
print(genre_list)

['Action', 'Adventure', 'Fantasy', 'Drama', 'Comedy', 'Sci-Fi', 'Mystery', 'Thriller', 'Crime', 'Animation', 'Family', 'Horror', 'Romance', 'Biography', 'Short']


In [89]:
for i, v in films_intersection.iteritems():
    to = genre_list.index(i[0])
    frm = genre_list.index(i[1])
    value = v
    genre_matrix[to,frm] = v
    genre_matrix[frm,to] = v
    
genre_matrix = genre_matrix.astype(int)

In [91]:
print(genre_matrix.shape)

(15, 15)


In [96]:
#make ideogram
PI=np.pi


def check_data(data_matrix):
    L, M = data_matrix.shape
    if L != M:
        raise ValueError('Data array must have (n,n) shape')
    return L


L = check_data(genre_matrix)

def moduloAB(x, a, b): #maps a real number onto the unit circle identified with 
                       #the interval [a,b), b-a=2*PI
        if a>=b:
            raise ValueError('Incorrect interval ends')
        y=(x-a)%(b-a)
        return y+b if y<0 else y+a

def test_2PI(x):
    return 0<= x <2*PI

In [105]:
row_sum=[np.sum(genre_matrix[k,:]) for k in range(L)]

#set the gap between two consecutive ideograms
gap=2*PI*0.005
ideogram_length=2*PI*np.asarray(row_sum)/sum(row_sum)-gap*np.ones(L)
print(ideogram_length)

[0.5875833  0.58680991 0.68764018 1.27183701 0.4886865  0.33091565
 0.12587155 0.16067395 0.28886276 0.11823436 0.11021048 0.69895095
 0.25154686 0.03190509 0.07221787]


In [100]:
def get_ideogram_ends(ideogram_len, gap):
    ideo_ends = []
    left = 0
    for k in range(len(ideogram_len)):
        right = left+ideogram_len[k]
        ideo_ends.append([left, right])
        left = right+gap
    return ideo_ends


ideo_ends = get_ideogram_ends(ideogram_length, gap)
print(ideo_ends)


def make_ideogram_arc(R, phi, a=50):
    # R is the circle radius
    # phi is the list of ends angle coordinates of an arc
    # a is a parameter that controls the number of points to be evaluated on an arc
    if not test_2PI(phi[0]) or not test_2PI(phi[1]):
        phi = [moduloAB(t, 0, 2*PI) for t in phi]
    length = (phi[1]-phi[0]) % 2*PI
    nr = 5 if length <= PI/4 else int(a*length/PI)

    if phi[0] < phi[1]:
        theta = np.linspace(phi[0], phi[1], nr)
    else:
        phi = [moduloAB(t, -PI, PI) for t in phi]
        theta = np.linspace(phi[0], phi[1], nr)
    return R*np.exp(1j*theta)


z = make_ideogram_arc(1.3, [11*PI/6, PI/17])
print(z)


[[0, 0.5875832968058088], [0.6189992233417068, 1.2058091336432624], [1.2372250601791603, 1.9248652359727352], [1.9562811625086332, 3.228118168952518], [3.2595340954884158, 3.7482205930628383], [3.779636519598736, 4.110552170305493], [4.14196809684139, 4.267839650608], [4.299255577143898, 4.459929523601904], [4.491345450137802, 4.780208209675787], [4.811624136211685, 4.929858498248794], [4.961274424784691, 5.071484901840172], [5.10290082837607, 5.8018517817943485], [5.833267708330246, 6.084814569038011], [6.116230495573909, 6.148135589073747], [6.179551515609645, 6.251769380643684]]
[1.12583302-0.65j       1.14814501-0.60972373j 1.16901672-0.5686826j
 1.18842197-0.5269281j  1.20633642-0.48451259j 1.22273759-0.44148929j
 1.23760491-0.39791217j 1.25091973-0.3538359j  1.26266534-0.30931575j
 1.27282702-0.26440759j 1.28139202-0.21916775j 1.28834958-0.17365297j
 1.29369099-0.12792036j 1.29740954-0.08202728j 1.29950058-0.0360313j
 1.29996146+0.01000988j 1.29879163+0.0560385j  1.29599253+0.101

In [101]:
ideo_colors=['rgba(187, 255, 255, 0.75)',
             'rgba(127, 255, 212, 0.75)',
             'rgba(155, 205, 155, 0.75)',
             'rgba(217, 239, 139, 0.75)',
             'rgba(255, 246, 143, 0.75)',
             'rgba(255, 193, 193, 0.75)',
             'rgba(255, 130, 71, 0.75)',
             'rgba(255, 48, 48, 0.75)',
             'rgba(255, 20, 147, 0,75)',
             'rgba(72, 118, 255, 0,75)',
             'rgba(176, 226, 255, 0.75)',
             'rgba(224, 255, 255, 0.75)',
             'rgba(144, 238, 144, 0.75)',
             'rgba(106, 90, 205, 0,75)',
             'rgba(47, 79, 79, 0,75)']#brewe

In [104]:
def map_data(data_matrix, row_value, ideogram_length):
    mapped = np.zeros(data_matrix.shape)
    for j in range(L):
        mapped[:, j] = ideogram_length*data_matrix[:, j]/row_value
    return mapped


mapped_data = map_data(genre_matrix, row_sum, ideogram_length)
mapped_data


array([[0.00000000e+00, 2.07760204e-01, 1.49580005e-02, 6.98345914e-02,
        3.67067498e-02, 1.36549109e-01, 5.68954621e-03, 2.96407004e-02,
        4.75352409e-02, 1.11955587e-02, 6.14838059e-03, 9.63552181e-03,
        7.43311683e-03, 3.67067498e-03, 8.25901870e-04],
       [2.07746308e-01, 0.00000000e+00, 2.44083559e-02, 6.44160371e-02,
        5.34965094e-02, 1.30392007e-01, 3.57866873e-03, 9.35959513e-03,
        1.13783313e-02, 3.22080185e-02, 2.22060982e-02, 6.23973009e-03,
        1.34888283e-02, 3.67042946e-03, 4.22099388e-03],
       [1.50692859e-02, 2.45915954e-02, 0.00000000e+00, 2.98519781e-01,
        2.33897505e-02, 2.68103860e-03, 3.88288349e-03, 1.84899214e-03,
        2.21879056e-03, 4.34513152e-03, 1.59937820e-02, 2.84837239e-01,
        7.11861973e-03, 4.62248034e-04, 2.68103860e-03],
       [7.17949679e-02, 6.62287351e-02, 3.04633313e-01, 0.00000000e+00,
        1.06324479e-01, 1.70760699e-02, 4.26430033e-02, 4.52846052e-02,
        1.12739798e-01, 6.98137664e-0

In [106]:
idx_sort = np.argsort(mapped_data, axis=1)
idx_sort


array([[ 0, 14, 13,  6, 10, 12, 11,  9,  2,  7,  4,  8,  3,  5,  1],
       [ 1,  6, 13, 14, 11,  7,  8, 12, 10,  2,  9,  4,  3,  5,  0],
       [ 2, 13,  7,  8,  5, 14,  6,  9, 12,  0, 10,  4,  1, 11,  3],
       [ 3, 14,  9,  5, 10, 13,  6,  7,  1,  0,  4,  8, 12,  2, 11],
       [ 4, 13,  7,  6,  5, 11,  2,  0, 10,  8,  9, 14,  1, 12,  3],
       [ 5, 13, 14,  9, 10,  2, 12,  8,  6,  7,  4,  3, 11,  1,  0],
       [ 6,  9, 13, 14, 10,  1,  2, 12,  0,  5,  4,  7, 11,  8,  3],
       [ 7,  9, 10, 14, 13,  2,  4, 12,  5,  1,  6, 11,  0,  8,  3],
       [ 8,  9, 10, 14,  2,  5, 13, 11,  1, 12,  6,  7,  4,  0,  3],
       [ 6,  7,  9, 13,  5,  8, 11, 12,  2,  3,  0, 10, 14,  1,  4],
       [10,  7,  6, 11,  8, 13,  5, 12,  0, 14,  9,  2,  3,  1,  4],
       [11, 13,  9, 10, 14, 12,  1,  8,  0,  4,  5,  6,  7,  2,  3],
       [12,  9, 14, 13,  5, 11,  7,  6, 10,  2,  0,  8,  1,  4,  3],
       [ 5, 11, 13,  6, 14,  9,  2,  7, 10, 12,  0,  1,  4,  8,  3],
       [14,  7, 13,  6,  5,  0,  8

In [111]:
def make_ribbon_ends(mapped_data, ideo_ends,  idx_sort):
    L = mapped_data.shape[0]
    ribbon_boundary = np.zeros((L, L+1))
    for k in range(L):
        start = ideo_ends[k][0]
        ribbon_boundary[k][0] = start
        for j in range(1, L+1):
            J = idx_sort[k][j-1]
            ribbon_boundary[k][j] = start+mapped_data[k][J]
            start = ribbon_boundary[k][j]
    return [[(ribbon_boundary[k][j], ribbon_boundary[k][j+1]) for j in range(L)] for k in range(L)]


ribbon_ends = make_ribbon_ends(mapped_data, ideo_ends,  idx_sort)
print('ribbon ends starting from the ideogram[2]\n', ribbon_ends[2])


ribbon ends starting from the ideogram[2]
 [(1.2372250601791603, 1.2372250601791603), (1.2372250601791603, 1.2376873082134394), (1.2376873082134394, 1.2395363003505557), (1.2395363003505557, 1.2417550909150954), (1.2417550909150954, 1.2444361295139141), (1.2444361295139141, 1.2471171681127329), (1.2471171681127329, 1.2510000516006772), (1.2510000516006772, 1.2553451831229006), (1.2553451831229006, 1.2624638028507986), (1.2624638028507986, 1.277533088768297), (1.277533088768297, 1.2935268707543535), (1.2935268707543535, 1.3169166212888754), (1.3169166212888754, 1.341508216712523), (1.341508216712523, 1.6263454554352985), (1.6263454554352985, 1.9248652359727352)]


15

In [112]:
def control_pts(angle, radius):
    #angle is a  3-list containing angular coordinates of the control points b0, b1, b2
    #radius is the distance from b1 to the  origin O(0,0)

    if len(angle) != 3:
        raise InvalidInputError('angle must have len =3')
    b_cplx = np.array([np.exp(1j*angle[k]) for k in range(3)])
    b_cplx[1] = radius*b_cplx[1]
    return zip(b_cplx.real, b_cplx.imag)


In [113]:
def ctrl_rib_chords(l, r, radius):
    # this function returns a 2-list containing control poligons of the two quadratic Bezier
    #curves that are opposite sides in a ribbon
    #l (r) the list of angular variables of the ribbon arc ends defining
    #the ribbon starting (ending) arc
    # radius is a common parameter for both control polygons
    if len(l) != 2 or len(r) != 2:
        raise ValueError('the arc ends must be elements in a list of len 2')
    return [control_pts([l[j], (l[j]+r[j])/2, r[j]], radius) for j in range(2)]


In [114]:
ribbon_color = [L*[ideo_colors[k]] for k in range(L)]


In [144]:
def make_q_bezier(b):  # defines the Plotly SVG path for a quadratic Bezier curve defined by the
    #list of its control points
    # if len(b) != 3:
    #     raise valueError('control poligon must have 3 points')
    A, B, C = b
    return 'M '+str(A[0])+',' + str(A[1])+' '+'Q ' +\
        str(B[0])+', '+str(B[1]) + ' ' +\
        str(C[0])+', '+str(C[1])


b = [(1, 4), (-0.5, 2.35), (3.745, 1.47)]

make_q_bezier(b)


'M 1,4 Q -0.5, 2.35 3.745, 1.47'

In [145]:
def make_ribbon_arc(theta0, theta1):
    
    if test_2PI(theta0) and test_2PI(theta1):
        if theta0 < theta1:
            theta0= moduloAB(theta0, -PI, PI)
            theta1= moduloAB(theta1, -PI, PI)
            if theta0*theta1>0:
                raise ValueError('incorrect angle coordinates for ribbon')

        nr=int(40*(theta0-theta1)/PI)
        if nr<=2: nr=3
        theta=np.linspace(theta0, theta1, nr)
        pts=np.exp(1j*theta)# points on arc in polar complex form

        string_arc=''
        for k in range(len(theta)):
            string_arc+='L '+str(pts.real[k])+', '+str(pts.imag[k])+' '
        return   string_arc
    else:
        raise ValueError('the angle coordinates for an arc side of a ribbon must be in [0, 2*pi]')

make_ribbon_arc(np.pi/3, np.pi/6)

'L 0.5000000000000001, 0.8660254037844386 L 0.5877852522924732, 0.8090169943749473 L 0.6691306063588583, 0.7431448254773941 L 0.7431448254773942, 0.6691306063588581 L 0.8090169943749475, 0.5877852522924731 L 0.8660254037844387, 0.49999999999999994 '

In [146]:
def make_layout(title, plot_size):
    axis = dict(showline=False,  # hide axis line, grid, ticklabels and  title
                zeroline=False,
                showgrid=False,
                showticklabels=False,
                title=''
                )

    return go.Layout(title=title,
                     xaxis=dict(axis),
                     yaxis=dict(axis),
                     showlegend=False,
                     width=plot_size,
                     height=plot_size,
                     margin=dict(t=25, b=25, l=25, r=25),
                     hovermode='closest'  # to this list one appends below the dicts defining the ribbon,
                     #respectively the ideogram shapes
                     )


In [147]:
def make_ideo_shape(path, line_color, fill_color):
    #line_color is the color of the shape boundary
    #fill_collor is the color assigned to an ideogram
    return dict(
        line=dict(
            color=line_color,
            width=0.45
        ),

        path=path,
        type='path',
        fillcolor=fill_color,
        layer='below'
    )


In [154]:
def make_ribbon(l, r, line_color, fill_color, radius=0.2):
    #l=[l[0], l[1]], r=[r[0], r[1]]  represent the opposite arcs in the ribbon 
    #line_color is the color of the shape boundary
    #fill_color is the fill color for the ribbon shape
    poligon=ctrl_rib_chords(l,r, radius)
    b,c =poligon
    print(b,c)

    return  dict(
                line=dict(
                color=line_color, width=0.5
            ),
            path=  make_q_bezier(b)+make_ribbon_arc(r[0], r[1])+
                   make_q_bezier(c[::-1])+make_ribbon_arc(l[1], l[0]),
            type='path',
            fillcolor=fill_color,
            layer='below'
        )

def make_self_rel(l, line_color, fill_color, radius):
    #radius is the radius of Bezier control point b_1
    b=control_pts([l[0], (l[0]+l[1])/2, l[1]], radius)
    return  dict(
                line=dict(
                color=line_color, width=0.5
            ),
            path=  make_q_bezier(b)+make_ribbon_arc(l[1], l[0]),
            type='path',
            fillcolor=fill_color,
            layer='below'
        )

def invPerm(perm):
    # function that returns the inverse of a permutation, perm
    inv = [0] * len(perm)
    for i, s in enumerate(perm):
        inv[s] = i
    return inv

layout=make_layout('Chord diagram', 400)
layout['shapes']

()

In [155]:
# these value are set after a few trials
radii_sribb = [0.4, 0.30, 0.35, 0.39, 0.12]


In [156]:
ribbon_info = []
for k in range(L):

    sigma = idx_sort[k]
    sigma_inv = invPerm(sigma)
    for j in range(k, L):
        if genre_matrix[k][j] == 0 and genre_matrix[j][k] == 0:
            continue
        eta = idx_sort[j]
        eta_inv = invPerm(eta)
        l = ribbon_ends[k][sigma_inv[j]]

        if j == k:
            print(make_self_rel(l, 'rgb(175,175,175)',
                               ideo_colors[k], radius=radii_sribb[k]))
            layout['shapes'] = layout['shapes'] + make_self_rel(l, 'rgb(175,175,175)',
                                                                ideo_colors[k], radius=radii_sribb[k])
            z = 0.9*np.exp(1j*(l[0]+l[1])/2)
            #the text below will be displayed when hovering the mouse over the ribbon
            text = genre_list[k]+' commented on ' + \
                '{:d}'.format(genre_matrix[k][k])+' of ' + 'herself Fb posts',
            ribbon_info.append(go.Scatter(x=[z.real],
                                          y=[z.imag],
                                          mode='markers',
                                          marker=dict(
                                              size=0.5, color=ideo_colors[k]),
                                          text=text,
                                          hoverinfo='text'
                                          )
                               )
        else:
            r = ribbon_ends[j][eta_inv[k]]
            zi = 0.9*np.exp(1j*(l[0]+l[1])/2)
            zf = 0.9*np.exp(1j*(r[0]+r[1])/2)
            #texti and textf are the strings that will be displayed when hovering the mouse
            #over the two ribbon ends
            texti = genre_list[k]+' commented on ' + '{:d}'.format(genre_matrix[k][j])+' of ' +\
                genre_list[j] + ' Fb posts',

            textf = genre_list[j]+' commented on ' + '{:d}'.format(genre_matrix[j][k])+' of ' +\
                genre_list[k] + ' Fb posts',
            ribbon_info.append(go.Scatter(x=[zi.real],
                                          y=[zi.imag],
                                          mode='markers',
                                          marker=dict(
                                              size=0.5, color=ribbon_color[k][j]),
                                          text=texti,
                                          hoverinfo='text'
                                          )
                               ),
            ribbon_info.append(go.Scatter(x=[zf.real],
                                          y=[zf.imag],
                                          mode='markers',
                                          marker=dict(
                                              size=0.5, color=ribbon_color[k][j]),
                                          text=textf,
                                          hoverinfo='text'
                                          )
                               )
            # IMPORTANT!!!  Reverse these arc ends because otherwise you get
            r = (r[1], r[0])
            # a twisted ribbon
            print(make_ribbon(
                l, r, 'rgb(175,175,175)', ribbon_color[k][j]))
            #append the ribbon shape
            layout['shapes'] = layout['shapes'] +make_ribbon(
                l, r, 'rgb(175,175,175)', ribbon_color[k][j]) 


<zip object at 0x7fa4582ae600> <zip object at 0x7fa4582ae740>


TypeError: 'zip' object is not subscriptable

In [None]:
import plotly.offline as off
ideograms = []
for k in range(len(ideo_ends)):
    z = make_ideogram_arc(1.1, ideo_ends[k])
    zi = make_ideogram_arc(1.0, ideo_ends[k])
    m = len(z)
    n = len(zi)
    ideograms.append(go.Scatter(x=z.real,
                                y=z.imag,
                                mode='lines',
                                line=dict(
                                    color=ideo_colors[k], shape='spline', width=0.25),
                                text=labels[k]+'<br>' +
                                '{:d}'.format(row_sum[k]),
                                hoverinfo='text'
                                )
                     )

    path = 'M '
    for s in range(m):
        path += str(z.real[s])+', '+str(z.imag[s])+' L '

    Zi = np.array(zi.tolist()[::-1])

    for s in range(m):
        path += str(Zi.real[s])+', '+str(Zi.imag[s])+' L '
    path += str(z.real[0])+' ,'+str(z.imag[0])

    layout['shapes'].append(make_ideo_shape(
        path, 'rgb(150,150,150)', ideo_colors[k]))

data = go.Data(ideograms+ribbon_info)
fig = go.Figure(data=data, layout=layout)

off.init_notebook_mode()

off.iplot(fig, filename='chord-diagram-Fb')
