In [1]:
import networkx as nx
import pandas as pd
# Directed graph of musical influence; its edges are inserted further down
# from the influencer_id -> follower_id pairs of the loaded table.
G=nx.DiGraph()

In [2]:
# Load the influence edge list: one row per (influencer, follower) pair, with
# the id, name, main genre and active-start value of each side.
data = pd.read_csv('influence_data.csv')

In [3]:
# Show the loaded table (the notebook renders a truncated view of large frames).
data

Unnamed: 0,influencer_id,influencer_name,influencer_main_genre,influencer_active_start,follower_id,follower_name,follower_main_genre,follower_active_start
0,759491,The Exploited,Pop/Rock,1980,74,Special Duties,Pop/Rock,1980
1,25462,Tricky,Electronic,1990,335,PJ Harvey,Pop/Rock,1990
2,66915,Bob Dylan,Pop/Rock,1960,335,PJ Harvey,Pop/Rock,1990
3,71209,Leonard Cohen,Pop/Rock,1950,335,PJ Harvey,Pop/Rock,1990
4,91438,The Gun Club,Pop/Rock,1980,335,PJ Harvey,Pop/Rock,1990
...,...,...,...,...,...,...,...,...
42765,580300,Sufjan Stevens,Pop/Rock,1990,3661738,Rosemary & Garlic,Pop/Rock,2010
42766,261309,Vybz Kartel,Reggae,2000,3670556,Trinidad Cardona,R&B;,2010
42767,467203,Michael Jackson,R&B;,1960,3670556,Trinidad Cardona,R&B;,2010
42768,2518003,Popcaan,Reggae,2000,3670556,Trinidad Cardona,R&B;,2010


In [4]:
# Set up a DataFrame to look up the genre and active-start decade of the musicians
# (index: musician id; columns: 'genre' and 'decade').
# Columns are selected by name rather than by position, and renamed directly
# with rename(columns=...). Transposing the whole frame twice just to rename
# two labels is slow on tens of thousands of rows and turns every column into
# object dtype; this form keeps the original dtypes.
influencer_info = (
    data[['influencer_id', 'influencer_main_genre', 'influencer_active_start']]
    .set_index('influencer_id')
    .rename(columns={'influencer_main_genre': 'genre',
                     'influencer_active_start': 'decade'})
)

follower_info = (
    data[['follower_id', 'follower_main_genre', 'follower_active_start']]
    .set_index('follower_id')
    .rename(columns={'follower_main_genre': 'genre',
                     'follower_active_start': 'decade'})
)

# A musician can occur on many rows and on both sides of an edge; keep a single
# row per id, taking the first non-null value of each column.
musician_info = pd.concat([influencer_info, follower_info])
musician_info = musician_info.groupby(musician_info.index).first()

In [6]:
# Add one directed edge influencer -> follower per row of the table.
# Passing the two id columns straight to add_edges_from avoids building a
# Series for every row as DataFrame.iterrows() does; the edges are still
# inserted in row order, so the node order of G is the same as before.
G.add_edges_from(zip(data['influencer_id'], data['follower_id']))

In [7]:
# For every musician, count the distinct genres found among the nodes reachable
# within three influence hops. The BFS tree contains the source itself, so the
# source's own genre is always in the set; subtracting one leaves the number
# of *other* genres reached.
# The id -> genre mapping is built once up front: a scalar .loc lookup inside
# the nested loop would be repeated for every reached node of every source.
genre_by_musician = dict(zip(musician_info.index, musician_info['genre']))

genres = {}
for node in G.nodes:
    reachable = nx.bfs_tree(G, source=node, depth_limit=3)
    reached_genres = {genre_by_musician[musician] for musician in reachable}
    genres[node] = len(reached_genres) - 1

In [10]:
# Per-musician genre-diversity counts as a named Series (index: the keys of `genres`).
genres_diversity = pd.Series(genres, name='genres_diversity')

In [11]:
# Number of nodes exactly one hop downstream of every musician (its direct
# followers), indexed by node id in graph order.
node_ids = list(G.nodes())
one_hop_counts = [len(nx.descendants_at_distance(G, node, 1)) for node in node_ids]
descendants_at_distance1 = pd.Series(
    one_hop_counts,
    index=node_ids,
    name='descendants_at_distance1',
)

In [12]:
# Number of nodes within two hops downstream of every musician: the count at
# exactly distance 2 plus the distance-1 count computed earlier.
node_ids = list(G.nodes())
exactly_two_hops = pd.Series(
    [len(nx.descendants_at_distance(G, node, 2)) for node in node_ids],
    index=node_ids,
)
descendants_at_distance2 = exactly_two_hops + descendants_at_distance1
# Adding two Series whose names differ yields an unnamed result, so the name
# has to be set on the sum; naming one operand is silently discarded.
descendants_at_distance2.name = 'descendants_at_distance2'

In [13]:
# Number of nodes within three hops downstream of every musician: the count at
# exactly distance 3 plus the running total for distances 1 and 2.
node_ids = list(G.nodes())
exactly_three_hops = pd.Series(
    [len(nx.descendants_at_distance(G, node, 3)) for node in node_ids],
    index=node_ids,
)
descendants_at_distance3 = exactly_three_hops + descendants_at_distance2
descendants_at_distance3

759491      273
74            0
25462       248
335         191
66915      3830
           ... 
3661738       0
261309        1
3670556       0
2518003       1
2896351       1
Length: 5603, dtype: int64

In [14]:
# Betweenness centrality of every musician in the influence graph, wrapped in
# a Series keyed by node id so it lines up with the other per-node metrics.
betweenness_centrality = pd.Series(nx.betweenness_centrality(G))
betweenness_centrality

759491     0.000122
74         0.000000
25462      0.000950
335        0.000590
66915      0.007030
             ...   
3661738    0.000000
261309     0.000000
3670556    0.000000
2518003    0.000000
2896351    0.000000
Length: 5603, dtype: float64

In [15]:
# Eigenvector centrality of every musician, as a Series keyed by node id.
# NOTE(review): for a DiGraph networkx derives this score from in-edges, so
# with influencer -> follower edges it rewards being influenced by central
# musicians. If the goal is to rank influencers,
# nx.eigenvector_centrality(G.reverse()) may be what is wanted; confirm
# before changing, since it alters the results.
eigenvector_scores = nx.eigenvector_centrality(G)
eigenvector_centrality = pd.Series(eigenvector_scores)
eigenvector_centrality

759491     2.213044e-03
74         6.119629e-04
25462      4.583752e-03
335        6.684170e-03
66915      8.517328e-07
               ...     
3661738    3.218262e-03
261309     5.882585e-30
3670556    2.247508e-05
2518003    5.882585e-30
2896351    5.882585e-30
Length: 5603, dtype: float64

In [16]:
# Assemble the per-musician metrics side by side: the dict keys become the
# column names and the Series are aligned on their shared node-id index.
metric_columns = {
    'descendants_at_distance3': descendants_at_distance3,
    'betweenness_centrality': betweenness_centrality,
    'eigenvector_centrality': eigenvector_centrality,
    'genres_diversity': genres_diversity,
}
df = pd.DataFrame(metric_columns)
df

Unnamed: 0,descendants_at_distance3,betweenness_centrality,eigenvector_centrality,genres_diversity
759491,273,0.000122,2.213044e-03,3
74,0,0.000000,6.119629e-04,0
25462,248,0.000950,4.583752e-03,7
335,191,0.000590,6.684170e-03,8
66915,3830,0.007030,8.517328e-07,18
...,...,...,...,...
3661738,0,0.000000,3.218262e-03,0
261309,1,0.000000,5.882585e-30,1
3670556,0,0.000000,2.247508e-05,0
2518003,1,0.000000,5.882585e-30,1


In [17]:
# Pairwise correlation of the four node metrics (DataFrame.corr uses Pearson by default).
df.corr()

Unnamed: 0,descendants_at_distance3,betweenness_centrality,eigenvector_centrality,genres_diversity
descendants_at_distance3,1.0,0.25389,-0.139423,0.836247
betweenness_centrality,0.25389,1.0,-0.018232,0.214467
eigenvector_centrality,-0.139423,-0.018232,1.0,-0.210886
genres_diversity,0.836247,0.214467,-0.210886,1.0


In [18]:
# Persist the per-musician metrics. The node ids are written as the first,
# unlabeled CSV column, because to_csv writes the index by default and the
# index has no name here.
df.to_csv('q1_attribute.csv')