In [74]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind


# 1. Identify and load a network dataset that has some categorical information available for each node.

# Read the data

In [3]:
# Load the trip table from the CSV in the working directory and preview it.
csv_path = "data.csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,usertype,gender,from_station_name,to_station_name
0,Subscriber,Male,Lincoln Ave & Belmont Ave,Broadway & Cornelia Ave
1,Subscriber,Male,Halsted St & Maxwell St,May St & Taylor St
2,Subscriber,Male,Sheffield Ave & Webster Ave,Halsted St & Dickens Ave
3,Subscriber,Female,Peoria St & Jackson Blvd,State St & Wacker Dr
4,Subscriber,Female,Loomis St & Lexington St,Peoria St & Jackson Blvd


In [4]:
# Count the non-null entries of every other column within each gender group.
df.groupby("gender").count()

Unnamed: 0_level_0,usertype,from_station_name,to_station_name
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,259530,259530,259530
Male,789045,789045,789045


In [10]:
# Rename the station columns to 'source' / 'target', the column names that
# nx.from_pandas_edgelist looks for by default. Mapping by name (instead of
# assigning a positional list to df.columns) means a change in the CSV's column
# order or count cannot silently mislabel columns, and re-running the cell is a
# harmless no-op.
df = df.rename(columns={"from_station_name": "source", "to_station_name": "target"})

In [26]:
df.head()

Unnamed: 0,usertype,gender,source,target
0,Subscriber,Male,Lincoln Ave & Belmont Ave,Broadway & Cornelia Ave
1,Subscriber,Male,Halsted St & Maxwell St,May St & Taylor St
2,Subscriber,Male,Sheffield Ave & Webster Ave,Halsted St & Dickens Ave
3,Subscriber,Female,Peoria St & Jackson Blvd,State St & Wacker Dr
4,Subscriber,Female,Loomis St & Lexington St,Peoria St & Jackson Blvd


In [11]:
# Build an undirected graph with one node per station and an edge for every
# (source, target) pair that appears in at least one row.
G = nx.from_pandas_edgelist(df, source="source", target="target")

# 2. For each of the nodes in the dataset, calculate degree centrality and eigenvector centrality.

In [34]:
# Degree centrality of every station in the full graph, keyed by node label.
degree_by_station = nx.degree_centrality(G)
degree_by_station

{'Lincoln Ave & Belmont Ave': 0.6258064516129033,
 'Broadway & Cornelia Ave': 0.6258064516129033,
 'Halsted St & Maxwell St': 0.5709677419354838,
 'May St & Taylor St': 0.567741935483871,
 'Sheffield Ave & Webster Ave': 0.7,
 'Halsted St & Dickens Ave': 0.7290322580645161,
 'Peoria St & Jackson Blvd': 0.6935483870967742,
 'State St & Wacker Dr': 0.6709677419354839,
 'Loomis St & Lexington St': 0.49032258064516127,
 'Halsted St & Diversey Pkwy': 0.7451612903225806,
 'Greenview Ave & Diversey Pkwy': 0.567741935483871,
 'Canal St & Madison St': 0.8161290322580645,
 'State St & Randolph St': 0.7903225806451613,
 'Wabash Ave & Grand Ave': 0.7935483870967742,
 'Fairbanks Ct & Grand Ave': 0.6838709677419355,
 'Ravenswood Ave & Montrose Ave': 0.43870967741935485,
 'Damen Ave & Sunnyside Ave': 0.4258064516129032,
 'Canal St & Harrison St': 0.5161290322580645,
 'Clinton St & Washington Blvd': 0.8064516129032258,
 'Orleans St & Merchandise Mart Plaza': 0.8096774193548387,
 'Canal St & Adams St': 

In [35]:
# Eigenvector centrality of every station in the full graph, keyed by node label.
eigenvector_by_station = nx.eigenvector_centrality(G)
eigenvector_by_station

{'Lincoln Ave & Belmont Ave': 0.060587910972776375,
 'Broadway & Cornelia Ave': 0.060200588932918776,
 'Halsted St & Maxwell St': 0.05073118069312014,
 'May St & Taylor St': 0.05037287387767253,
 'Sheffield Ave & Webster Ave': 0.06843473842507594,
 'Halsted St & Dickens Ave': 0.07029814475196873,
 'Peoria St & Jackson Blvd': 0.06546964577064825,
 'State St & Wacker Dr': 0.06401156956267996,
 'Loomis St & Lexington St': 0.04511978285329785,
 'Halsted St & Diversey Pkwy': 0.07132041371282404,
 'Greenview Ave & Diversey Pkwy': 0.05540975815265722,
 'Canal St & Madison St': 0.07546565533231807,
 'State St & Randolph St': 0.07265032897141951,
 'Wabash Ave & Grand Ave': 0.07481918931974976,
 'Fairbanks Ct & Grand Ave': 0.06517636900259227,
 'Ravenswood Ave & Montrose Ave': 0.04031564695172178,
 'Damen Ave & Sunnyside Ave': 0.03864170487619201,
 'Canal St & Harrison St': 0.047319249311324606,
 'Clinton St & Washington Blvd': 0.0739216716952794,
 'Orleans St & Merchandise Mart Plaza': 0.075640

# 3. Compare your centrality measures across your categorical groups.

In [69]:
# Partition the rows by the gender column so each group can get its own graph.
is_male = df["gender"].eq("Male")
is_female = df["gender"].eq("Female")
df_male = df[is_male]
df_female = df[is_female]

## Degree centrality comparison

Null hypothesis: The mean degree centrality of stations is the same in the male-rider network and the female-rider network.

Alternative hypothesis: The mean degree centrality of stations differs between the male-rider network and the female-rider network.
     

In [81]:
# Build one station graph per gender and score each station by degree centrality.
G_male = nx.from_pandas_edgelist(df_male)
G_female = nx.from_pandas_edgelist(df_female)
dv_male = nx.degree_centrality(G_male)
dv_female = nx.degree_centrality(G_female)

# One-column frames (column label 0) indexed by node label. The former mid-cell
# .head() calls were removed: only a cell's last expression is displayed, so
# they had no effect.
deg_male = pd.DataFrame(pd.Series(dv_male))
deg_female = pd.DataFrame(pd.Series(dv_female))

# Perform an independent two-sample t-test on the per-station degree centralities.
# ttest_ind is called with its default equal_var=True (pooled-variance Student's t-test).
ttest_ind(deg_male[0].values, deg_female[0].values)

Ttest_indResult(statistic=7.110951988850095, pvalue=3.18645417239173e-12)

Comment: Since the p-value is less than 0.05, we reject the null hypothesis. Hence we conclude that there is a significant difference between the degree centrality measures of the male and female networks.

## Eigen Centrality Comparison

Null hypothesis: The mean eigenvector centrality of stations is the same in the male-rider network and the female-rider network.

Alternative hypothesis: The mean eigenvector centrality of stations differs between the male-rider network and the female-rider network.
     

In [82]:
# Score each station by eigenvector centrality in the per-gender graphs.
# NOTE: the dv_* / deg_* names are reused from the degree-centrality cell; from
# here on they hold eigenvector centralities, not degree centralities.
dv_male = nx.eigenvector_centrality(G_male)
dv_female = nx.eigenvector_centrality(G_female)

# One-column frames (column label 0) indexed by node label. The former mid-cell
# .head() calls were removed: only a cell's last expression is displayed, so
# they had no effect.
deg_male = pd.DataFrame(pd.Series(dv_male))
deg_female = pd.DataFrame(pd.Series(dv_female))

# Perform an independent two-sample t-test on the per-station eigenvector centralities.
# ttest_ind is called with its default equal_var=True (pooled-variance Student's t-test).
ttest_ind(deg_male[0].values, deg_female[0].values)

Ttest_indResult(statistic=0.6913973375173805, pvalue=0.4895746771157278)

Comment: Since the p-value is greater than 0.05, we cannot reject the null hypothesis. Hence we find no evidence of a significant difference between the eigenvector centrality measures of the male and female networks.