# Import the packages

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stt
import networkx as nx 

# Functions

These helper functions are used in the different steps of building the network.

In [2]:
# Calculates the correlation between two arrays excluding the nan
def corr_nan(x,y):
    """Pearson correlation of ``x`` and ``y`` over the positions where both are present.

    Parameters
    ----------
    x, y : array-like
        Equal-length sequences of numbers or booleans. Missing entries may be
        given as NaN or ``None``.

    Returns
    -------
    float
        The correlation coefficient reported by ``scipy.stats.pearsonr`` for
        the pairs in which neither value is missing.
    """
    # Cast to float up front: booleans become 0/1 and None becomes NaN, so the
    # NaN test and the correlation both work for bool and object columns.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Keep a pair only when neither of its two values is missing.
    valid = ~(np.isnan(x) | np.isnan(y))

    r, _ = stt.pearsonr(x[valid], y[valid])

    return r

In [3]:
def get_col_values(keys, df):
    """Return the values of one or more columns of ``df``.

    Parameters
    ----------
    keys : str or iterable of str
        A single column name, or several column names.
    df : pandas.DataFrame
        Dataframe the columns are read from.

    Returns
    -------
    array or list of arrays
        ``df[key].values`` itself when exactly one column is requested (a
        single name or a one-element iterable); otherwise a list with one
        entry per key, in the order given.
    """
    # isinstance (rather than an exact type comparison) also recognises str
    # subclasses such as numpy.str_, which would otherwise be iterated
    # character by character.
    if isinstance(keys, str):
        keys = [keys]

    cols = [df[key].values for key in keys]

    # A single requested column is returned bare, not wrapped in a list.
    if len(cols) == 1:
        return cols[0]

    return cols
        

In [4]:
def check_element_to_remove(to_remove, string):
    """Tell whether ``string`` contains any of the substrings in ``to_remove``.

    Parameters
    ----------
    to_remove : str, iterable of str, or None
        A single substring, or a collection of substrings (list, tuple, set,
        ...). ``None`` means there is nothing to remove.
    string : str
        The text to search (e.g. a node name).

    Returns
    -------
    bool
        ``True`` if at least one substring occurs in ``string``, else ``False``.

    Raises
    ------
    TypeError
        If ``to_remove`` is neither ``None``, a string, nor an iterable.
    """
    # Always answer with a real boolean: a NaN "unknown" result is truthy and
    # would make callers treat every element as one to remove.
    if to_remove is None:
        return False

    if isinstance(to_remove, str):
        return to_remove in string

    return any(rem in string for rem in to_remove)


In [5]:
def make_graph_(df, list_of_nodes, metrics, exclude_same_question=True, print_=False, remove_name=''):
    """Build a weighted, undirected network from the (dummy coded) dataframe ``df``.

    Every unordered pair of kept nodes is connected by an edge whose weight is
    ``metrics(column_i, column_j)``; weights that are negative or undefined
    (NaN) are stored as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding one column per node.
    list_of_nodes : sequence of str
        Names of the columns that will be turned into nodes.
    metrics : callable
        Function ``metrics(c1, c2)`` used to compute the correlation between
        two columns.
    exclude_same_question : bool, default True
        If True, no edge is computed between two nodes whose names share the
        same text before the first ``':'`` (responses of the same item).
    print_ : bool, default False
        If True, print the skipped nodes and a progress counter.
    remove_name : str or iterable of str, default ''
        Substring(s) identifying entries to leave out (e.g. answer rejected).
        ``''``, ``None`` or an empty collection keeps every node.

    Returns
    -------
    networkx.Graph
        The graph. Nodes are created through their edges only, so a node that
        ends up with no edge is not part of the graph.
    """
    G = nx.Graph()

    # Normalise `remove_name` to a list of substrings. '' and None both mean
    # "remove nothing"; any other iterable (tuple, set, ...) is accepted.
    if remove_name is None or isinstance(remove_name, str):
        patterns = [remove_name] if remove_name else []
    else:
        patterns = list(remove_name)

    # Decide once per node whether it is excluded, instead of re-testing the
    # same node on every iteration of the inner loop.
    skipped = [
        bool(patterns) and bool(check_element_to_remove(patterns, node))
        for node in list_of_nodes
    ]

    n_nodes = len(list_of_nodes)
    n_tot = n_nodes**2

    for i, node_i in enumerate(list_of_nodes):

        if skipped[i]:  # this is one of the elements to remove
            if print_:
                print("Skipping : ", node_i)
            continue

        for j, node_j in enumerate(list_of_nodes):

            if skipped[j]:
                if print_:
                    # Report the node actually being skipped here (node_j).
                    print("Skipping : ", node_j)
                continue

            if print_:
                current = (i*n_nodes)+(j)
                print(current,"/",n_tot, " = ", np.round(current/n_tot,decimals=2))

            # The graph is undirected: handle each pair once, no self-loops.
            if j <= i:
                continue

            if exclude_same_question:
                if node_i.split(sep=':')[0] == node_j.split(sep=':')[0]:
                    # if they belong to the same question
                    continue

            [c1,c2] = get_col_values([node_i,node_j], df)
            weight = metrics(c1,c2)

            # Negative correlations are clipped to 0. An undefined correlation
            # (NaN, e.g. when one column is constant) is treated the same way
            # so that no NaN weight ends up in the graph.
            if np.isnan(weight) or weight < 0:
                weight = 0

            # Store plain Python floats so all weights share one type.
            G.add_edge(node_i, node_j, weight=float(weight))
    return G

# Import and clean the data

Here you can import your dataset

In [6]:
# Base name of the CSV file to open (the ".csv" extension is appended below)
filename = "dataset_to_open"
df = pd.read_csv(f"{filename}.csv")

# Clean the data

If needed, clean your dataset.

Specifically, you cannot use continuous variables, because the dummy coding generates one new variable for each unique value.

If you have continuous variables, or variables with too many distinct values, you can bin them.

In [7]:
# Columns that are needed for the analysis
useful_cols = ["Col 1", "Col 2", "Col 3"]

# Bin all selected columns at once: multiply by 5 and round to the nearest
# integer. Assuming the inputs lie in [0, 1], this yields the levels 0..5.
# NOTE: `df` is overwritten, so re-running this cell without reloading the
# data rescales the already binned values.
df[useful_cols] = (df[useful_cols] * 5).round()

In [8]:
# Show the first rows of `df` as a quick check of the cleaning step above
df.head()

Unnamed: 0.1,Unnamed: 0,Col 1,Col 2,Col 3
0,0,4.0,4.0,2.0
1,1,4.0,2.0,4.0
2,2,2.0,0.0,3.0
3,3,2.0,3.0,0.0
4,4,1.0,2.0,2.0


# Dummy code the survey

This will create a new dataframe called `df_bool` which contains the dummy coded version of the columns you selected in `list_of_questions`

In [10]:
list_of_questions = useful_cols #### <--- List of columns you would like to dummy code

# Collect every indicator column first and assemble the dataframe in one go;
# column names have the form "<question>:<value>".
dummy_columns = {}

for quest in list_of_questions:
    for value in df[quest].unique():
        if pd.isna(value):
            # Refused / missing answers (NaN or None) are pooled in a single
            # "<question>:Ref" column. NaN never compares equal to anything,
            # so they must be detected with isna(), not with `== value`.
            name = str(quest) + ":" + "Ref"
            indicator = df[quest].isna()
        else:
            name = str(quest) + ":" + str(value)
            indicator = df[quest] == value

        if name in dummy_columns:
            # Two distinct values mapped to the same name: merge them
            # instead of overwriting the earlier column.
            dummy_columns[name] = dummy_columns[name] | indicator
        else:
            dummy_columns[name] = indicator

df_bool = pd.DataFrame(dummy_columns, index=df.index)

# Names of all dummy coded columns, in creation order
list_of_attitudes = list(df_bool.columns)


In [11]:
# Show the first rows of the dummy coded dataframe
df_bool.head()

Unnamed: 0,Col 1:4.0,Col 1:2.0,Col 1:1.0,Col 1:3.0,Col 1:5.0,Col 1:0.0,Col 2:4.0,Col 2:2.0,Col 2:0.0,Col 2:3.0,Col 2:1.0,Col 2:5.0,Col 3:2.0,Col 3:4.0,Col 3:3.0,Col 3:0.0,Col 3:1.0,Col 3:5.0
0,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
1,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False
2,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False
3,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False


# Make the network

This code will produce a network `G` out of `df_bool`

In [13]:
# Substring(s) identifying the nodes to leave out of the network,
# e.g. ["essround","cntry","Ref"]. Keep '' (the default of make_graph_)
# to keep every node.
remove_name = ''

metrics = corr_nan # the correlation function to use

list_of_nodes = df_bool.columns # columns_for_survey #
df_t = df_bool

# `remove_name` is passed on explicitly so that editing it above takes effect.
G = make_graph_(df_t, list_of_nodes, metrics, exclude_same_question=True, print_=True, remove_name=remove_name)

0 / 324  =  0.0
1 / 324  =  0.0
2 / 324  =  0.01
3 / 324  =  0.01
4 / 324  =  0.01
5 / 324  =  0.02
6 / 324  =  0.02
7 / 324  =  0.02
8 / 324  =  0.02
9 / 324  =  0.03
10 / 324  =  0.03
11 / 324  =  0.03
12 / 324  =  0.04
13 / 324  =  0.04
14 / 324  =  0.04
15 / 324  =  0.05
16 / 324  =  0.05
17 / 324  =  0.05
18 / 324  =  0.06
19 / 324  =  0.06
20 / 324  =  0.06
21 / 324  =  0.06
22 / 324  =  0.07
23 / 324  =  0.07
24 / 324  =  0.07
25 / 324  =  0.08
26 / 324  =  0.08
27 / 324  =  0.08
28 / 324  =  0.09
29 / 324  =  0.09
30 / 324  =  0.09
31 / 324  =  0.1
32 / 324  =  0.1
33 / 324  =  0.1
34 / 324  =  0.1
35 / 324  =  0.11
36 / 324  =  0.11
37 / 324  =  0.11
38 / 324  =  0.12
39 / 324  =  0.12
40 / 324  =  0.12
41 / 324  =  0.13
42 / 324  =  0.13
43 / 324  =  0.13
44 / 324  =  0.14
45 / 324  =  0.14
46 / 324  =  0.14
47 / 324  =  0.15
48 / 324  =  0.15
49 / 324  =  0.15
50 / 324  =  0.15
51 / 324  =  0.16
52 / 324  =  0.16
53 / 324  =  0.16
54 / 324  =  0.17
55 / 324  =  0.17
56 / 324

In [14]:
# G.nodes

# Save the network

Save the network so that you can import it into other software (e.g. Gephi).

In [15]:
# Name of the output file (without extension) and the prefix it is written under
filename = 'title'
output_path = ''

# The target is the plain concatenation prefix + name + ".gexf"
gexf_path = f"{output_path}{filename}.gexf"
nx.write_gexf(G, gexf_path)