In [1]:
#Import needed packages 

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
import time 
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
import networkx.algorithms.community as nx_comm
import collections


In [2]:
time_start = time.time()

path = 'Cooffending.csv'
df = pd.read_csv(path)

print('Data loading into Dataframe is done Time elapsed {}'.format(time.time()-time_start))
print(df.dtypes)
df.head()


Data loading into Dataframe is done Time elapsed 1.4743671417236328
NoUnique      int64
Naissance     int64
SEXE         object
SeqE          int64
dateInf       int64
NCD1         object
NCD2         object
NCD3         object
NCD4         object
MUN           int64
ED1          object
Jeunes        int64
Adultes       int64
Date         object
annee         int64
dtype: object


Unnamed: 0,NoUnique,Naissance,SEXE,SeqE,dateInf,NCD1,NCD2,NCD3,NCD4,MUN,ED1,Jeunes,Adultes,Date,annee
0,1,1007,F,1085034,20051217,3530,,,,58227,2.0,0,1,12/17/2005,2005
1,2,1828,F,1431379,20080423,1430,,,,94068,5.0,0,1,04/23/2008,2008
2,4,1889,M,167174,20080306,1430,21702.0,,,49058,2.0,0,1,03/06/2008,2008
3,5,1892,M,1179096,20080821,1420,,,,65005,71.0,0,1,08/21/2008,2008
4,17,1897,M,1270690,20030430,1625,,,,23027,,0,3,04/30/2003,2003


In [3]:
#Check for any NaN values
print(df.isnull().values.any())
#Count how many duplicate values there are 
df.duplicated().sum()

False


467

In [4]:
#Drop all duplicate values keeping only the first instance
df.drop_duplicates(keep = 'first', inplace = True)
df.shape

(1279992, 15)

In [5]:
#Count the number of unique values in each column
print(df.nunique())


NoUnique      539593
Naissance        110
SEXE               2
SeqE         1164836
dateInf         2561
NCD1             295
NCD2             244
NCD3             178
NCD4             116
MUN             1342
ED1               99
Jeunes            14
Adultes           50
Date            2561
annee              8
dtype: int64


In [None]:
#Look at how many different values there are in the 4 crime rows to count the number of different crime types
CrimeTypes = pd.unique(df[['NCD1', 'NCD2', 'NCD3','NCD4']].values.ravel())
len(CrimeTypes)

In [None]:
df.loc[df['annee'] == 2010]


In [None]:
#Group crime events and count the number of offenders associated with each crime event
df_crimecount = df.groupby(['SeqE'])['NoUnique'].count().reset_index(
  name='Count').sort_values(['Count'], ascending=False)

In [None]:
#Displaying crime events sorted from largest number of associated co-offenders to lowest number
df_crimecount.head()

In [32]:
#Group the crime events by municipality to view the municipality with the greatest number of crime events
df_city = df.groupby(['MUN'])['SeqE'].count().reset_index(
  name='Count').sort_values(['Count'], ascending=False)
df_city.head()

Unnamed: 0,MUN,Count
826,66023,330376
287,23027,83509
1037,81017,53184
763,58227,52796
824,65005,48613


In [None]:
#Create dataframe of just municipality 66023
df_MUN_scenario = pd.read_csv(path)
df_MUN_scenario.drop_duplicates(keep = 'first', inplace = True)
#Create dataframe of just municipality 66023

df_MUN66023 = df_MUN_scenario.loc[df['MUN'] == 66023]

print(df_MUN66023['Naissance'].max())
print('Average year of birth in the offender network of Municipality 66023',df_MUN66023['Naissance'].mean())
df_MUN66023.sort_values('Naissance', ascending=False)


In [None]:
#Creating edgelist of offenders just in the Municipality 66023
#Creating data frame of just the Offender ID and Crime Event Case Number
df_NoUnique_i_MUN = df_MUN66023[['NoUnique','SeqE']]
df_ij_MUN = pd.merge(df_NoUnique_i_MUN, df_NoUnique_i_MUN, on='SeqE')
#Removing all rows where the offender ID number in both columns is the same
#Removing all rows where it is justa duplicate of another row with the same case number and offenders
df_ij_filteredMUN = df_ij_MUN.loc[(df_ij_MUN['NoUnique_x'] < df_ij_MUN['NoUnique_y'])]

#Determining the size of this new filtered dataframe that consists of pairs of offenders and their 
#common Crime Event case number
print(df_ij_filteredMUN.shape)
#Counting how many unique crime event case numbers are included in this data set
print(df_ij_filteredMUN.nunique())
#Counting how many unique offender IDs are included in this data set
ijcountMUN = pd.unique(df_ij_filteredMUN[['NoUnique_x', 'NoUnique_y']].values.ravel())
print(len(ijcountMUN))
#Generate a column that includes the weights for the edges
#i.e. count how many crime event case numbers any given pair of offenders has in common
edgelistMUN = df_ij_filteredMUN.value_counts(subset=['NoUnique_x','NoUnique_y'])

# converting to df and assigning new names to the columns
edgelist_wMUN = pd.DataFrame(edgelistMUN)
edgelist_wMUN = edgelist_wMUN.reset_index()
edgelist_wMUN.columns = ['NoUnique_x', 'NoUnique_y', 'weights'] # change column names


print(edgelist_wMUN.head())
print(edgelist_wMUN.shape)

In [None]:
#Creating network graph of co-offender network only in Municipality 66023
GG_MUN66023 = nx.from_pandas_edgelist(edgelist_wMUN, source='NoUnique_x',target='NoUnique_y', edge_attr='weights')
#Number of connected components in the graph
nx.number_connected_components(GG_MUN66023)
#Determining number of nodes and edges
print('Number of nodes', GG_MUN66023.order())
print('Number of edges', GG_MUN66023.size())

densityMUN = nx.density(GG_MUN66023) 
print('The edge density is: ' + str(densityMUN))

In [None]:
degree_sequenceMUN = sorted([d for n, d in GG_MUN66023.degree()], reverse=True)
dmax = max(degree_sequenceMUN)
plt.hist((degree_sequenceMUN), bins=100,edgecolor="black", color="blue")
plt.xscale('log')                                                                                                                
plt.yscale('log')
plt.ylabel('Frequency')
plt.xlabel('Degree')
plt.xlim(10**0, 10**2)
plt.title('Degree Distribution of Municipality 66023 Network')

In [None]:
#Largest connected component of the Municipality 66023 graph
MUN_comp = sorted(nx.connected_components(GG_MUN66023), key=len, reverse=True)
G0 = GG_MUN66023.subgraph(MUN_comp[0])

len(MUN_comp)
print('Number of nodes in largest connected component of MUN 66023', G0.order())
print('Number of edges in largest connected component of MUN 66023', G0.size())
#Diameter of the largest connected component of Mun. 66023
nx.algorithms.distance_measures.diameter(G0)
#Edge density
densityMUN_
0 = nx.density(G0) 
print('The edge density is: ' + str(densityMUN_0))
#Calculating Centrality Measurements of the largest connect component in the Municipality  # 66023

eig_cen = nx.eigenvector_centrality(G0)
btwn_cen = nx.betweenness_centrality(G0)
deg_centrality = nx.degree_centrality(G0)


In [None]:
#Determining the node (Offender) with the greatest centrality measurement
max_btwn = max(btwn_cen, key = btwn_cen.get)
print('Node with the largest betweenness centrality is', max_btwn)
max_deg_centrality = max(deg_centrality, key = deg_centrality.get)
print('Node with the largest Degree centrality is', max_deg_centrality)
max_eig_cen = max(eig_cen, key = eig_cen.get)
print('Node with the largest Eigenvector centrality is', max_eig_cen)

#Getting the centrality measurement of the node with the greatest centrality
print(btwn_cen.get(597339))
print(deg_centrality.get(max_deg_centrality))
print(eig_cen.get(max_eig_cen))


In [None]:
df.loc[df['NoUnique'] == 597339]


In [None]:
len(df['SEXE'])
len(df['NoUnique'])
df.loc[df['SeqE'] == 23526]
Year = df.groupby(['annee'])
Year.count()

In [6]:
#Creating data frame of just the Offender ID and Crime Event Case Number
df_NoUnique_i = df[['NoUnique','SeqE']]
#Merging the above dataframe with a duplicate of itself to list all crime events and the involved offenders
df_ij = pd.merge(df_NoUnique_i, df_NoUnique_i, on='SeqE')

In [None]:
df_ij.shape

In [7]:
#Removing all rows where the offender ID number in both columns is the same
#Removing all rows where it is justa duplicate of another row with the same case number and offenders
df_ij_filtered = df_ij.loc[(df_ij['NoUnique_x'] < df_ij['NoUnique_y'])]

In [8]:
#Determining the size of this new filtered dataframe that consists of pairs of offenders and their common Crime Event case number
print(df_ij_filtered.shape)
#Counting how many unique crime event case numbers are included in this data set
print(df_ij_filtered.nunique())
#Counting how many unique offender IDs are included in this data set
ijcount = pd.unique(df_ij_filtered[['NoUnique_x', 'NoUnique_y']].values.ravel())
print(len(ijcount))


(216705, 3)
NoUnique_x    75308
SeqE          84038
NoUnique_y    74769
dtype: int64
121159


In [9]:
#Generate a column that includes the weights for the edges
#i.e. count how many crime event case numbers any given pair of offenders has in common
edgelist = df_ij_filtered.value_counts(subset=['NoUnique_x','NoUnique_y'])

# converting to df and assigning new names to the columns
edgelist_w = pd.DataFrame(edgelist)
edgelist_w = edgelist_w.reset_index()
edgelist_w.columns = ['NoUnique_x', 'NoUnique_y', 'weights'] # change column names


print(edgelist_w.head())
print(edgelist_w.shape)

   NoUnique_x  NoUnique_y  weights
0      253577      440431      356
1      614546      623487      204
2      303644      318895      106
3      207865      253979       95
4      170099      317918       80
(178413, 3)


In [10]:
GG = nx.from_pandas_edgelist(edgelist_w, source='NoUnique_x',target='NoUnique_y', edge_attr='weights')

In [11]:
nx.is_directed(GG)

False

In [None]:
degree_sequence = sorted([d for n, d in GG.degree()], reverse=True)
dmax = max(degree_sequence)


In [None]:
df_deg = pd.DataFrame(deg)
df_cs = pd.DataFrame(cs)
df_degreefreq = pd.concat([df_deg,df_cs], axis=1)
df_degreefreq.columns = ['Freq','Deg']
df_degreefreq.head()

#Generating data frame of values to preform lienar regression on
df_degreefreq['Deg']=np.log(df_degreefreq['Deg'])
df_degreefreq['Freq']=np.log(df_degreefreq['Freq'])

df_subsetdegfreq = df_degreefreq.loc[df_degreefreq['Deg'] <= 10**2]

df_degreefreq.head()

In [None]:
#Linear regression on Log of Degree and Log of Frequency to determine linear relationship
m, b = np.polyfit(df_subsetdegfreq['Deg'], df_subsetdegfreq['Freq'], 1)
print('Slope:'m)
print('Intercept:'b)

In [None]:
#Plotting lienar regression line
sns.regplot(x='Deg', y='Freq', data=df_subsetdegfreq)
plt.title("Cumulative Distribution plot of Entire Network")
plt.ylabel("log(Cumulative Frequency(Fraction of Nodes having degree k or greater))")
plt.xlabel("log(Degree k)")


In [None]:
w = np.log(list(deg))
dd = w[0:10**2]
d = np.linspace(10**0, 10**2, len(dd))
y_hat = d*m+b


In [None]:
plt.scatter(d, )
plt.plot(d, y_hat)
plt.xlim(10**0, 10**2)


In [None]:
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
cs = np.cumsum(cnt)
plt.loglog(deg, cs, 'bo')
plt.title("Cumulative Distribution plot of Entire Network")
plt.ylabel("Cumulative Frequency(Fraction of Nodes having degree k or greater)")
plt.xlabel("Degree k")
plt.xscale('log')                                                                                                                
plt.yscale('log')
plt.xlim(10**0, 10**2)

plt.show()

In [None]:
degree_freq = nx.degree_histogram(GG)
degrees = range(len(degree_freq))
plt.figure(figsize=(12, 8)) 
plt.loglog(degrees[m:], degree_freq[m:],'go-') 
plt.xlabel('Degree')
plt.ylabel('Frequency')

In [None]:
plt.hist((degree_sequence), bins=100,edgecolor="black", color="blue")
plt.xscale('log')                                                                                                                
plt.yscale('log')
plt.ylabel('Frequency')
plt.xlabel('Degree')
plt.xlim(10**0, 10**2)
plt.title('Degree Distribution of Network')


In [None]:
binwidth = 15
plt.hist(degree_sequence, bins=range(min(degree_sequence), max(degree_sequence) + binwidth, binwidth),
         edgecolor="black", color="blue")
plt.xscale('log')                                                                                                                
plt.yscale('log')
plt.ylabel('Frequency')
plt.xlabel('Degree') 

In [None]:
GG.order()

In [None]:
GG.size()

In [None]:
nx.number_connected_components(GG)

In [None]:
largest_cc = max(nx.connected_components(GG), key=len)

Large_G = GG.subgraph(largest_cc).copy()

In [None]:
print('Number of nodes', Large_G.order())

In [None]:
m= 1
degree_freq = nx.degree_histogram(GG)
degrees = range(len(degree_freq))
plt.figure(figsize=(12, 8)) 
plt.loglog(degrees[m:], degree_freq[m:],'go-') 
plt.xlabel('Degree')
plt.ylabel('Frequency')

In [None]:
degree_sequenceL = sorted([d for n, d in Large_G.degree()], reverse=True)

plt.hist((degree_sequenceL), bins=50,edgecolor="black", color="blue")
plt.xscale('log')                                                                                                                
plt.yscale('log')
plt.ylabel('Frequency')
plt.xlabel('Degree')
plt.xlim(10**0, 10**2)
plt.title('Degree Distribution of Largest Connected Network')

In [None]:
degreeCount = collections.Counter(degree_sequenceL)
deg, cnt = zip(*degreeCount.items())
cs = np.cumsum(cnt)
plt.loglog(deg, cs, 'bo')
plt.title("Cumulative Distribution plot of Largest Connected Network")
plt.ylabel("Cumulative Frequency(Fraction of Nodes having degree k or greater)")
plt.xlabel("Degree k")
plt.xscale('log')                                                                                                                
plt.yscale('log')
plt.xlim(10**0, 10**2)
plt.show()

In [None]:
Large_G.size()

In [None]:
density = nx.density(Large_G) 
print('The edge density of the largest connected component is: ' + str(density))
print('Diameter of largest connected component is:' nx.algorithms.distance_measures.diameter(Large_G))
print('Clustering coefficient of largest connected component', nx.average_clustering(Large_G))

In [None]:
density = nx.density(GG) 
print('The edge density of the entire network is: ' + str(density))

In [12]:
#Created function to convert genders to 0 and 1
def label_sex(df):
    if df['SEXE'] == 'F' :
         return 1
    elif df['SEXE'] == 'M' :
          return 0

In [13]:
#Creating data frame of gender assignment and Offender (NoUnique)
df_Gender = pd.read_csv(path)
df_Gender.drop(df_Gender.columns[[1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]], axis = 1, inplace = True)
#Applying function to convert M and F designation to 0 and 1
df_Gender['SEXE'] = df_Gender.apply(label_sex, axis=1)

df_Gender.set_index(df_Gender['NoUnique'], inplace=True)
df_Gender.drop(df_Gender.columns[[0]], axis = 1, inplace = True)
df_Gender.head()
#Converting dataframe into dictionary
dict_Gender = df_Gender.to_dict()


In [14]:
#Adding attributes to inital network
nx.set_node_attributes(GG, dict_Gender, 'gender')
#Designating new largest connected component network but with gender attributes
largest_cc_g = max(nx.connected_components(GG), key=len)

Large_G_gender = GG.subgraph(largest_cc_g).copy()

In [15]:
#Calculating modularity on this largest connected component with gender designated as the similarity determination
nx_comm.modularity(Large_G_gender, nx_comm.label_propagation_communities(Large_G_gender))


0.8661153086822764

In [33]:
Bgender = nx.modularity_matrix(Large_G_gender)

In [19]:
#Calculating assortativity of largest component based on gender.
AS_Gender = nx.attribute_assortativity_coefficient(Large_G_gender, 'gender')
print(f"{AS_Gender:3.1f}")


nan


In [23]:
#Municipality Homophily
#Will generate a network with the crime events as the nodes and the offenders as edges 
#Will then assign the municipality as a node attribute of the crime event

#Creating edgelist of offenders just in the Municipality 66023
#Creating data frame of just the Offender ID and Crime Event Case Number
df_Mun_i = df[['NoUnique','SeqE']]
df_Mun_ij = pd.merge(df_Mun_i, df_Mun_i, on='NoUnique')
df_Mun_ij.head()

Unnamed: 0,NoUnique,SeqE_x,SeqE_y
0,1,1085034,1085034
1,2,1431379,1431379
2,4,167174,167174
3,5,1179096,1179096
4,17,1270690,1270690


In [42]:
df_MM = df[['NoUnique','MUN']]

for row in df_MM:
    for col in df_MM:
    

MemoryError: Unable to allocate 100. MiB for an array with shape (13126324,) and data type int64

In [24]:
#Removing all rows where the Crime Event Number in both columns is the same
#Removing all rows where it is just a duplicate of another row with the same case number and offenders
df_ij_filteredMUN = df_Mun_ij.loc[(df_Mun_ij['SeqE_x'] < df_Mun_ij['SeqE_y'])]

#Determining the size of this new filtered dataframe that consists of pairs of crime events 
#and their common offenders
print(df_ij_filteredMUN.shape)
#Counting how many unique crime event case numbers and offenders are included in this data set
print(df_ij_filteredMUN.nunique())
#Counting how many unique crime events are included in this data set
ijcountMUN = pd.unique(df_ij_filteredMUN[['SeqE_x', 'SeqE_y']].values.ravel())
print(len(ijcountMUN))
#Generate a column that includes the weights for the edges
#i.e. count how many common offenders that any two pairs of crime events has in common
edgelistMUN = df_ij_filteredMUN.value_counts(subset=['SeqE_x', 'SeqE_y'])

# converting to df and assigning new names to the columns
edgelist_wMUN = pd.DataFrame(edgelistMUN)
edgelist_wMUN = edgelist_wMUN.reset_index()
edgelist_wMUN.columns = ['SeqE_x', 'SeqE_y', 'weights'] # change column names


print(edgelist_wMUN.head())
print(edgelist_wMUN.shape)

(5923161, 3)
NoUnique    184475
SeqE_x      687081
SeqE_y      690071
dtype: int64
854680
   SeqE_x  SeqE_y  weights
0   27849  872952       30
1   27849  872932       24
2  423073  423093       14
3  423069  423071       14
4  423069  423073       14
(5705200, 3)


In [26]:
GG_mun = nx.from_pandas_edgelist(edgelist_wMUN, source='SeqE_x',target='SeqE_y', edge_attr='weights')


In [28]:
df_muni = pd.read_csv(path)
df_muni.head()

Unnamed: 0,NoUnique,Naissance,SEXE,SeqE,dateInf,NCD1,NCD2,NCD3,NCD4,MUN,ED1,Jeunes,Adultes,Date,annee
0,1,1007,F,1085034,20051217,3530,,,,58227,2.0,0,1,12/17/2005,2005
1,2,1828,F,1431379,20080423,1430,,,,94068,5.0,0,1,04/23/2008,2008
2,4,1889,M,167174,20080306,1430,21702.0,,,49058,2.0,0,1,03/06/2008,2008
3,5,1892,M,1179096,20080821,1420,,,,65005,71.0,0,1,08/21/2008,2008
4,17,1897,M,1270690,20030430,1625,,,,23027,,0,3,04/30/2003,2003


In [29]:
#Creating data frame of municipality and crime event (SeqE)
df_muni.drop(df_muni.columns[[0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14]], axis = 1, inplace = True)


df_muni.set_index(df_muni['SeqE'], inplace=True)
df_muni.drop(df_muni.columns[[0]], axis = 1, inplace = True)
print(df_muni.head())
#Converting dataframe into dictionary
dict_muni = df_muni.to_dict()

           MUN
SeqE          
1085034  58227
1431379  94068
167174   49058
1179096  65005
1270690  23027


In [30]:
#Adding municipality as a node attribute to the network, the nodes are in this case Crime Events
nx.set_node_attributes(GG_mun, dict_muni, 'MUN')

largest_cc_mun = max(nx.connected_components(GG_mun), key=len)

Large_G_mun = GG_mun.subgraph(largest_cc_mun).copy()
print('Number of nodes', Large_G_mun.order())


Number of nodes 120603


In [31]:
nx_comm.modularity(Large_G_mun, nx_comm.label_propagation_communities(Large_G_mun))


0.951118897680136

In [39]:
Bgender.dtype

dtype('float64')

In [41]:
Matrix = edgelist_wMUN.transpose()

MemoryError: Unable to allocate 131. MiB for an array with shape (3, 5705200) and data type int64

In [40]:
ax = sns.heatmap(edgelist_wMUN)


MemoryError: Unable to allocate 131. MiB for an array with shape (3, 5705200) and data type int64