In [23]:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import pandas as pd
from geopy.distance import vincenty
%matplotlib inline

# Initialisation Data frame
### Création d'un data frame vide pour y ajouter ensuite les clusters de chaque zone

In [24]:
dfinal=pd.DataFrame(columns=('BarLong','BarLat','Rayon','Weight','NbDays','Month','Year','Id'))
dfinal.loc[0] =[0,0,0,0,0,0,0,0]

# Lecture des données
### Les données sont séparées en deux data frame par zone : un contenant toute les attaques de la zone et un contenant le reste des attaques

In [25]:
df = pd.read_csv('../data/aden.csv',sep=';')
df2 = df[df["large_area"]!="G"]

dfA = df[df["large_area"]=="A"]
dfA2 = df[df["large_area"]!="A"]

dfI = df[df["large_area"]=="I"]
dfI2 = df[df["large_area"]!="I"]

df=df[df["large_area"]=="G"]

size = len(df)

# Fonctions communes aux trois zones

In [26]:
#inscription du mois au même format qu'il soit composé d'un ou de deux chiffres
def monthCorrect(month):
    month=str(int(month))
    if len(month)==1:
        return '0'+month
    else:
        return month

In [27]:
#construction du dendrogramme
def fancy_dendrogram(*args, **kwargs):
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        plt.title('Hierarchical Clustering Dendrogram (with max distance)')
        plt.xlabel('sample index')
        plt.ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata

In [28]:
#construction des clusters
def clusterise(date, max_d, df,zone):
    #sous ensemble des données correspondant à la date choisie
    clustersId = []
    subDf = df[df["date"]==date]
    subDate = date.split("-")
    #dans le cas où le cluster ne contient qu'un seul élément :
    if(len(subDf) == 1):
        maxIdx=dfinal.index.values.max() 
        dfinal.loc[maxIdx+1] =[subDf["longitude"].values[0], subDf["latitude"].values[0],0,1,1,subDate[1],subDate[0],date+'_'+str(1)+'_'+zone]
        df["id_cluster"].loc[subDf.index.values[0]]=date+'_'+str(1)+'_'+zone
        return dfinal
    
    #construction du dataframe 2D
    X = subDf[["longitude","latitude"]].values
    #clusterisation
    Z = linkage(X, 'ward', metric = 'euclidean')
    #affichage du dendrogram

    #fancy_dendrogram(
    #    Z,
    #    truncate_mode='lastp',
    #    p=12,
    #    leaf_rotation=90.,
    #    leaf_font_size=12.,
    #    show_contracted=True,
    #    max_d=max_d,  # arret du dendrogram
    #)
    #plt.show()
    
    #récuperation des clusters
    from scipy.cluster.hierarchy import fcluster
    clusters = fcluster(Z, max_d, criterion='distance')
      
    for p in range(0,len(subDf.index.values)):
        df["id_cluster"].loc[subDf.index.values[p]]=date+'_'+str(clusters[p])+'_'+zone
        #print subDf.index.values[clusters==1]
        #print subDf["jour (0)"][subDf.index.values[clusters==1]]
    
    #calcul du barycentre de chaque cluster
    centroidX = []
    centroidY = []
    for i in range(min(clusters),max(clusters)+1):
        x = np.mean(X[clusters==i,0])
        y = np.mean(X[clusters==i,1])
        point = (x,y)
        dist = []
        for j in range(0,len(X[clusters==i,0])):
            point2=(X[clusters==i,0][j], X[clusters==i,1][j])
            dist.append(vincenty(point, point2).kilometers)
        centroidX.append(x)
        centroidY.append(y)

        maxJ = max(subDf["jour (0)"][subDf.index.values[clusters==i]])
        minJ = min(subDf["jour (0)"][subDf.index.values[clusters==i]])
        
        nbDay = maxJ - minJ        
        maxIdx=dfinal.index.values.max() 
        dfinal.loc[maxIdx+1] =[x, y, max(dist),len(clusters[clusters==i]),nbDay,subDate[1], subDate[0],date+'_'+str(i)+'_'+zone]

    #affichage des clusters
    #plt.scatter(X[:,0], X[:,1], c=clusters, cmap='prism')  # plot points with cluster dependent colors
    #affichage des barycentres
    #plt.scatter(centroidX,centroidY,cmap='prism')
    #plt.title('Clusters and their centroid')
    #plt.xlabel('Longitude')
    #plt.ylabel('Latitude')
    #plt.show()

## Zone G

In [29]:
# récupération des dates
df["mois (0)"]=df.apply(lambda row: monthCorrect(row['mois (0)']), axis=1)
df["date"]=df.apply(lambda row: str(row["Annee (0)"]) +"-"+ str(row["mois (0)"]), axis=1)

In [30]:
df["date"].unique()

array(['2010-10', '2010-01', '2010-02', '2010-03', '2010-05', '2010-06',
       '2010-07', '2010-08', '2010-09', '2010-11', '2010-04', '2010-12',
       '2008-02', '2008-03', '2008-04', '2008-05', '2008-07', '2008-08',
       '2008-09', '2008-10', '2008-11', '2008-12', '2008-06', '2009-01',
       '2009-02', '2009-03', '2009-04', '2009-05', '2009-07', '2009-12',
       '2009-06', '2009-08', '2009-09', '2009-10', '2009-11', '2011-01',
       '2011-04', '2011-09', '2011-02', '2011-05', '2011-07', '2011-08',
       '2011-10', '2011-03', '2011-11', '2011-06', '2011-12', '2012-02',
       '2012-12', '2012-03', '2012-04', '2012-01', '2012-06', '2012-05',
       '2012-07', '2013-03', '2013-06', '2013-05', '2013-07', '2013-12',
       '2014-01', '2014-04', '2014-02', '2014-03', '2014-08', '2014-12'], dtype=object)

In [31]:
# génération des clusters
df["id_cluster"] = ""
for i in range(0,len(df["date"].unique())):
    clusterise(df["date"].unique()[i], 4.5,df,"G")

In [32]:
dfinal.head()

Unnamed: 0,BarLong,BarLat,Rayon,Weight,NbDays,Month,Year,Id
0,0.0,0.0,0.0,0,0,0,0,0
1,49.162986,13.191875,47.973162,4,25,10,2010,2010-10_1_G
2,55.191667,13.616667,128.60897,2,2,10,2010,2010-10_2_G
3,47.850833,12.711389,96.305994,3,16,1,2010,2010-01_1_G
4,47.908333,13.033333,93.623201,2,2,2,2010,2010-02_1_G


In [33]:
frames = [df, df2]

result = pd.concat(frames)
print result.head()
result.to_csv("../data/aden.csv",sep=';', index=False)

     Kidnapping   meurtre  Annee (0)  Guns  Knives   Nombre P  Min  \
23            0         0       2010     0        0             -1   
25            0         0       2010     0        0             -1   
42            1         0       2010     0        0             -1   
43            1         0       2010     0        0             -1   
44            0         0       2010     1        0              6   

    Nombre P  max  RPG  Vol �� bord  bless�es ...  mousson  \
23             -1    0            0         0 ...        0   
25             -1    0            0         0 ...        0   
42             -1    0            0         0 ...        0   
43             -1    0            0         1 ...        0   
44              6    0            0         0 ...        0   

                                        newDescriptif         new_type  \
23                           pirates hijacked a dhow.            Other   
25  armed pirates attacked and boarded the ship un...  Veh

## Zone A

In [34]:
dfA["mois (0)"]=dfA.apply(lambda row: monthCorrect(row['mois (0)']), axis=1)
dfA["date"]=dfA.apply(lambda row: str(row["Annee (0)"]) +"-"+ str(row["mois (0)"]), axis=1)
dfA["date"].unique()
print dfA["date"].unique()

['2010-01' '2010-03' '2010-04' '2010-05' '2010-10' '2010-11' '2010-12'
 '2010-06' '2010-02' '2008-02' '2008-03' '2008-04' '2009-12' '2009-06'
 '2009-04' '2009-07' '2009-09' '2009-10' '2011-03' '2011-01' '2011-02'
 '2011-04' '2011-05' '2011-10' '2011-12' '2011-06' '2011-09' '2011-11'
 '2012-02' '2012-03' '2012-04' '2012-05' '2012-06' '2012-12' '2012-01'
 '2013-01' '2013-06' '2013-07' '2013-10' '2013-11' '2013-12' '2014-07'
 '2014-09' '2014-10' '2014-03']


In [35]:
dfA["id_cluster"] = ""
for i in range(0,len(dfA["date"].unique())):
    clusterise(dfA["date"].unique()[i], 6.9,dfA,"A")

In [36]:
frames = [dfA, dfA2]

result = pd.concat(frames)
print result.head()

    Kidnapping   meurtre  Annee (0)  Guns  Knives   Nombre P  Min  \
0            1         0       2010     0        0             -1   
3            1         0       2010     0        0             -1   
4            1         0       2010     0        0             -1   
5            1         0       2010     0        0             -1   
7            0         0       2010     0        0             -1   

   Nombre P  max  RPG  Vol �� bord  bless�es ...  mousson  \
0             -1    0            0         0 ...        0   
3             -1    0            0         0 ...        0   
4             -1    0            0         0 ...        0   
5             -1    0            0         0 ...        0   
7             -1    0            0         0 ...        0   

                                       newDescriptif         new_type  \
0  pirates hijacked a ship underway and took her ...  Vehicle Carrier   
3  armed pirates attacked, boarded and hijacked t...  Vehicle Carrier   

## Zone I

In [37]:
dfI["mois (0)"]=dfI.apply(lambda row: monthCorrect(row['mois (0)']), axis=1)
dfI["date"]=dfI.apply(lambda row: str(row["Annee (0)"]) +"-"+ str(row["mois (0)"]), axis=1)
dfI["date"].unique()

array(['2010-03', '2010-04', '2010-05', '2010-09', '2010-10', '2010-11',
       '2010-12', '2010-08', '2008-08', '2008-09', '2008-04', '2008-05',
       '2008-11', '2008-02', '2008-03', '2008-07', '2008-10', '2008-12',
       '2009-05', '2009-12', '2009-03', '2009-04', '2009-10', '2009-11',
       '2009-01', '2009-02', '2009-08', '2009-09', '2011-03', '2011-05',
       '2011-01', '2011-04', '2011-06', '2011-09', '2011-10', '2011-11',
       '2011-07', '2011-08', '2011-02', '2012-11', '2012-03', '2012-05',
       '2012-10', '2012-02', '2012-01', '2013-02', '2013-03', '2013-05',
       '2013-01', '2013-04', '2013-10', '2013-11', '2014-03', '2014-01',
       '2014-04', '2014-02'], dtype=object)

In [38]:
dfI["id_cluster"] = ""
for i in range(0,len(dfI["date"].unique())):
    clusterise(dfI["date"].unique()[i], 7.6, dfI,"I")

In [39]:
frames = [dfI, dfI2]

result = pd.concat(frames)
print result.head()

    Kidnapping   meurtre  Annee (0)  Guns  Knives   Nombre P  Min  \
1            1         0       2010     1        0             11   
2            1         0       2010     0        0             -1   
6            1         0       2010     0        0             -1   
8            0         0       2010     1        0             -1   
9            0         0       2010     0        0             -1   

   Nombre P  max  RPG  Vol �� bord  bless�es ...  mousson  \
1             11    1            1         1 ...        0   
2             -1    0            0         0 ...        0   
6             -1    0            0         0 ...        0   
8             -1    1            0         0 ...        0   
9             -1    0            0         0 ...        0   

                                       newDescriptif         new_type  \
1  eleven pirates in three skiffs armed with guns...           Vessel   
2  armed pirates attacked and hijacked a tanker u...           Tanker   

# Inscription dans le fichier

In [40]:
dfinal.to_csv("../data/aden_golfe_matrix_cluster.csv",sep=';', encoding='utf-8', index=False)