In [58]:
"""

    * Pandas is a software library written for the Python programming language for data manipulation and analysis.
    * Matplotlib & seaborn are plotting libraries.
    * NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays 
      and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
    * csv is a library which helps to process csv files.
    * collections is a standard-library module which provides additional container data structures (e.g. defaultdict).

"""

# Importing libraries for analyzing the annotated SATP news reports

import pandas as pd
import numpy as np
import csv
import collections


In [59]:
# Names of the event-component columns of the annotated data. Each of these
# columns is later parsed from a bracketed string into a list of labels.

EVENTS = ['target', 'source', 'action', 'location', 'date', 'action_type']

In [60]:
"""

    This function converts the bracketed string in each row of the column into a list and also fills in the null values
    
    INPUT - df - Dataframe
            col - Name of the column. The column is a string type column
    

"""
def process_column(df, col):
    """Turn the bracketed string column `col` of `df` into lists, in place.

    Every null in the whole frame (not only in `col`) is first replaced by
    the placeholder "NotSpecified". Then the leading/trailing square brackets
    are stripped from each value of `col` and the remainder is split on
    commas. The resulting items are not whitespace-trimmed here.

    INPUT - df - Dataframe (modified in place, nothing is returned)
            col - Name of the column. The column is a string type column
    """
    df.fillna(value="NotSpecified", inplace=True)
    without_brackets = df[col].str.strip('[]')
    df[col] = without_brackets.str.split(',')
    
"""

    This function processes the text and gets rid of the quote characters and the extra spaces around the labels
    
    Input - Array of Strings
    
    Example - Input - ['  ArmedAssault', 'ArmedAssault  ']
              Output - ['ArmedAssault', 'ArmedAssault']
    
    

"""
def proceess_text(data):
    """Clean every label of `data` in place.

    Double quotes and single quotes are removed from each string, then the
    leading and trailing whitespace is stripped. Nothing is returned.

    Input - Array of Strings

    Example - Input - ['  ArmedAssault', '"ArmedAssault"  ']
              After the call - ['ArmedAssault', 'ArmedAssault']
    """
    quote_chars = ('"', '\'')
    for pos in range(len(data)):
        # Drop both kinds of quote characters in a single pass.
        unquoted = ''.join(ch for ch in data[pos] if ch not in quote_chars)
        data[pos] = unquoted.strip()

"""

    This function retrieves the distinct labels from each row of a particular column
    
    INPUT - Column Values
    
    Example - Input - ['ArmedAssault', 'ArmedAssault']
              Output - ['ArmedAssault']

"""      
        
def get_unique(data):
    """Clean and de-duplicate the labels of every row of a column, in place.

    INPUT - data - Column values: an integer-indexable container (list or
                   Series with positions 0..len-1 as labels) whose items are
                   lists of strings.

    Each row is cleaned with `proceess_text` and then reduced to its distinct
    labels, keeping the order in which the labels first appear.

    Example - Input - [['ArmedAssault', ' ArmedAssault'], ['b', 'a', 'b']]
              Output - [['ArmedAssault'], ['b', 'a']]

    Returns `data` itself, so a caller can write
    `df[col] = get_unique(df[col])` to store the result explicitly.
    """
    for i in range(len(data)):
        labels = data[i]
        proceess_text(labels)
        # dict.fromkeys keeps first-seen order. list(set(...)) ordered the
        # labels by their randomized string hashes, so the same input could
        # give differently ordered rows after every kernel restart.
        data[i] = list(dict.fromkeys(labels))
    return data
    
    
    

In [61]:
import itertools
from collections import defaultdict

"""
    
    This function takes a nested list of strings and flattens the entire array. For example,
    [['kidnapped', 'killed'], ['injured']] becomes ['kidnapped', 'killed', 'injured']. The reason for doing
    this is that, for analysis purposes, we don't want any duplication; flattening gets rid of the redundancy.
    
    This function can also restrict the result to the Top K most frequent labels of the given column (K <= 0 keeps every label).

"""

def flatten_list(data, K):
    """Flatten a nested list of labels, optionally keeping only the top-K labels.

    INPUT - data - iterable of iterables of hashable labels
            K - number of most frequent labels to keep; K <= 0 keeps all

    Labels are ranked by how often they occur in the flattened list, with
    ties resolved in favour of the label that was seen first. The returned
    list contains every occurrence of the kept labels in their original order.
    """
    flat = list(itertools.chain.from_iterable(data))

    # Count occurrences; insertion order of the dict is first-seen order.
    tally = defaultdict(int)
    for label in flat:
        tally[label] += 1

    # Stable sort, so equally frequent labels keep their first-seen order.
    ranked = sorted(tally, key=lambda label: -tally[label])
    if K > 0:
        ranked = ranked[:K]

    wanted = set(ranked)
    return [label for label in flat if label in wanted]

In [62]:
# Importing data from csv file

############## First Round Data ##################
p1 = pd.read_csv('Data/p1.csv', header=[0])
p2 = pd.read_csv('Data/p2.csv', header=[0])
p3 = pd.read_csv('Data/p3.csv', header=[0])

############## Second Round Data ##################
p4 = pd.read_csv('Data/p4.csv', header=[0])
p5 = pd.read_csv('Data/p5.csv', header=[0])
p6 = pd.read_csv('Data/p6.csv', header=[0])

############## Third Round Data ##################
p7 = pd.read_csv('Data/p7.csv', header=[0])
p8 = pd.read_csv('Data/p8.csv', header=[0])
p9 = pd.read_csv('Data/p9.csv', header=[0])

# Stack all rounds into one master frame with a fresh 0..n-1 index
# (get_unique below addresses rows by these integer positions).
df = pd.concat([p1, p2, p3, p4, p5, p6, p7, p8, p9], ignore_index=True)

# Process the master frame: parse every event column into a list of labels,
# then clean and de-duplicate the labels of each row.
for i in EVENTS:
    process_column(df, i)
    # get_unique() mutates the Series it is given. Passing df[i] directly
    # triggers pandas' SettingWithCopyWarning and, with copy-on-write enabled,
    # would leave df unchanged. Clean an explicit copy and store it back.
    labels = df[i].copy()
    get_unique(labels)
    df[i] = labels

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [63]:
# Preview the first rows of the processed master frame.
df.head()

Unnamed: 0,id,is_relevant,target,source,action,action_type,location,date,country,year,news
0,32355,True,"[Rangers, Military Police]","[militants, Lashkar e Jhangvi LeJ]","[gunned, killed, killing]",[Armed Assault],[Karachi],[December 6],pakistan,2016,Two Lashkar e Jhangvi LeJ militants Asim alias...
1,32398,True,"[Security Forces, civilians, SFs]",[NotSpecified],"[killed, killing, fighting, wounded]",[Unknown],[NotSpecified],[December 20],pakistan,2016,The Commander Southern Command Lieutenant Gene...
2,29171,True,"[Nagas, Naga]","[MNPF, Manipur Naga Peopleâ€™s Front]","[attack, bomb, explosion, exploded]",[Bombing/Explosion],[NotSpecified],[August 30],india,2016,Meanwhile the underground organization Manipur...
3,25445,True,"[Artisan Restaurant, hostages, foreigners]","[suicide fighters, suicide soldiers, attackers]","[gun battle, killing, injuring]",[Armed Assault],[Dhaka],"[27th, August]",bangladesh,2016,Islamic State IS in the latest issue of its on...
4,31739,True,"[MQM, Muttahida Qaumi Movement]",[NotSpecified],[shot],[Assassination],"[Karachi, Naeemabad, Sindh, Korangi]",[June 19],pakistan,2016,A senior Muttahida Qaumi Movement MQM worker i...


## Unique List of Actions

In [64]:
# One entry per distinct action label: explode() puts every list item on its
# own row, unique() drops the repeats and np.sort orders them.
action_stem = np.sort(df['action'].explode().unique())
action = pd.DataFrame({'action':action_stem})
# Number of distinct action labels (rows) in the master data.
action.shape
# action.to_csv('action.csv', index = False, header = False)

(227, 1)

In [65]:
# Load the action grouping table. Its first two columns are copied into the
# explicitly named columns 'Original' and 'GroupName', which are later used as
# the key and the value of action_map.
# NOTE(review): read_csv treats the first line of the file as a header here -
# confirm that Data/action.csv really starts with a header row.
actions = pd.read_csv('Data/action.csv')
actions['Original'] = actions.iloc[:, 0]
actions['GroupName'] = actions.iloc[:, 1]
# Keep only the first row for every 'Original' label.
actions = actions.drop_duplicates('Original')
actions.shape

(192, 4)

In [33]:
def get_new_events(previous_event, total_event):
    """Return the items of `total_event` that do not occur in `previous_event`.

    INPUT - previous_event - iterable of already known labels
            total_event - iterable of labels to check

    The result is a list in the order of `total_event`. An unknown label that
    appears several times in `total_event` is returned several times.
    """
    known = set(previous_event)
    return [event for event in total_event if event not in known]


# def get_list(prev, new):
#     present = set()
#     not_present = []
#     for i in prev:
#         present.add(i)
#     for j in new:
#         if j not in present:
#             not_present.append(j)
#     return not_present

# newList = get_list(actions['Original'], action['action'])
# print(newList)

## Unique List of Targets

In [78]:
# Load the target grouping table. Its first two columns are copied into the
# explicitly named columns 'Original' and 'GroupName', which are later used as
# the key and the value of target_map.
# NOTE(review): read_csv treats the first line of the file as a header here -
# confirm that Data/target.csv really starts with a header row.
targets = pd.read_csv('Data/target.csv')
# Bare expression in the middle of the cell: it has no effect and is not displayed.
targets
targets['Original'] = targets.iloc[:, 0]
targets['GroupName'] = targets.iloc[:, 1]
# targets = targets.drop([0, 1], axis=1)
# Keep only the first row for every 'Original' label.
targets = targets.drop_duplicates('Original')

In [79]:
# One entry per distinct target label of the master frame, in order of first
# appearance (unlike the action and source lists, this one is not sorted).
target_stem = (df['target'].explode().unique())
target = pd.DataFrame({'target':target_stem})
# target.to_csv('target.csv', index = False, header = False)

In [80]:
# Target labels that occur in the master frame but not yet in the grouping table.
newList = get_new_events(targets['Original'], target['target'])
# val = [" " for i in range(len(newList))]
new_df = pd.DataFrame({'Original':newList})
# new_df only has an 'Original' column, so the appended rows get a missing
# GroupName. sort=False states the column handling explicitly instead of
# relying on the pandas default, which changed between versions and produced a
# FutureWarning. The column order does not matter because the two columns are
# selected by name right below.
targets = pd.concat([targets, new_df], ignore_index=True, sort=False)
targets = targets.loc[:, ['Original', 'GroupName']]
# targets.to_csv('target.csv', index = False, header = False)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


## Unique List of Source

In [68]:
# Load the source grouping table. Its first two columns are copied into the
# explicitly named columns 'Original' and 'GroupName', which are later used as
# the key and the value of source_map.
# NOTE(review): read_csv treats the first line of the file as a header here -
# confirm that Data/source.csv really starts with a header row.
source = pd.read_csv('Data/source.csv')
source['Original'] =  source.iloc[:, 0]
source['GroupName'] = source.iloc[:, 1]
# Keep only the first row for every 'Original' label.
source = source.drop_duplicates('Original')

In [69]:
# Sorted array of the distinct source labels of the master frame.
# Note the naming: `sources` (plural) holds the labels found in the data, while
# `source` (singular) is the grouping table loaded from Data/source.csv.
source_stem = np.sort(df['source'].explode().unique())
sources = pd.DataFrame({'source':source_stem})
# source.to_csv('source.csv', index = False, header = False)

In [70]:
# Source labels that occur in the master frame but not yet in the grouping table.
newList = get_new_events(source['Original'], sources['source'])
# val = [" " for i in range(len(newList))]
new_df = pd.DataFrame({'Original':newList})
# new_df only has an 'Original' column, so the appended rows get a missing
# GroupName. sort=False states the column handling explicitly instead of
# relying on the pandas default, which changed between versions and produced a
# FutureWarning. The column order does not matter because the two columns are
# selected by name right below.
source = pd.concat([source, new_df], ignore_index=True, sort=False)
source = source.loc[:, ['Original', 'GroupName']]
# source.to_csv('source.csv', index = False, header = False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


# Group the Source & Target in Master Data

In [71]:
# Lookup tables from a label ('Original', whitespace-stripped) to its group
# name ('GroupName'), one per event type. They are filled further below.
action_map = {}
target_map = {}
source_map = {}

def create_map(key, val, event_map):
    """Store `val` in `event_map` under `key` with surrounding whitespace removed.

    INPUT - key - label string (must support .strip())
            val - value to associate with the label
            event_map - dict that is updated in place

    An existing entry for the same stripped key is overwritten.
    """
    event_map[key.strip()] = val
# Fill each lookup table from its grouping table: 'Original' is the key and
# 'GroupName' the value. Rows without a GroupName are stored with a missing value.
for index, row in targets.iterrows():
    create_map(row['Original'], row['GroupName'], target_map)
for index, row in actions.iterrows():
    create_map(row['Original'], row['GroupName'], action_map)
for index, row in source.iterrows():
    create_map(row['Original'], row['GroupName'], source_map)

In [72]:
def target_group_name(row, mapping=None):
    """Map every target label of one row to its group name.

    INPUT - row - sequence of target labels
            mapping - optional dict from label to group name; when omitted
                      the module-level `target_map` is used

    Returns a new list with one entry per label, in the same order. A label
    keeps its own text when it has no usable group: either it is missing from
    the mapping, or its group name is null. The second case occurs for labels
    that were appended to the grouping table without a GroupName; previously
    such labels were replaced by NaN.
    """
    if mapping is None:
        mapping = target_map
    group_list = []
    for label in row:
        group = mapping.get(label)
        # None (label not in the mapping) and NaN (blank GroupName) both fall
        # back to the label itself.
        group_list.append(label if pd.isna(group) else group)
    return group_list
# Add a 'target_group' column: the group name of every target label of the row.
df['target_group'] = df['target'].apply(target_group_name)
df.head(5)

Unnamed: 0,id,is_relevant,target,source,action,action_type,location,date,country,year,news,target_group
0,32355,True,"[Rangers, Military Police]","[militants, Lashkar e Jhangvi LeJ]","[gunned, killed, killing]",[Armed Assault],[Karachi],[December 6],pakistan,2016,Two Lashkar e Jhangvi LeJ militants Asim alias...,"[Military, Military]"
1,32398,True,"[Security Forces, civilians, SFs]",[NotSpecified],"[killed, killing, fighting, wounded]",[Unknown],[NotSpecified],[December 20],pakistan,2016,The Commander Southern Command Lieutenant Gene...,"[Military, Private Citizen and Property, Milit..."
2,29171,True,"[Nagas, Naga]","[MNPF, Manipur Naga Peopleâ€™s Front]","[attack, bomb, explosion, exploded]",[Bombing/Explosion],[NotSpecified],[August 30],india,2016,Meanwhile the underground organization Manipur...,"[Naga National Council, Naga National Council]"
3,25445,True,"[Artisan Restaurant, hostages, foreigners]","[suicide fighters, suicide soldiers, attackers]","[gun battle, killing, injuring]",[Armed Assault],[Dhaka],"[27th, August]",bangladesh,2016,Islamic State IS in the latest issue of its on...,"[Business, Private Citizen and Property, Priva..."
4,31739,True,"[MQM, Muttahida Qaumi Movement]",[NotSpecified],[shot],[Assassination],"[Karachi, Naeemabad, Sindh, Korangi]",[June 19],pakistan,2016,A senior Muttahida Qaumi Movement MQM worker i...,"[Muttahida Qaumi Movement, Muttahida Qaumi Mov..."


In [73]:
def action_group_name(row, mapping=None):
    """Map every action label of one row to its group name.

    INPUT - row - sequence of action labels
            mapping - optional dict from label to group name; when omitted
                      the module-level `action_map` is used

    Returns a new list with one entry per label, in the same order. A label
    keeps its own text when it has no usable group: either it is missing from
    the mapping, or its group name is null (a blank GroupName in the grouping
    table), which previously produced NaN in the result.
    """
    if mapping is None:
        mapping = action_map
    group_list = []
    for label in row:
        group = mapping.get(label)
        # None (label not in the mapping) and NaN (blank GroupName) both fall
        # back to the label itself.
        group_list.append(label if pd.isna(group) else group)
    return group_list
# Add an 'action_group' column: the group name of every action label of the row.
df['action_group'] = df['action'].apply(action_group_name)
df.head(5)

Unnamed: 0,id,is_relevant,target,source,action,action_type,location,date,country,year,news,target_group,action_group
0,32355,True,"[Rangers, Military Police]","[militants, Lashkar e Jhangvi LeJ]","[gunned, killed, killing]",[Armed Assault],[Karachi],[December 6],pakistan,2016,Two Lashkar e Jhangvi LeJ militants Asim alias...,"[Military, Military]","[gun attack, killed, killed]"
1,32398,True,"[Security Forces, civilians, SFs]",[NotSpecified],"[killed, killing, fighting, wounded]",[Unknown],[NotSpecified],[December 20],pakistan,2016,The Commander Southern Command Lieutenant Gene...,"[Military, Private Citizen and Property, Milit...","[killed, killed, fighting, wounded]"
2,29171,True,"[Nagas, Naga]","[MNPF, Manipur Naga Peopleâ€™s Front]","[attack, bomb, explosion, exploded]",[Bombing/Explosion],[NotSpecified],[August 30],india,2016,Meanwhile the underground organization Manipur...,"[Naga National Council, Naga National Council]","[attack, bomb, exploded, exploded]"
3,25445,True,"[Artisan Restaurant, hostages, foreigners]","[suicide fighters, suicide soldiers, attackers]","[gun battle, killing, injuring]",[Armed Assault],[Dhaka],"[27th, August]",bangladesh,2016,Islamic State IS in the latest issue of its on...,"[Business, Private Citizen and Property, Priva...","[gun attack, killed, injured]"
4,31739,True,"[MQM, Muttahida Qaumi Movement]",[NotSpecified],[shot],[Assassination],"[Karachi, Naeemabad, Sindh, Korangi]",[June 19],pakistan,2016,A senior Muttahida Qaumi Movement MQM worker i...,"[Muttahida Qaumi Movement, Muttahida Qaumi Mov...",[shot]


In [74]:
def source_group_name(row, mapping=None):
    """Map every source label of one row to its group name.

    INPUT - row - sequence of source labels
            mapping - optional dict from label to group name; when omitted
                      the module-level `source_map` is used

    Returns a new list with one entry per label, in the same order. A label
    keeps its own text when it has no usable group: either it is missing from
    the mapping, or its group name is null. The second case occurs for labels
    that were appended to the grouping table without a GroupName; previously
    such labels were replaced by NaN.
    """
    if mapping is None:
        mapping = source_map
    group_list = []
    for label in row:
        group = mapping.get(label)
        # None (label not in the mapping) and NaN (blank GroupName) both fall
        # back to the label itself.
        group_list.append(label if pd.isna(group) else group)
    return group_list
# Add a 'source_group' column: the group name of every source label of the row.
df['source_group'] = df['source'].apply(source_group_name)
df.head(5)

Unnamed: 0,id,is_relevant,target,source,action,action_type,location,date,country,year,news,target_group,action_group,source_group
0,32355,True,"[Rangers, Military Police]","[militants, Lashkar e Jhangvi LeJ]","[gunned, killed, killing]",[Armed Assault],[Karachi],[December 6],pakistan,2016,Two Lashkar e Jhangvi LeJ militants Asim alias...,"[Military, Military]","[gun attack, killed, killed]","[Attackers, Lashkar e Jhangvi]"
1,32398,True,"[Security Forces, civilians, SFs]",[NotSpecified],"[killed, killing, fighting, wounded]",[Unknown],[NotSpecified],[December 20],pakistan,2016,The Commander Southern Command Lieutenant Gene...,"[Military, Private Citizen and Property, Milit...","[killed, killed, fighting, wounded]",[Unknown Source]
2,29171,True,"[Nagas, Naga]","[MNPF, Manipur Naga Peopleâ€™s Front]","[attack, bomb, explosion, exploded]",[Bombing/Explosion],[NotSpecified],[August 30],india,2016,Meanwhile the underground organization Manipur...,"[Naga National Council, Naga National Council]","[attack, bomb, exploded, exploded]","[Manipur Naga Peoples Front, Manipur Naga Peop..."
3,25445,True,"[Artisan Restaurant, hostages, foreigners]","[suicide fighters, suicide soldiers, attackers]","[gun battle, killing, injuring]",[Armed Assault],[Dhaka],"[27th, August]",bangladesh,2016,Islamic State IS in the latest issue of its on...,"[Business, Private Citizen and Property, Priva...","[gun attack, killed, injured]","[Attackers, Attackers, Attackers]"
4,31739,True,"[MQM, Muttahida Qaumi Movement]",[NotSpecified],[shot],[Assassination],"[Karachi, Naeemabad, Sindh, Korangi]",[June 19],pakistan,2016,A senior Muttahida Qaumi Movement MQM worker i...,"[Muttahida Qaumi Movement, Muttahida Qaumi Mov...",[shot],[Unknown Source]


In [82]:
# Save the master frame, including the derived *_group columns, to master.csv
# with a header row and without the index column.
df.to_csv('master.csv', index = False, header = True)

In [83]:
# (rows, columns) of the master frame that was saved.
df.shape

(4500, 14)

# Other Codes

In [81]:
# action_stem = np.sort(df['source'].explode().unique())
# action = pd.DataFrame({'source':action_stem})
# action.shape
# action.to_csv('source.csv', index = False, header = False)

# key = []
# val = []
# for k, v in source_map.items():
#     key.append(k)
#     val.append(v)

# sd = pd.DataFrame({'k':key, 'v':val})
# sd.to_csv('source.csv', index = False, header = False)

# action_stem = np.sort(df['target'].explode().unique())
# action = pd.DataFrame({'target':action_stem})
# action.to_csv('target.csv', index = False, header = False)
# action.shape