In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt

In [5]:
# Name of the spaCy pipeline package to load; NER is called on the book text below.
SPACY_MODEL_NAME = "en_core_web_sm"
NER = spacy.load(SPACY_MODEL_NAME)

## Load Book

In [20]:
import os

# Read the whole book inside a context manager so the file handle is closed
# as soon as the text is in memory instead of being left open.
with open("data/MartinLings-Muhammad.txt", "r", encoding="utf8") as book_file:
    book = book_file.read()

# Run the loaded spaCy pipeline over the full text; sentences and named
# entities are read from the resulting document in later cells.
book_doc = NER(book)

In [28]:
# Render the entities spaCy identified in a 2,000-token excerpt of the book
# (tokens 34000 up to, but not including, 36000).
preview_span = book_doc[34000:36000]
displacy.render(preview_span, style="ent", jupyter=True)

### Scholar Names

In [31]:
# Load the three CSV parts that together hold the list of scholar names.
name_1, name_2, name_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/character/Recipe_ori__242116643_1.csv',
        'data/character/Recipe_ori__242116643_2.csv',
        'data/character/Recipe_ori__242116643_3.csv',
    )
)

In [38]:
# Stack the three parts into one frame with a fresh 0..n-1 index and give the
# name column a descriptive label.
scholars_name = (
    pd.concat([name_1, name_2, name_3], ignore_index=True)
    .rename(columns={'Column 1' : 'name'})
)
scholars_name

Unnamed: 0,name
0,al-Zbrqan bin al-Harith al-Nmyry
1,al-Zbrqan bin Bashyr bin 'Amr
2,Znkl Shykh
3,Znkl bin Ala bin Mhjn
4,Zrarh bin Mus'ab al-Shny
...,...
25243,Zakaria Shykh
25244,Zakaria bin 'Umar
25245,Zufar bin Abi Kathir al-Shaymi al-Hanfi
25246,Za'id'h bin Aws al-Kndy


In [41]:
import re

# Any text enclosed in parentheses, matched non-greedily. A raw string is used
# so that "\(" and "\)" are not treated as (invalid) Python string escapes.
PARENTHETICAL = re.compile(r"\(.*?\)")

# Clean the names with vectorised string methods (missing values stay missing):
#   1. drop parenthesised text,
#   2. collapse the whitespace runs that the removal leaves behind,
#   3. strip leading/trailing whitespace so exact matching against entity text
#      works and the first token below is never an empty string.
scholars_name['name'] = (
    scholars_name['name']
    .str.replace(PARENTHETICAL, "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# The first space-separated token of each cleaned name.
scholars_name['firstname'] = scholars_name['name'].str.split(' ', n=1).str[0]
scholars_name

Unnamed: 0,name,firstname
0,al-Zbrqan bin al-Harith al-Nmyry,al-Zbrqan
1,al-Zbrqan bin Bashyr bin 'Amr,al-Zbrqan
2,Znkl Shykh,Znkl
3,Znkl bin Ala bin Mhjn,Znkl
4,Zrarh bin Mus'ab al-Shny,Zrarh
...,...,...
25243,Zakaria Shykh,Zakaria
25244,Zakaria bin 'Umar,Zakaria
25245,Zufar bin Abi Kathir al-Shaymi al-Hanfi,Zufar
25246,Za'id'h bin Aws al-Kndy,Za'id'h


In [23]:
# Build one record per sentence of the parsed book: the sentence itself plus
# the text of every named entity found inside it.
sentence_records = [
    {'Sentence': sentence, 'entities': [entity.text for entity in sentence.ents]}
    for sentence in book_doc.sents
]

sent_entity_df = pd.DataFrame(sentence_records)

In [24]:
# Display the per-sentence entity table (one row per sentence of book_doc).
sent_entity_df

Unnamed: 0,Sentence,entities
0,"(MUHAMMAD, \n\n, his, life, based, on, the, ea...","[II, 4, Quraysh, Hollow, 6, The Vow to Sacrifi..."
1,"(2, 3, \n\n, IX, \n\n, Two, Bereavements, \n\n...","[2 3, Two, 27, Bahlra the Monk, 29, 3i, 33, 37]"
2,"(The, Rebuilding, of, the, Ka’bah, \n\n, 4i, \...","[The Rebuilding of the Ka’bah, 4i, 43, 46, War..."
3,"(50, \n\n, XVIII, \n\n, Quraysh, Take, Action,...",[50]
4,"(5, 2, \n\n, XIX, \n\n, Aws, and, Khazraj, \n\...","[5 2, 56, XX, Abu Jahl, Hamzah, 58]"
...,...,...
7503,"(They, still, had, in, their, ears, the, sound...",[]
7504,"(Your, tryst, with, me, is, at, the, \n, Pool,...",[]
7505,"(Having, delivered, his, message, in, this, wo...","[Hereafter, the Key of Mercy, 3, Paradise, the..."
7506,"(Verily, God, and, His, angels, whelm, in, ble...",[]


In [42]:
# Function to filter out non-character entities
def filter_entity(ent_list, scholars_name):
    """Keep only the entities that match a known scholar name.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts to filter.
    scholars_name : object with ``name`` and ``firstname`` attributes
        Typically the scholars DataFrame; both attributes must be iterable
        collections of names.

    Returns
    -------
    list of str
        The entries of ``ent_list`` that equal a value in ``name`` or in
        ``firstname``, in their original order (duplicates are kept).
    """
    # Build the lookup once per call. Membership tests against a set are
    # constant-time, whereas rebuilding and scanning two lists for every
    # entity costs time proportional to the number of names per entity.
    known_names = set(scholars_name.name) | set(scholars_name.firstname)
    return [ent for ent in ent_list if ent in known_names]

In [43]:
# Quick sanity check of filter_entity on a small hand-written entity list.
sample_entities = ["Muhammad", "Ali", "Izzham"]
filter_entity(sample_entities, scholars_name)

['Muhammad', 'Ali']

In [44]:
# Keep, for every sentence, only the entities that match a known scholar name.
sent_entity_df['scholars_entities'] = sent_entity_df['entities'].apply(
    lambda x: filter_entity(x, scholars_name)
)

# filter out sentences that don't have any character entities
# The explicit .copy() makes the result an independent frame, so assigning to
# its columns later does not raise pandas' SettingWithCopyWarning.
sent_entity_df_filtered = sent_entity_df[
    sent_entity_df['scholars_entities'].map(len) > 0
].copy()
sent_entity_df_filtered

Unnamed: 0,Sentence,entities,scholars_entities
0,"(MUHAMMAD, \n\n, his, life, based, on, the, ea...","[II, 4, Quraysh, Hollow, 6, The Vow to Sacrifi...",[Quraysh]
5,"(60, \n\n, XXII, \n\n, Leaders, of, Quraysh, \...","[60, Quraysh, 64, XXIII \n\nWonderment, 67, 70...",[Quraysh]
16,"(The, Succession, and, the, Burial, \n\n, 342,...","[342, Steven W. Johnson, 346, Quraysh, Hollow,...",[Quraysh]
36,"(2, Muhammad, \n\n\n, which, must, not, flow, ...","[2, Muhammad, Hagar, His Angels]",[Muhammad]
64,"(“, It, descended, from, Paradise, whiter, \n,...","[Paradise, Adam]",[Adam]
...,...,...,...
7464,"(344, Muhammad, \n\n, bidding, them, pledge, t...","[344, Muhammad, Abu Bakr, second, two, 1]",[Muhammad]
7465,"(A, recent, Revelation, had, recalled, the, \n...","[Revelation, Abu Bakr, Prophet, 2, one]",[Prophet]
7473,"(4, \n\n, After, the, prayer, the, Prophet, ’s...","[4, Prophet]",[Prophet]
7476,"(‘, Abbas, and, his, sons, Fadl, and, \n, Qith...","[Abbas, Fadl, Qitham, Usamah, Shuqran, one, Pr...","[Abbas, Fadl, Usamah, Shuqran, Prophet]"


In [45]:
# Reduce every matched entity to its first whitespace-separated token.
# .assign returns a new frame rather than writing into what may be a slice of
# sent_entity_df, which avoids pandas' SettingWithCopyWarning.
sent_entity_df_filtered = sent_entity_df_filtered.assign(
    scholars_entities=sent_entity_df_filtered['scholars_entities'].apply(
        lambda x: [item.split()[0] for item in x]
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered['scholars_entities'] = sent_entity_df_filtered['scholars_entities'].apply(lambda x: [item.split()[0] for item in x])


In [46]:
# Display the filtered sentences with their scholar entities.
sent_entity_df_filtered

Unnamed: 0,Sentence,entities,scholars_entities
0,"(MUHAMMAD, \n\n, his, life, based, on, the, ea...","[II, 4, Quraysh, Hollow, 6, The Vow to Sacrifi...",[Quraysh]
5,"(60, \n\n, XXII, \n\n, Leaders, of, Quraysh, \...","[60, Quraysh, 64, XXIII \n\nWonderment, 67, 70...",[Quraysh]
16,"(The, Succession, and, the, Burial, \n\n, 342,...","[342, Steven W. Johnson, 346, Quraysh, Hollow,...",[Quraysh]
36,"(2, Muhammad, \n\n\n, which, must, not, flow, ...","[2, Muhammad, Hagar, His Angels]",[Muhammad]
64,"(“, It, descended, from, Paradise, whiter, \n,...","[Paradise, Adam]",[Adam]
...,...,...,...
7464,"(344, Muhammad, \n\n, bidding, them, pledge, t...","[344, Muhammad, Abu Bakr, second, two, 1]",[Muhammad]
7465,"(A, recent, Revelation, had, recalled, the, \n...","[Revelation, Abu Bakr, Prophet, 2, one]",[Prophet]
7473,"(4, \n\n, After, the, prayer, the, Prophet, ’s...","[4, Prophet]",[Prophet]
7476,"(‘, Abbas, and, his, sons, Fadl, and, \n, Qith...","[Abbas, Fadl, Qitham, Usamah, Shuqran, one, Pr...","[Abbas, Fadl, Usamah, Shuqran, Prophet]"


### Create Relationships

In [48]:
# Number of index labels a window extends past its starting label.
windows_size = 5
relationships = []

# The last index label is constant for the whole loop, so look it up once.
last_index = sent_entity_df_filtered.index[-1]

for i in range(last_index):
    # Clamp the window end when fewer than windows_size labels remain.
    end_i = min(i + windows_size, last_index)

    # .loc slices by label and includes both endpoints, so the window covers
    # every remaining sentence whose label lies in [i, end_i].
    window_entities = sent_entity_df_filtered.loc[i:end_i].scholars_entities

    # Flatten the per-sentence lists into a single list, in sentence order.
    char_list = [char for entities in window_entities for char in entities]

    # remove duplicated characters that are next to each other
    char_unique = [char for pos, char in enumerate(char_list)
                   if pos == 0 or char != char_list[pos - 1]]

    # Record an edge between each pair of neighbouring characters; zip yields
    # nothing when the window holds fewer than two characters.
    for a, b in zip(char_unique, char_unique[1:]):
        relationships.append({"source": a, "target": b})

In [49]:
# Turn the collected edge records into a two-column (source, target) frame.
relationship_df = pd.DataFrame(data=relationships)

In [50]:
# Display the raw (source, target) pairs collected from the sentence windows.
relationship_df

Unnamed: 0,source,target
0,Quraysh,Zuhrah
1,Quraysh,Zuhrah
2,Zuhrah,Quraysh
3,Quraysh,Zuhrah
4,Zuhrah,Quraysh
...,...,...
9893,Abbas,Fadl
9894,Fadl,Usamah
9895,Usamah,Shuqran
9896,Shuqran,Prophet


In [51]:
# Put the two names of every pair in sorted order within their row, so that
# a->b and b->a end up as the same (source, target) row.
sorted_pairs = np.sort(relationship_df.values, axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)
relationship_df

Unnamed: 0,source,target
0,Quraysh,Zuhrah
1,Quraysh,Zuhrah
2,Quraysh,Zuhrah
3,Quraysh,Zuhrah
4,Quraysh,Zuhrah
...,...,...
9893,Abbas,Fadl
9894,Fadl,Usamah
9895,Shuqran,Usamah
9896,Prophet,Shuqran


In [52]:
# Give every edge an initial weight of 1. The guard keeps this cell safe to
# re-run: once the frame holds aggregated weights, resetting them to 1 would
# silently discard the counts.
if 'values' not in relationship_df.columns:
    relationship_df['values'] = 1

# Sum the weights of identical (source, target) pairs; sort=False keeps the
# pairs in order of first appearance.
relationship_df = relationship_df.groupby(['source', 'target'], sort=False, as_index=False).sum()
relationship_df

Unnamed: 0,source,target,values
0,Quraysh,Zuhrah,28
1,Hashim,Quraysh,25
2,Hashim,Zuhrah,9
3,Aws,Salma,5
4,Amr,Salma,6
...,...,...,...
496,Fadl,Thawban,12
497,Anas,Thawban,5
498,Abbas,Thawban,5
499,Shuqran,Usamah,6
