In [1]:
import pandas as pd
import pickle

In [2]:
# Make the local, not-yet-installed "mlna" package importable in this notebook.
import os
import sys

# Absolute path of the "root" directory: the parent of the current working directory.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Register the "root" directory at the front of the module search path, only once.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

# Users who have installed the package on their computer do not need this
# sys.path-based loading method.
from mlna import network, preproc, user_input

**Beginning of preparing material for the tests:**

In [3]:
# Load the sample texts used to prepare the test material.
data_path = "./iran_telegraph_data/app_data"
text_df = pd.read_pickle(f'{data_path}/test_df.pickle')
text_df

Unnamed: 0,text_id,title,full_text
0,a_1,ناصرالدین شاه و سفر فرنگ,ناصر الدین شاه، پادشاه ایران، به همراه امین ال...
1,b_2,Qajar king and modernity,Naser-al-Din Shah was the king of Persia. He t...
2,c_3,Persien und neue Medien,Telegrafie wurde in the Qajar-Zeit im Iran vor...


In [4]:
# Entity tag categories to look for in the texts.
entity_tags = ['PERSON', 'GPE']
# Extra entities added by hand, in addition to the tag categories.
user_ents = ['telegraph']

In [5]:
# If the user wants to create a dictionary (the call asks its questions interactively).
# NOTE(review): `dict_path` is set here, while the "from scratch" variant further
# down in this notebook passes dict_path=None — confirm which usage is intended.
user_dict = user_input.make_user_dict(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    dict_path='test_dict.pickle',
    threshold=80,
)
user_dict

Do you wish to see all similar entities and define a constant spelling for them? Enter 'y' for YES and 'n' for NO:  n





Enter the standard spelling of an entity:  done



Enter all vatiations of 'done' that exist among the entities. Enter 'done' to exit. Enter 'next' to set another standard spelling for another entity.



Enter a varying spelling of 'done':  done


{'Nasser al -Din Shah': 'Nasser-al-Din Shah',
 'Naser-al-Din Shah': 'Nasser-al-Din Shah',
 'Nasser Al Din': 'Nasser-al-Din Shah',
 'Persia': 'Iran'}

In [6]:
# Load the saved telegraph dictionary; this rebinds `user_dict`, replacing the one built above.
user_dict = pd.read_pickle(f'{data_path}/telegraph_user_dict.pickle')
user_dict

{'Mirza Ghafar Khan': 'Mirza Jafar Khan',
 'Mirzā Jaʿfar Khan': 'Mirza Jafar Khan',
 'Peterburg': 'St Petersburg',
 'Telegraph': 'telegraph',
 'TelegraphNo': 'telegraph',
 'Telegraphs': 'telegraph',
 'ʿAliqoli Khan': 'Ali Gholi Khan',
 'Teheran': 'Tehran',
 'Mushir al -Dawlah': 'Moshir-al-Dawlah',
 'Moshir al -Dawlah': 'Moshir-al-Dawlah',
 'networkThe Indo-European Telegraph': 'IETD',
 'The Indo-European Telegraph Co.': 'IETC',
 'the Indo-European Telegraph Company': 'IETC',
 'The Indo-European Telegraph Department': 'IETD',
 'Indo European Telegraph Department': 'IETD',
 'Indo-European Telegraph Company': 'IETC',
 'The Indo-European Telegraph Company': 'IETC',
 'the Indo-European Telegraph Department': 'IETD',
 'the Indo-European Government Telegraph Department': 'IETD',
 'india': 'India',
 'the Government of India': 'British Government',
 'the British Government of India': 'British Government',
 'the British Government': 'British Government',
 'Bushire': 'Bushehr',
 'Boushir': 'Bushe

In [7]:
# Extract the entities of every text in `text_df` and print each result,
# followed by an empty line.
for _, row in text_df.iterrows():
    # Read the fields from the row that iterrows() already yields instead of
    # looking them up again with text_df.loc[i, ...]; the label-based lookup
    # returns a Series rather than a single value when index labels repeat.
    text = row["full_text"]
    text_id = row["text_id"]
    entities = preproc.extract_entities(
        text,
        text_id,
        entity_tags=entity_tags,
        user_ents=user_ents,
        user_dict=user_dict,
    )
    print(entities)
    print()

{'text_id': 'a_1', 'sentences': ['Naser-al-Din Shah, the king of Iran, traveled with Amin al -Sultan.', 'He first traveled to Moscow and then to Paris.', 'He was introduced to the telegraph during the Paris trip.'], 'entities': [['Naser-al-Din Shah', 'Iran', 'Amin al', '-Sultan'], ['Moscow', 'Paris'], ['Paris', 'telegraph']]}

{'text_id': 'b_2', 'sentences': ['Naser-al-Din Shah was the king of Iran.', 'He travelled to Paris from Tehran.', 'He visited Adolphe Thiers on his trip.'], 'entities': [['Naser-al-Din Shah', 'Iran'], ['Paris', 'Tehran'], ['Adolphe Thiers']]}

{'text_id': 'c_3', 'sentences': ['telegraphy was presented in the Qajar period in Iran.', "The Iran king in Tehran, Nasser Al Din, learned with the modern media during his trip to Paris, in Amin-Al-Saltana's accompaniment."], 'entities': [['Qajar', 'Iran'], ['Iran', 'Tehran', 'Nasser Al Din', 'Paris', 'Amin-Al-Saltana']]}



In [8]:
# Build the network table (text_id, source, target, weight) for the loaded texts.
network_df = network.get_network_data(
    text_df,
    entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
)
network_df

Unnamed: 0,text_id,source,target,weight
0,a_1,Naser-al-Din Shah,Iran,2
1,a_1,Naser-al-Din Shah,Amin al,1
2,a_1,Naser-al-Din Shah,-Sultan,1
3,a_1,Moscow,Paris,1
4,a_1,Paris,telegraph,1
5,b_2,Naser-al-Din Shah,Iran,2
6,b_2,Paris,Tehran,1
7,c_3,Qajar,Iran,1
8,c_3,Iran,Tehran,1
9,c_3,Iran,Nasser Al Din,1


In [9]:
# Lower-cased names of all source nodes.
[source_name.lower() for source_name in network_df['source']]

['naser-al-din shah',
 'naser-al-din shah',
 'naser-al-din shah',
 'moscow',
 'paris',
 'naser-al-din shah',
 'paris',
 'qajar',
 'iran',
 'iran',
 'iran',
 'iran']

In [10]:
# Lower-cased names of all target nodes.
[target_name.lower() for target_name in network_df['target']]

['iran',
 'amin al',
 '-sultan',
 'paris',
 'telegraph',
 'iran',
 'tehran',
 'iran',
 'tehran',
 'nasser al din',
 'paris',
 'amin-al-saltana']

In [11]:
# Ask interactively for node names (the prompt repeats until 'done' is entered).
select_nodes = user_input.select_nodes(
    text_df,
    entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
)
select_nodes

Enter the names of as many nodes as you wish. Press 'ENTER' to type the next name. Enter 'done' to exit.



Enter the name of a node:  done


[]

**End of preparing material for the tests. Delete test_df.pickle from here once the tests are finished; it should only exist in the tests directory.**

In [12]:
# Loading the texts and their metadata into the code from an Excel file
# (the data is read with pd.read_excel, not from a pickled dataframe).
# NOTE: this rebinds `text_df`, replacing the test frame loaded earlier in the notebook.
data_path = "./iran_telegraph_data/app_data"
text_df = pd.read_excel(f'{data_path}/telegraph_data.xlsx')

text_df

Unnamed: 0,text_id,file_name,title,source,year,full_text
0,t_m_1277,1277_تاریخ منتظم ناصری.txt,اعتماداالسلطنه,اعتماداالسلطنه. تاریخ منتظم ناصری. جلد ۳. ص. 1833,1277,محمد ابراهیم خان سرتیپ اول افواج خمسه به لقب م...
1,g_q_1278,1278_گزارش وزیر امور خارجه درباره قرارداد تلگر...,گزارش وزیر امور خارجه درباره قرارداد تلگراف,یکصد سند تاریخی دوران قاجاریه. ابراهیم صفایی. ...,1278,قربان خاک پای جواهر آسای اقدس همایونت شوم. دست...
2,g_g_1283,1283_گزارش وزیر امور خارجه درباره گسترش تلگراف...,گزارش وزیر امور خارجه درباره گسترش تلگراف,یکصد سند تاریخی دوران قاجاریه. ابراهیم صفایی. ...,1283,قربان خاک پای جواهرآسای اقدس همایونت شوم. جواب...
3,n_s_1283,1283_نامه ساعد الملک درباره خطوط تلگرافی ایران...,نامه ساعد الملک درباره خطوط تلگرافی ایران,یکصد سند تاریخی دوران قاجاریه. ابراهیم صفایی. ...,1283,خدایگانا دربال گراندوک قسطنطنین به جناب مسیو ت...
4,p_m_1303,1303_پروگرام مشی کابینه آقای سردار سپه.txt,پروگرام مشی کابینه آقای سردار سپه,https://fa.wikisource.org/wiki/%D9%BE%D8%B1%D9...,1303,پروگرام مشی کابینهٔ آقای سردار سپهمصوب ۲۶ حمل ...
5,s_z_1314,1314_ساعت ظهر در ولایات.txt,ساعت ظهر در ولایات,https://fa.wikisource.org/wiki/%D8%A7%D8%B7%D9...,1314,ساعت ظهر در ولایاتطبق تصویب هیئت محترم وزراء ع...
6,h_a_2010,history_of_the_atlantic_cable.txt,History of the Atlantic Cable & Undersea Commu...,https://atlantic-cable.com/CableCos/Indo-Eur/i...,2010,The Indo-European Telegraph Company – was foun...
7,410_E_4_2,IOR_L_PWD_7_410_E_4_2.txt,concession of May 24,Archive of the British Library,1868,"Cher Monsieur Champagne,J'ai le plaisir de vou..."
8,410_E_4_3,IOR_L_PWD_7_410_E_4_3.txt,letters from Bateman and Andrews,Archive of the British Library,1876,The Indo-European Telegraph Co.(LIMITED.)Teleg...
9,452_E_7,IOR_L_PWD_7_452_E_7_16.txt,letter of O. St. John Lieut. R.E. to Colonel G...,Archive of the British Library,1867,Tel. Department No. 12 of 1867To:Lieut. Colone...


In [13]:
# Interactively choose the entity categories that should be included in the
# network graph and the filtered texts.
entity_tags = user_input.get_entities()
entity_tags

Enter the entities you are looking for in the texts. Press 'ENTER' to type the next entity. Enter 'done' when finished.

PERSON : People, including fictional.
NORP : Nationalities or religious or political groups.
FAC : Buildings, airports, highways, bridges, etc.
ORG : Companies, agencies, institutions, etc.
GPE : Countries, cities, states.
LOC : Non-GPE locations, mountain ranges, bodies of water.
PRODUCT : Objects, vehicles, foods, etc. (Not services.)
EVENT : Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART : Titles of books, songs, etc.
LAW : Named documents made into laws.
LANGUAGE : Any named language.
DATE : Absolute or relative dates or periods.
MONEY : Monetary values, including unit.



Enter your choice:  PERSON
Enter your choice:  done


['PERSON']

In [14]:
# Optional: entities the user adds by hand because the entity tag categories do not include them.
user_ents = ['telegraph', 'Julfa']

In [15]:
# if the user already has a dictionary saved locally:
# NOTE: unpickling can execute arbitrary code; only load dictionary files you trust.
dict_path = f"{data_path}/telegraph_user_dict.pickle"
with open(dict_path, 'rb') as f:
    user_dict = pickle.load(f)

# # if the user already has a dictionary saved locally and wants to expand it
# # (the path needs the "/" separator between the directory and the file name):
# dict_path = f"{data_path}/telegraph_user_dict.pickle"
# user_dict = user_input.make_user_dict(text_df, entity_tags=entity_tags, user_ents=user_ents, dict_path=dict_path, threshold=70)

# # if the user wants to create a dictionary from scratch:
# user_dict = user_input.make_user_dict(text_df, entity_tags=entity_tags, user_ents=user_ents, dict_path=None, threshold=70)

user_dict

{'Mirza Ghafar Khan': 'Mirza Jafar Khan',
 'Mirzā Jaʿfar Khan': 'Mirza Jafar Khan',
 'Peterburg': 'St Petersburg',
 'Telegraph': 'telegraph',
 'TelegraphNo': 'telegraph',
 'Telegraphs': 'telegraph',
 'ʿAliqoli Khan': 'Ali Gholi Khan',
 'Teheran': 'Tehran',
 'Mushir al -Dawlah': 'Moshir-al-Dawlah',
 'Moshir al -Dawlah': 'Moshir-al-Dawlah',
 'networkThe Indo-European Telegraph': 'IETD',
 'The Indo-European Telegraph Co.': 'IETC',
 'the Indo-European Telegraph Company': 'IETC',
 'The Indo-European Telegraph Department': 'IETD',
 'Indo European Telegraph Department': 'IETD',
 'Indo-European Telegraph Company': 'IETC',
 'The Indo-European Telegraph Company': 'IETC',
 'the Indo-European Telegraph Department': 'IETD',
 'the Indo-European Government Telegraph Department': 'IETD',
 'india': 'India',
 'the Government of India': 'British Government',
 'the British Government of India': 'British Government',
 'the British Government': 'British Government',
 'Bushire': 'Bushehr',
 'Boushir': 'Bushe

In [None]:
# Manipulating the user_dict:
# (written as a comment rather than a bare string, so that running the cell
# does not echo the text as the cell's output)
# user_dict['Etemad al -Saltanah']= 'Etemad-al-Saltana'
# pd.to_pickle(user_dict, f"{data_path}/telegraph_user_dict.pickle")

# user_dict

In [None]:
# Checking if Julfa is in the entities:
# (written as a comment rather than a bare string, so that running the cell
# does not echo the text as the cell's output)
# text= text_df.loc[11, 'full_text']
# text_id='403_E_4_1'
# ent_dict=preproc.extract_entities (text, text_id, entity_tags=entity_tags, user_ents=user_ents, user_dict=user_dict)
# dict_df= pd.DataFrame(ent_dict)
# dict_df

In [None]:
# # if the user only wishes to see network relations among entities from certain texts:
# # NOTE(review): presumably meant to be passed as the `sources=` argument of
# # network.visualize_network — confirm against the package.
# sources= ['410_E_4_2', '410_E_4_3', '452_E_7', '480_E_9_1', '403_E_4_1']

In [None]:
# Visualize the network graph (no node or source selection is passed:
# select_nodes=None, sources=None).
network.visualize_network(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
    core=False,
    select_nodes=None,
    sources=None,
    title='network_visualization_2',
    figsize=(1000, 700),
    bgcolor='black',
    font_color='white',
)

In [None]:
# Visualize the community graph for the same texts, entity tags and dictionary.
network.detect_community(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
    title='community_detection',
    figsize=(1000, 700),
    bgcolor='black',
    font_color='white',
)

In [None]:
# Optional: choose specific nodes, either to visualize only the relations
# between them or to filter the texts that include them.
select_nodes = user_input.select_nodes(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
)
select_nodes

In [None]:
# Selecting texts that contain certain nodes.
# NOTE(review): the node list is hard-coded here instead of using the
# `select_nodes` chosen in the previous cell — confirm this is intended.
filtered_texts = network.filter_network_data(
    text_df,
    select_nodes=['Naser-al-din shah'],
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
    operator='OR',
)
filtered_texts

In [None]:
# Export the filtered texts to an Excel file (without the index column) so that
# they can be shared with other people.
filtered_texts.to_excel(f"{data_path}/filtered_texts.xlsx", index=False)