In [1]:
import pandas as pd
import pickle

In [2]:
# loading the not-yet-installed mlna package in the notebook:
import sys
import os

# Resolve the parent of the current working directory to an absolute path;
# this is the "root" directory the local package is imported from.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
root_dir = os.path.abspath(parent_of_cwd)

# Put the root directory at the front of the module search path, but only
# once, so that re-running this cell does not add duplicate entries.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

# Now you can import the modules from the "mlna" package:
from mlna import network, preproc, user_input 

**Beginning of preparing material for the tests:**

In [3]:
# Directory holding the app data, and the pickled test DataFrame stored in it.
data_path = "./iran_telegraph_data/app_data"
test_df_file = f'{data_path}/test_df.pickle'
text_df = pd.read_pickle(test_df_file)
text_df

Unnamed: 0,text_id,title,full_text
0,a_1,ناصرالدین شاه و سفر فرنگ,ناصر الدین شاه، پادشاه ایران، به همراه امین ال...
1,b_2,Qajar king and modernity,Naser-al-Din Shah was the king of Persia. He t...
2,c_3,Persien und neue Medien,Telegrafie wurde in the Qajar-Zeit im Iran vor...


In [4]:
# Entity tag categories to include, plus extra entities supplied by hand.
entity_tags = ['PERSON', 'GPE']
user_ents = ['telegraph']

In [26]:
# If the user wants to create a dictionary from scratch (the call prompts
# for input interactively).
# NOTE(review): a dict_path is passed here, whereas the "from scratch" example
# further down this notebook passes dict_path=None - confirm which is intended.
user_dict = user_input.make_user_dict(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    dict_path='test_dict.pickle',
    threshold=80,
)
user_dict

Do you wish to see all similar entities and define a constant spelling for them? Enter 'y' for YES and 'n' for NO:  n





Enter the standard spelling of an entity:  done



Enter all vatiations of 'done' that exist among the entities. Enter 'done' to exit. Enter 'next' to set another standard spelling for another entity.



Enter a varying spelling of 'done':  done


{}

In [5]:
# Load the previously saved test dictionary from the working directory.
test_dict_file = 'test_dict.pickle'
user_dict = pd.read_pickle(test_dict_file)
user_dict

{'Nasser al -Din Shah': 'Nasser-al-Din Shah',
 'Naser-al-Din Shah': 'Nasser-al-Din Shah',
 'Nasser Al Din': 'Nasser-al-Din Shah',
 'Persia': 'Iran'}

In [6]:
# Run entity extraction on every text and print one result per document.
# The two columns are iterated directly instead of re-indexing with
# text_df.loc[i, ...] inside iterrows(): a label lookup returns a Series
# (not a single value) when index labels repeat, and the `row` yielded by
# iterrows() was never used.
for text, text_id in zip(text_df["full_text"], text_df["text_id"]):
    entities = preproc.extract_entities(
        text,
        text_id,
        entity_tags=entity_tags,
        user_ents=user_ents,
        user_dict=user_dict,
    )
    print(entities)
    print()  # blank line between documents

{'text_id': 'a_1', 'sentences': ['Nasser-al-Din Shah, the king of Iran, traveled with Amin al -Sultan.', 'He first traveled to Moscow and then to Paris.', 'He was introduced to the telegraph during the Paris trip.'], 'entities': [['Nasser-al-Din Shah', 'Iran', 'Amin al', '-Sultan'], ['Moscow', 'Paris'], ['Paris', 'telegraph']]}

{'text_id': 'b_2', 'sentences': ['Nasser-al-Din Shah was the king of Iran.', 'He travelled to Paris from Tehran.', 'He visited Adolphe Thiers on his trip.'], 'entities': [['Nasser-al-Din Shah', 'Iran'], ['Paris', 'Tehran'], ['Adolphe Thiers']]}

{'text_id': 'c_3', 'sentences': ['Telegraphy was presented in the Qajar period in Iran.', "The Irann king in Tehran, Nasser-al-Din Shah, learned with the modern media during his trip to Paris, in Amin-Al-Saltana's accompaniment."], 'entities': [['Qajar', 'Iran'], ['Tehran', 'Nasser-al-Din Shah', 'Paris', 'Amin-Al-Saltana']]}



In [10]:
# Build the network table for all texts and display it.
network_df = network.get_network_data(
    text_df,
    entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
)
network_df

Unnamed: 0,text_id,source,target,weight
0,a_1,Nasser-al-Din Shah,Iran,2
1,a_1,Nasser-al-Din Shah,Amin al,1
2,a_1,Nasser-al-Din Shah,-Sultan,1
3,a_1,Moscow,Paris,1
4,a_1,Paris,telegraph,1
5,b_2,Nasser-al-Din Shah,Iran,2
6,b_2,Paris,Tehran,1
7,c_3,Qajar,Iran,1
8,c_3,Tehran,Nasser-al-Din Shah,1
9,c_3,Tehran,Paris,1


In [11]:
# Lower-cased names of all source nodes, in row order.
[node.lower() for node in network_df['source']]

['nasser-al-din shah',
 'nasser-al-din shah',
 'nasser-al-din shah',
 'moscow',
 'paris',
 'nasser-al-din shah',
 'paris',
 'qajar',
 'tehran',
 'tehran',
 'tehran']

In [12]:
# Lower-cased names of all target nodes, in row order.
[node.lower() for node in network_df['target']]

['iran',
 'amin al',
 '-sultan',
 'paris',
 'telegraph',
 'iran',
 'tehran',
 'iran',
 'nasser-al-din shah',
 'paris',
 'amin-al-saltana']

In [6]:
# Ask the user for the node names to select (the call prompts for input).
select_nodes = user_input.select_nodes(
    text_df,
    entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
)
select_nodes

Enter the names of as many nodes as you wish. Press 'ENTER' to type the next name. Enter 'done' to exit.



Enter the name of a node:  qajar
Enter the name of a node:  nasser-al-din shah
Enter the name of a node:  done


['qajar', 'nasser-al-din shah']

**End of preparing material for the tests. Delete test_df.pickle from here once the tests are finished; it should exist only in the tests directory.**

In [None]:
# Load the texts and their metadata from the Excel workbook in the app data
# directory into a DataFrame, then display it.
data_path = "./iran_telegraph_data/app_data"
telegraph_file = f'{data_path}/telegraph_data.xlsx'
text_df = pd.read_excel(telegraph_file)

text_df

In [None]:
# Choose the entity categories to include in the network graph and in the
# filtered texts.
entity_tags = user_input.get_entities()
entity_tags

In [None]:
# Entities added by hand because the selected entity tag categories do not
# cover them.
user_ents = ['telegraph', 'Julfa']

In [None]:
# Option 1 - the user already has a dictionary saved locally: load it.
# NOTE: unpickling can execute arbitrary code; only load pickle files that you
# created yourself or otherwise trust.
dict_path = f"{data_path}/telegraph_user_dict.pickle"
with open(dict_path, 'rb') as dict_file:
    user_dict = pickle.load(dict_file)

# # Option 2 - the user already has a dictionary saved locally and wants to expand it:
# dict_path = f"{data_path}/telegraph_user_dict.pickle"
# user_dict = user_input.make_user_dict(text_df, entity_tags=entity_tags, user_ents=user_ents, dict_path=dict_path, threshold=70)

# # Option 3 - the user wants to create a dictionary from scratch:
# user_dict = user_input.make_user_dict(text_df, entity_tags=entity_tags, user_ents=user_ents, dict_path=None, threshold=70)

user_dict

In [None]:
# NOTE: the triple-quoted string below is a plain expression, not a comment.
# As the only statement in this cell it is echoed as the cell's output.
"""
Manipulating the user_dict:
"""
# Optional manual correction, kept commented out: set one spelling mapping in
# user_dict, then write the updated dictionary back to its pickle file.
# user_dict['Etemad al -Saltanah']= 'Etemad-al-Saltana'
# pd.to_pickle(user_dict, f"{data_path}/telegraph_user_dict.pickle")

# user_dict

In [None]:
# NOTE: the triple-quoted string below is a plain expression, not a comment.
# As the only statement in this cell it is echoed as the cell's output.
"""
Checking if Julfa is in the entities:
"""
# Optional spot check, kept commented out: extract the entities of a single
# text and show them as a DataFrame.
# NOTE(review): assumes row 11 of text_df is the text with id '403_E_4_1' -
# TODO confirm, or read the id from the same row instead of hard-coding it.
# text= text_df.loc[11, 'full_text']
# text_id='403_E_4_1'
# ent_dict=preproc.extract_entities (text, text_id, entity_tags=entity_tags, user_ents=user_ents, user_dict=user_dict)
# dict_df= pd.DataFrame(ent_dict)
# dict_df

In [None]:
# # if the user only wishes to see network relations among entities from certain texts: 
# sources= ['410_E_4_2', '410_E_4_3', '452_E_7', '480_E_9_1', '403_E_4_1']

In [None]:
# Visualize the network graph for all texts (no node or source filtering).
network.visualize_network(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
    core=False,
    select_nodes=None,
    sources=None,
    title='network_visualization_2',
    figsize=(1000, 700),
    bgcolor='black',
    font_color='white',
)

In [None]:
# Visualize the community graph.
network.detect_community(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
    title='community_detection',
    figsize=(1000, 700),
    bgcolor='black',
    font_color='white',
)

In [None]:
# If the user only wants to visualize the relations between certain nodes, or
# to filter the texts that include certain nodes: collect those node names.
select_nodes = user_input.select_nodes(
    text_df,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
)
select_nodes

In [None]:
# Select the texts that contain certain nodes.
# NOTE(review): the node list is hard-coded here instead of using the
# `select_nodes` variable - confirm this is intended, and that the spelling
# matches a node name as it appears in the network data.
nodes_to_match = ['Naser-al-din shah']
filtered_texts = network.filter_network_data(
    text_df,
    select_nodes=nodes_to_match,
    entity_tags=entity_tags,
    user_ents=user_ents,
    user_dict=user_dict,
    operator='OR',
)
filtered_texts

In [None]:
# Save the filtered texts to an Excel file (without the index column) so they
# can be shared with other people.
filtered_texts_file = f"{data_path}/filtered_texts.xlsx"
filtered_texts.to_excel(filtered_texts_file, index=False)