<div>
<h1><center> <font color='black'> Network Science: </font></center></h1>
<h2><center> <font color='black'> Project: Network Analyses of Estonian Runosongs </font></center></h2>
</div>

In [23]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sb
from scipy import stats


In [424]:
# Load the runosong records from runo2.csv (a file created from the SQL view `runo2`).
# The file is comma-separated, which is already the read_csv default.
df = pd.read_csv("runo2.csv")


In [426]:
# Data exploration: print one value per output line, in this order:
#   1. (number of rows, number of columns)
#   2. number of unique song types (`name`)
#   3. number of parishes (`Kihelkond`)
print(df.shape, df['name'].nunique(), df['Kihelkond'].nunique(), sep="\n")
# Show the first rows as the cell's rich output.
df.head()

(74286, 6)
6321
119


Unnamed: 0,poem_id,display_name,kihelkonna_id,Kihelkond,name,tyyp_id
0,150103,"H, Ostrov 93 (29)",858,Põltsamaa,?Halb rehepeksja,20473
1,100414,"ERA II 26, 285/7 (11)",883,Rõuge,"""""Netsütämine"""",,\n24360,erab_orig14030,""""""Kiv...",24359
2,193843,"H IV 1, 318/9 (5)",795,Koeru,(?) Kuri naine,22783
3,194175,"H IV 2, 92/3 (20 b)",815,Karuse,(?) Külanaine,22850
4,193873,"H IV 1, 369 (3)",798,Türi,(?) Leigeri mäng,22791


In [428]:
# Drop song types with fewer than 2 occurrences (they have no chance of forming edges).
# Step 1: annotate every row with the number of rows that share its song type (`name`).
df['Count_of_songs_in_group'] = df.groupby('name')['name'].transform('count')
# Step 2: keep the song types seen at least twice, ordered by group size and then by name.
# NOTE(review): the count is per row, so a song type recorded twice in the same parish
# also passes this filter - confirm that this is intended.
df = (
    df.loc[df['Count_of_songs_in_group'].ge(2)]
      .sort_values(by=['Count_of_songs_in_group', 'name'])
)
df

Unnamed: 0,poem_id,display_name,kihelkonna_id,Kihelkond,name,tyyp_id,Count_of_songs_in_group
36,150165,"H, Ostrov 143/4 (22)",869,Palamuse,4 neida,20506,2
37,150165,"H, Ostrov 143/4 (22)",876,Torma,4 neida,20506,2
38,120944,E 1057 (4),785,Rakvere,Aadamal oli seitse poega,21262,2
39,120944,E 1057 (4),786,Simuna,Aadamal oli seitse poega,21262,2
44,157251,"H II 1, 179 (254)",782,Jõhvi,Aeg koju minna,10190,2
...,...,...,...,...,...,...,...
4101,193367,"H III 30, 873/6",895,Võru l.,Ema haual,8159,1181
4102,149820,"H, Mapp 700 (8)",897,Valga,Ema haual,8159,1181
4103,149829,"H, Mapp 713/4",897,Valga,Ema haual,8159,1181
4104,182256,"H II 56, 1051/2 (6)",897,Valga,Ema haual,8159,1181


In [430]:
# Check how many rows and song types were lost. One value per output line:
# (rows, columns), number of unique song types, number of parishes.
print(df.shape, df['name'].nunique(), df['Kihelkond'].nunique(), sep="\n")

(70669, 7)
2704
119


In [185]:
#Some parishes contain several 'copies' of the same song type (i.e. parish AND song type are identical in 2 or more rows).
#How often a song type occurs within a single parish (or how many specific song variations represent a song type
# in a parish) is not very useful information and it hinders later analyses, so these rows should either be dropped or dealt with later.

In [508]:
# Build the list of song types for each parish; the node and edge lists with weights
# will be created from this dataframe.
# `unique()` keeps every song type (`name`) once per parish (`Kihelkond`), and
# `reset_index()` turns the parish group key back into an ordinary column. Chaining it
# yields a new frame directly, with no DataFrame wrapper and no in-place mutation.
kihelkond_laulutyybid = df.groupby('Kihelkond')['name'].unique().reset_index()
kihelkond_laulutyybid

Unnamed: 0,Kihelkond,name
0,Ambla,"[Armsale lähen saiatagi, Ei ole lauljaid hulga..."
1,Anna,"[Kus sa käisid, sokukene?, Küla laitus, Kas sa..."
2,Anseküla,"[Panen hõlbule tööle., Veski tegemine, Pääslas..."
3,Audru,"[Härra rehkendab teopäevi, Ilmatark, Kaval teo..."
4,Emmaste,"[Sõnad koera hammustamise vastu, Mardi palve, ..."
...,...,...
114,Väike-Maarja,"[Hoiatus uhka naise eest, Lindude sasitud vili..."
115,Vändra,"[Ise tantsin - taha vaatan, Karja-Tõnu, Kena n..."
116,Võnnu,"[Hobuse suits, Ketran siidi, Leelo taevast too..."
117,Võru l.,"[Poiss tuleb neiude tulele, Kõik puud pole ühe..."
