In [2]:
import pandas as pd

In [3]:
path = '../data/raw/'
final_path = '../data/final/'

## Bible

In [4]:
bible = pd.read_csv(path + "bible_kjv.csv")

# Print the names of the columns.
print("Columns: {}".format(bible.columns))
print("Shape: {}".format(bible.shape))
print(bible.head(5))

Columns: Index(['id', 'b', 'c', 'v', 't'], dtype='object')
Shape: (31103, 5)
        id  b  c  v                                                  t
0  1001001  1  1  1  In the beginning God created the heaven and th...
1  1001002  1  1  2  And the earth was without form, and void; and ...
2  1001003  1  1  3  And God said, Let there be light: and there wa...
3  1001004  1  1  4  And God saw the light, that it was good: and G...
4  1001005  1  1  5  And God called the light Day, and the darkness...


In [5]:
print("Look at the first row")
print(bible.iloc[0])

print("\nLook at a the half way row")
print(bible.iloc[int(len(bible) /2)]) 

print("\nLook at the last row")
print(bible.iloc[len(bible) -1 ]) 

Look at the first row
id                                              1001001
b                                                     1
c                                                     1
v                                                     1
t     In the beginning God created the heaven and th...
Name: 0, dtype: object

Look at a the half way row
id                                             19103002
b                                                    19
c                                                   103
v                                                     2
t     Bless the LORD, O my soul, and forget not all ...
Name: 15551, dtype: object

Look at the last row
id                                             66022021
b                                                    66
c                                                    22
v                                                    21
t     The grace of our Lord Jesus Christ be with you...
Name: 31102, dtype: object


In [6]:
bible.drop(['id', 'b', 'c', 'v'], axis=1, inplace=True)

In [7]:
bible.columns = ['text']

In [8]:
print(bible.head(1))
bible = bible.sample(frac=1).reset_index(drop=True)
print(bible.head(1))

                                                text
0  In the beginning God created the heaven and th...
                                                text
0  The noise of a multitude in the mountains, lik...


## Quran

In [9]:
quran = pd.read_json(path+"quran.json")

# Print the names of the columns.
print("Columns: {}".format(quran.columns))
print("Shape: {}".format(quran.shape))
print(quran.head(2))

Columns: Index(['surah_number', 'verse_number', 'text', 'translation'], dtype='object')
Shape: (6236, 4)
   surah_number  verse_number                                    text  \
0             1             1  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ   
1             1             2   ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ   

                                         translation  
0  In the name of Allah, the Entirely Merciful, t...  
1  [All] praise is [due] to Allah, Lord of the wo...  


In [10]:
print("Look at the first row")
print(quran.iloc[0])

print("\nLook at a the half way row")
print(quran.iloc[int(len(quran) /2)]) 

print("\nLook at the last row")
print(quran.iloc[len(quran) -1 ])

Look at the first row
surah_number                                                    1
verse_number                                                    1
text                       بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
translation     In the name of Allah, the Entirely Merciful, t...
Name: 0, dtype: object

Look at a the half way row
surah_number                                                   26
verse_number                                                  187
text            فَأَسْقِطْ عَلَيْنَا كِسَفًا مِّنَ ٱلسَّمَآءِ...
translation     So cause to fall upon us fragments of the sky,...
Name: 3118, dtype: object

Look at the last row
surah_number                                  114
verse_number                                    6
text                   مِنَ ٱلْجِنَّةِ وَٱلنَّاسِ
translation     From among the jinn and mankind."
Name: 6235, dtype: object


In [11]:
quran.drop(['surah_number', 'verse_number', 'text'], axis=1, inplace=True)

In [12]:
quran.iloc[len(quran) -1]

translation    From among the jinn and mankind."
Name: 6235, dtype: object

In [13]:
# Shuffle dataset
print(quran.head(1))
quran = quran.sample(frac=1).reset_index(drop=True)
print(quran.head(1))

                                         translation
0  In the name of Allah, the Entirely Merciful, t...
                                         translation
0  That we might follow the magicians if they are...


In [14]:
#quran.head(1)
bible.max(axis=1)

0        The noise of a multitude in the mountains, lik...
1        For they could not keep it at that time, becau...
2        And Joshua spake unto the house of Joseph, eve...
3        Gilead is mine; Manasseh is mine; Ephraim also...
4        And said unto me, Thou art my servant, O Israe...
                               ...                        
31098    My hands also will I lift up unto thy commandm...
31099    And they lodged round about the house of God, ...
31100    And Jacob said unto Joseph, God Almighty appea...
31101    And when thou art come in, thou shalt shut the...
31102    For it is a shame even to speak of those thing...
Length: 31103, dtype: object

### Union the dataframes together

In [15]:
rel_texts = pd.concat([bible, quran], sort=False)

In [16]:
print(rel_texts.shape)

(37339, 2)


In [17]:
print(rel_texts.head(1))
rel_texts = rel_texts.sample(frac=1).reset_index(drop=True)
print(rel_texts.head(1))

                                                text translation
0  The noise of a multitude in the mountains, lik...         NaN
                                                text translation
0  Beside the chief of Solomon's officers which w...         NaN


## Save the new dataframe

In [18]:
file_name = final_path + 'religiousTexts.csv'

In [20]:
import os
import csv
if not os.path.isdir(final_path):
   os.makedirs(final_path)
rel_texts.to_csv(file_name, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)