In [12]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import ipywidgets as widgets
from IPython.display import display
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio

# Load data from reference file
def load_reference(file_path):
    """Read a text file into a one-column DataFrame of references.

    Args:
        file_path: Path of the text file to read.

    Returns:
        DataFrame with a single 'Reference' column holding each line of the
        file, in order, with leading/trailing whitespace removed. Blank lines
        are kept as empty strings.
    """
    with open(file_path, 'r') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return pd.DataFrame(stripped_lines, columns=['Reference'])

# Load data from two text files
def load_data(file_path):
    """Load one similarity score per line from a text file.

    Args:
        file_path: Path of a text file in which every line is a single float.

    Returns:
        DataFrame with a single 'Score' column, one row per line, in file order.

    Raises:
        ValueError: If a line cannot be parsed as a float. The message names
            the file and the 1-based line number of the offending line.
    """
    scores = []
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            text = line.strip()
            try:
                scores.append(float(text))
            except ValueError as exc:
                # Fail loudly rather than skip: this notebook pairs scores with
                # references by row position, so dropping a line would shift
                # every later score onto the wrong reference.
                raise ValueError(
                    f"{file_path}, line {line_number}: cannot parse score {text!r}"
                ) from exc
    return pd.DataFrame(scores, columns=['Score'])

# Load verse references
data_ref = load_reference('references/vref.txt')

# Load data from two files
data1 = load_data('sim/wol-eng-labse-sim_scores.txt')
data2 = load_data('sim/wol-eng-laser-sim-scores.txt')

# Attach a reference to every score row. The assignment aligns on the default
# integer index, so score row i is paired with reference row i.
data1['Reference'] = data_ref['Reference']
data2['Reference'] = data_ref['Reference']

# Merge data on Reference
merged_data = pd.merge(data1, data2, on='Reference', suffixes=('_file1', '_file2'))

# Extract Book, Chapter and Verse information.
# The book group accepts digits as well as capitals: str.extract keeps the
# first regex *search* hit, so a capitals-only group would capture just the
# "SA" of a digit-prefixed code such as "1SA" and file that verse under a
# book named "SA". Every reference the capitals-only pattern matched is still
# matched by this one.
merged_data[['Book', 'Chapter', 'Verse']] = merged_data['Reference'].str.extract(
    r'([A-Z0-9]+) (\d+):(\d+)$'
)

# Drop rows whose reference did not parse, then convert Chapter and Verse to
# integers. Chained (no inplace=True) so the step yields a fresh frame.
merged_data = (
    merged_data
    .dropna(subset=['Book', 'Chapter', 'Verse'])
    .astype({'Chapter': int, 'Verse': int})
)

# Check if there are enough data points to calculate correlations
if len(merged_data) < 2:
    print("Not enough data points to calculate correlations.")
else:
    # Calculate correlations (the p-values are not used)
    spearman_corr, _ = stats.spearmanr(merged_data['Score_file1'], merged_data['Score_file2'])
    pearson_corr, _ = stats.pearsonr(merged_data['Score_file1'], merged_data['Score_file2'])

    print(f"Spearman Correlation: {spearman_corr}")
    print(f"Pearson Correlation: {pearson_corr}")

    # Plotting Scatter Plot for Correlation using Plotly
    scatter_plot = px.scatter(
        merged_data, x='Score_file1', y='Score_file2',
        title='Scatter Plot of Scores from File 1 vs File 2',
        labels={'Score_file1': 'Score from File 1', 'Score_file2': 'Score from File 2'},
    )
    scatter_plot.update_layout(
        xaxis_title='Score from File 1',
        yaxis_title='Score from File 2',
        template='plotly_white'
    )
    scatter_plot.show()

# Detect outliers using IQR
def detect_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 0.25 and 0.75
    quantiles of the column.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column to test.

    Returns:
        The subset of ``df`` (all columns, original index) flagged as outliers.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    outside_fences = values.lt(low_fence) | values.gt(high_fence)
    return df[outside_fences]

# Collect the IQR outliers of each score column
outliers_file1 = detect_outliers_iqr(merged_data, 'Score_file1')
outliers_file2 = detect_outliers_iqr(merged_data, 'Score_file2')

# Report file 1 outliers, highest score first
if outliers_file1.empty:
    print("No outliers detected in file 1.")
else:
    outliers_file1 = outliers_file1.sort_values(by='Score_file1', ascending=False)
    print("Outliers in file 1 (using IQR method):")
    print(outliers_file1)

# Report file 2 outliers, highest score first
if outliers_file2.empty:
    print("No outliers detected in file 2.")
else:
    outliers_file2 = outliers_file2.sort_values(by='Score_file2', ascending=False)
    print("Outliers in file 2 (using IQR method):")
    print(outliers_file2)

# Build the book selector from the distinct books, in order of first appearance
books = merged_data['Book'].unique()
dropdown = widgets.Dropdown(
    description='Select Book:',
    options=list(books),
    disabled=False,
)

# Function to display data for selected book
def display_data(book):
    """Show every row of ``merged_data`` whose 'Book' equals ``book``.

    Prints a short notice instead when no row matches.

    Args:
        book: Book code to filter on.
    """
    rows_for_book = merged_data.loc[merged_data['Book'] == book]
    if rows_for_book.empty:
        print(f"No data available for book: {book}")
        return
    display(rows_for_book)

# Link dropdown with the display function.
# The trailing semicolon stops the notebook from echoing the value returned by
# interact() (the "<function __main__.display_data(book)>" line in the output).
widgets.interact(display_data, book=dropdown);

Spearman Correlation: 0.976343261754285
Pearson Correlation: 0.9254314409442465


Outliers in file 1 (using IQR method):
       Score_file1  Reference  Score_file2 Book  Chapter  Verse
23040     1.000000   ZEC 7:10     1.000000  ZEC        7     10
23160     1.000000    MAL 1:3     1.000000  MAL        1      3
23192     1.000000    MAL 3:4     1.000000  MAL        3      4
23193     1.000000    MAL 3:5     1.000000  MAL        3      5
23194     1.000000    MAL 3:6     1.000000  MAL        3      6
...            ...        ...          ...  ...      ...    ...
29418    -0.123385   EPH 6:15     0.189353  EPH        6     15
28073    -0.125368   ROM 3:15     0.361031  ROM        3     15
27061    -0.125988   ACT 2:44     0.096052  ACT        2     44
24655    -0.135831   MRK 9:49     0.461968  MRK        9     49
30212    -0.139460  HEB 10:13     0.404209  HEB       10     13

[8192 rows x 6 columns]
Outliers in file 2 (using IQR method):
       Score_file1  Reference  Score_file2 Book  Chapter  Verse
26890     0.583480  JHN 18:37     0.870793  JHN       18     37
2

interactive(children=(Dropdown(description='Select Book:', options=('GEN', 'EXO', 'LEV', 'NUM', 'DEU', 'JOS', …

<function __main__.display_data(book)>

In [18]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import ipywidgets as widgets
from IPython.display import display
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio

# Load data from reference file
def load_reference(file_path):
    """Load a text file as a DataFrame with one 'Reference' row per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        DataFrame whose only column, 'Reference', contains every line of the
        file in order, stripped of surrounding whitespace (blank lines become
        empty strings).
    """
    with open(file_path, 'r') as source:
        references = list(map(str.strip, source))
    return pd.DataFrame(references, columns=['Reference'])

# Load data from two text files
def load_data(file_path):
    """Load similarity scores from a whitespace-delimited text file.

    The score is read from the last whitespace-separated token of each line,
    so a line holding only a score is accepted as well as a line with leading
    fields before the score.

    Args:
        file_path: Path of the text file to read.

    Returns:
        DataFrame with a single 'Score' column holding the parsed floats in
        file order.

    Note:
        Blank lines are skipped silently; lines whose last token is not a
        float are reported on stdout and skipped. A skipped line produces no
        row, so every later score moves up one position. Callers that pair
        rows with another file by position should make sure nothing was
        skipped.
    """
    scores = []
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            tokens = line.split()
            # One token is enough to read tokens[-1]; only truly empty lines
            # have nothing to parse.
            if not tokens:
                continue
            try:
                scores.append(float(tokens[-1]))
            except ValueError:
                print(f"Skipping line {line_number} due to invalid format: {line.strip()}")
    return pd.DataFrame(scores, columns=['Score'])

# Load verse references
data_ref = load_reference('references/vref.txt')

# Load data from two files
# Previously compared pair:
# data1 = load_data('sim/wol-eng-labse-sim_scores.txt')
# data2 = load_data('sim/wol-eng-laser-sim-scores.txt')
data1 = load_data('sim/tam-eng-laser-sim-scores.txt')
data2 = load_data('sim/tur-eng-laser-sim-scores.txt')

# Axis labels for the two score files. Keep these in step with the paths
# above: both files currently hold LASER scores, so labelling the axes
# "Labse" and "LASER" would misdescribe the plot.
score1_label = 'tam-eng LASER'
score2_label = 'tur-eng LASER'

# Attach a reference to every score row. The assignment aligns on the default
# integer index, so score row i is paired with reference row i.
data1['Reference'] = data_ref['Reference']
data2['Reference'] = data_ref['Reference']

# Merge data on Reference
merged_data = pd.merge(data1, data2, on='Reference', suffixes=('_file1', '_file2'))

# Extract Book, Chapter and Verse information.
# The book group accepts digits as well as capitals: str.extract keeps the
# first regex *search* hit, so a capitals-only group would capture just the
# "SA" of a digit-prefixed code such as "1SA", which would also corrupt the
# Verse_Name rebuilt below. Every reference the capitals-only pattern matched
# is still matched by this one.
merged_data[['Book', 'Chapter', 'Verse']] = merged_data['Reference'].str.extract(
    r'([A-Z0-9]+) (\d+):(\d+)$'
)

# Drop rows whose reference did not parse, convert Chapter and Verse to
# integers, and add the verse name column. Chained (no inplace=True) so the
# step yields a fresh frame.
merged_data = (
    merged_data
    .dropna(subset=['Book', 'Chapter', 'Verse'])
    .astype({'Chapter': int, 'Verse': int})
    .assign(
        Verse_Name=lambda frame: (
            frame['Book'] + ' ' + frame['Chapter'].astype(str) + ':' + frame['Verse'].astype(str)
        )
    )
)

# Save merged data to CSV
merged_data.to_csv('references/merged_data.csv', index=False)

# Check if there are enough data points to calculate correlations
if len(merged_data) < 2:
    print("Not enough data points to calculate correlations.")
else:
    # Calculate correlations (the p-values are not used)
    spearman_corr, _ = stats.spearmanr(merged_data['Score_file1'], merged_data['Score_file2'])
    pearson_corr, _ = stats.pearsonr(merged_data['Score_file1'], merged_data['Score_file2'])

    print(f"Spearman Correlation: {spearman_corr}")
    print(f"Pearson Correlation: {pearson_corr}")

    # Plotting Scatter Plot for Correlation using Plotly
    scatter_plot = px.scatter(
        merged_data, x='Score_file1', y='Score_file2',
        title=f'Scatter Plot of Scores: {score1_label} vs {score2_label}',
        labels={'Score_file1': score1_label, 'Score_file2': score2_label},
        hover_data=['Verse_Name']
    )
    scatter_plot.update_layout(
        xaxis_title=score1_label,
        yaxis_title=score2_label,
        template='plotly_white'
    )
    scatter_plot.show()

# Detect outliers using IQR
def detect_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 0.25 and 0.75 quantiles of the column. Values
    exactly on a fence are not outliers.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column to test.
        multiplier: Width of the fences in units of IQR. Defaults to 1.5, the
            value this function previously hard-coded.

    Returns:
        The subset of ``df`` (all columns, original index) flagged as outliers.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    return df[(values < lower_bound) | (values > upper_bound)]

# Collect the IQR outliers of each score column
outliers_file1 = detect_outliers_iqr(merged_data, 'Score_file1')
outliers_file2 = detect_outliers_iqr(merged_data, 'Score_file2')

# Report file 1 outliers (verse name and score only), highest score first
if outliers_file1.empty:
    print("No outliers detected in file 1.")
else:
    outliers_file1 = outliers_file1.sort_values(by='Score_file1', ascending=False)
    print("Outliers in file 1 (using IQR method):")
    print(outliers_file1[['Verse_Name', 'Score_file1']])

# Report file 2 outliers (verse name and score only), highest score first
if outliers_file2.empty:
    print("No outliers detected in file 2.")
else:
    outliers_file2 = outliers_file2.sort_values(by='Score_file2', ascending=False)
    print("Outliers in file 2 (using IQR method):")
    print(outliers_file2[['Verse_Name', 'Score_file2']])

# Build the book selector from the distinct books, in order of first appearance
books = merged_data['Book'].unique()
dropdown = widgets.Dropdown(
    description='Select Book:',
    options=list(books),
    disabled=False,
)

# Function to display data for selected book
def display_data(book):
    """Show verse name and both scores for every row of ``merged_data`` in ``book``.

    Prints a short notice instead when no row matches.

    Args:
        book: Book code to filter on.
    """
    rows_for_book = merged_data.loc[merged_data['Book'] == book]
    if rows_for_book.empty:
        print(f"No data available for book: {book}")
        return
    shown_columns = ['Verse_Name', 'Score_file1', 'Score_file2']
    display(rows_for_book[shown_columns])

# Link dropdown with the display function.
# The trailing semicolon stops the notebook from echoing the value returned by
# interact() (the "<function __main__.display_data(book)>" line in the output).
widgets.interact(display_data, book=dropdown);

Spearman Correlation: 0.5311703585192606
Pearson Correlation: 0.4167990534546573


No outliers detected in file 1.
No outliers detected in file 2.


interactive(children=(Dropdown(description='Select Book:', options=('GEN', 'EXO', 'LEV', 'NUM', 'DEU', 'JOS', …

<function __main__.display_data(book)>