In [39]:
import os
import numpy as np
import pandas as pd
import altair as alt
import krippendorff
alt.data_transformers.enable('default', max_rows=100000)

DataTransformerRegistry.enable('default')

In [25]:
# Get all files in ../data/raw/sts/answers/ 
dfs = []
chart_dfs = []
STS_FILES_PATH = '../data/raw/sts/answers/ '
for eval_file in os.listdir(STS_FILES_PATH):
    # Open the tsv file and read it into a pandas dataframe
    if eval_file.endswith('.tsv'):
        df = pd.read_csv(STS_FILES_PATH + eval_file, sep='\t')
        dfs.append(df)
        # Iterate over the rows of the dataframe
        # and create a list of tuples with the following format:krippendorff.alpha(value_counts=ratings_df.to_numpy(), level_of_measurement='ordinal')
        # (number of 0 ratings, number of 1 ratings, number of 2 ratings, number of 3 ratings, number of 4 ratings, number of 5 ratings)
        all_ratings = []
        for index, row in df.iterrows():
            ratings = {
                0: 0,
                1: 0,
                2: 0,
                3: 0,
                4: 0,
                5: 0
            }
            for i in range(1, 6):
                ratings[row['eval_' + str(i)]] += 1
            all_ratings.append(ratings)
        
        ratings_list = [el for rating in all_ratings for el in list(rating.values())]
        ratings_list_df = pd.DataFrame(ratings_list)
        ratings_list_df.rename(columns={0: 'ratings'}, inplace=True)
        ratings_list_df['source'] = eval_file
        chart_dfs.append(ratings_list_df)
        
        # Create dataframe from all_ratings
        ratings_df = pd.DataFrame(all_ratings)
        print(f"Krippendorff's alpha in {eval_file} is {krippendorff.alpha(value_counts=ratings_df.to_numpy(), level_of_measurement='ordinal')}")

Krippendorff's alpha in answers-images.tsv is 0.8197942763831926
Krippendorff's alpha in answers-students.tsv is 0.7168269330801006
Krippendorff's alpha in answers-headlines.tsv is 0.7928138948003643
Krippendorff's alpha in answers-belief.tsv is 0.6330337731096631
Krippendorff's alpha in answers-forums.tsv is 0.6562963509909604


In [26]:
chart_dfs = pd.concat(chart_dfs)

In [27]:
# Remove .tsv and answers- from source column
chart_dfs['source'] = chart_dfs['source'].str.replace('.tsv', '')
chart_dfs['source'] = chart_dfs['source'].str.replace('answers-', 'sts-')

  chart_dfs['source'] = chart_dfs['source'].str.replace('.tsv', '')


In [28]:
#Now load our data into a dataframe
lyricsim_df = pd.read_csv('../data/processed/filtered_detailed_results.csv')

In [29]:
# Drop all columns except sim_rating
lyricsim_df = lyricsim_df[['sim_rating']].copy()

In [30]:
lyricsim_df.rename(columns={'sim_rating': 'ratings'}, inplace=True)
lyricsim_df['source'] = 'lyricsim'
chart_dfs = pd.concat([chart_dfs, lyricsim_df])

In [35]:
alt.Chart(chart_dfs,
    width=480,
    height=320
).transform_density(
    'ratings',
    groupby=['source'],
    as_=['ratings', 'density'],
    extent=[0,5],
    bandwidth=0.5,
).mark_line(strokeWidth=1.5).encode(
    x="ratings:Q",
    y='density:Q',
    color='source:N'
)

## Length

In [55]:
# Read all txt files in ../data/raw/sts/input/
STS_INPUT_PATH = '../data/raw/sts/input/'
length_results = []
for input_file in os.listdir(STS_INPUT_PATH):
    if input_file.endswith('txt'):
        # Get all lines in the file
        fd = open(STS_INPUT_PATH + input_file, 'r')
        lines = fd.readlines()
        fd.close()
        lengths = []
        for line in lines:
            # Split the line into the two sentences
            sentences = line.split('\t')
            # Count words on each sentence
            if len(sentences) < 2:
                print(f'Error in {input_file}: {line}')
                continue
            lengths.append(len(sentences[0].split(" "))) 
            lengths.append(len(sentences[1].split(" ")))
        length_results.append({'pairs': len(lines), 'source': input_file, 'avg_length': round(np.mean(lengths), 2), 'stdev': round(np.std(lengths), 2)})   

Error in STS.input.belief.txt: yeah, we love the troops the way michael vick loves dogs.   maher: america loves its troops 'the way michael vick loves dogs'

Error in STS.input.belief.txt: yeah, we love the troops the way michael vick loves dogs. maher: america loves its troops 'the way michael vick loves dogs' | newsbusters.org

Error in STS.input.belief.txt: wtf are you babbling about? wtf is that all about?



In [56]:
for res in length_results:
    print(res)

{'pairs': 1500, 'source': 'STS.input.headlines.txt', 'avg_length': 7.5, 'stdev': 2.24}
{'pairs': 1500, 'source': 'STS.input.answers-students.txt', 'avg_length': 10.44, 'stdev': 3.34}
{'pairs': 2000, 'source': 'STS.input.belief.txt', 'avg_length': 13.01, 'stdev': 6.83}
{'pairs': 2000, 'source': 'STS.input.answers-forums.txt', 'avg_length': 15.04, 'stdev': 3.28}
{'pairs': 1500, 'source': 'STS.input.images.txt', 'avg_length': 9.6, 'stdev': 3.04}
