In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from google.colab import files
import re

## Data Analysis
---

In [2]:
# load processed csv
uploaded = files.upload()

# Read file
filtered_df = pd.read_csv('subjective_process.csv', sep=",")
filtered_df

Saving subjective_process.csv to subjective_process.csv


Unnamed: 0,age,english_L1,education,field,understanding,Lecture,readability,accuracy,useful,preference
0,25-34,Yes,PHD,Materials Science,Understood everything,Humanities,LLM,Human,Human,Human
1,35-44,Yes,"Bachelors, or similar",Mechanical Engineering,Understood everything,Humanities,LLM,LLM,LLM,LLM
2,25-34,Yes,Masters or similar,Visual Communications,Understood the general idea,Humanities,LLM,Human,LLM,LLM
3,25-34,Yes,PHD,Materials Science,Understood everything,Mathematics,Human,LLM,Human,Human
4,35-44,Yes,"Bachelors, or similar",Mechanical Engineering,Understood most points,Mathematics,Human,LLM,Human,Human
5,25-34,Yes,Masters or similar,Visual Communications,Understood most points,Mathematics,LLM,Human,LLM,LLM
6,25-34,Yes,"Bachelors, or similar",Business management,Understood most points,Humanities,LLM,LLM,LLM,LLM
7,18-24,Yes,"Bachelors, or similar",Commerce,Understood most points,Humanities,LLM,LLM,LLM,LLM
8,18-24,No,"Bachelors, or similar",biomedical science,Understood most points,Humanities,LLM,LLM,LLM,LLM
9,25-34,Yes,"Bachelors, or similar",Physics,Understood everything,Humanities,LLM,Human,Human,LLM


In [3]:
# Stats computation
###################

# Count variables
var_counts = ['age', 'english_L1', 'education', 'field', 'readability','accuracy','useful','preference']

for var_c in var_counts:
  counts = filtered_df[var_c].value_counts()
  total = counts.sum()
  percentages = (counts / total) * 100
  print(counts)


age
25-34    22
55-64    10
65+       8
18-24     6
35-44     2
45-54     2
Name: count, dtype: int64
english_L1
Yes    44
No      6
Name: count, dtype: int64
education
Bachelors, or similar    28
Masters or similar       12
Apprenticeship            4
Secondary school          4
PHD                       2
Name: count, dtype: int64
field
Materials Science                                                   2
Mechanical Engineering                                              2
Visual Communications                                               2
Business management                                                 2
Commerce                                                            2
biomedical science                                                  2
Physics                                                             2
master in business administration                                   2
Human Resources                                                     2
International Peace building,

In [4]:
import statsmodels.api as sm
from scipy.stats import chi2_contingency

# Generate stats table
measures = ['readability', 'accuracy', 'useful', 'preference', 'understanding']
cohorts = ['age', 'english_L1', 'education', 'Lecture', 'understanding', 'readability', 'accuracy', 'useful', 'preference']

results_cohorts = []#list()
results_all = []

for cohort in cohorts:
  for measure in measures:
    contingency_table = pd.crosstab(filtered_df[cohort], filtered_df[measure])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    results_cohorts = results_cohorts + [chi2, p, dof]
  results_all = results_all + [results_cohorts]
  results_cohorts = []


subjective_stats = pd.DataFrame(results_all)
subjective_stats

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,4.850074,0.4344502,5,3.535354,0.6180451,5,3.019781,0.696936,5,3.884079,0.5662237,5,10.616926,0.7792471,15
1,0.0,1.0,1,0.639205,0.423999,1,0.358073,0.5495784,1,0.030438,0.8614994,1,0.981858,0.8056417,3
2,8.652459,0.07039833,4,2.728175,0.6042935,4,8.664021,0.07006848,4,11.734694,0.01943715,4,22.941729,0.02822473,12
3,9.030868,0.01093886,2,4.256854,0.1190244,2,1.862374,0.3940857,2,5.067254,0.07937061,2,8.225773,0.2220249,6
4,15.949986,0.001161074,3,5.482456,0.1396917,3,3.536184,0.3160994,3,5.187448,0.1585747,3,150.0,8.81963e-28,9
5,44.342293,2.756952e-11,1,0.587607,0.443346,1,10.426582,0.001242144,1,16.983248,3.771108e-05,1,15.949986,0.001161074,3
6,0.587607,0.443346,1,45.920139,1.231732e-11,1,10.159867,0.00143531,1,9.924769,0.001630694,1,5.482456,0.1396917,3
7,10.426582,0.001242144,1,10.159867,0.00143531,1,45.753912,1.340813e-11,1,30.817522,2.834667e-08,1,3.536184,0.3160994,3
8,16.983248,3.771108e-05,1,9.924769,0.001630694,1,30.817522,2.834667e-08,1,45.162706,1.813254e-11,1,5.187448,0.1585747,3
