In [1]:
import os

import pandas as pd

In [2]:
# Directory holding the prepared CSV files. Set the TOPICS_DATA_DIR
# environment variable to run the notebook on another machine; without it
# the original local directory is used.
path = os.environ.get('TOPICS_DATA_DIR', '/home/anna/Documents/Diploma/code/new_cleaned')

In [3]:
# Per-topic statistics together with the readability scores.
topics_csv = f'{path}/topics_all_statistics_and_scores.csv'
topic_df_statistics = pd.read_csv(topics_csv)

In [4]:
# Discard steps whose last_3_month_completed_step_users_count is below 20.
below_minimum = topic_df_statistics['last_3_month_completed_step_users_count'] < 20
topic_df_statistics = topic_df_statistics.loc[~below_minimum]

In [5]:
# Preview the first rows of the frame after the minimum-completions filter.
topic_df_statistics.head()

Unnamed: 0,step_id,is_theory,text,seconds_to_complete,last_3_month_completion_rate,last_3_month_completed_step_users_count,last_3_month_avg_like,last_3_month_likes_count,last_3_month_topic_completion_rate,last_3_month_completed_topic_users_count,...,cleaned_texts,num_headings,symbols_in_snippets,num_words,num_sentences,num_syllables,norm_seconds,ASL,flesch_score,dale_chall_score
0,12357,1,"""<h5 id=\""introduction\""> Introduction</h5>\n\...",603.45,0.56,40.0,2.0,10.0,0.9,35.0,...,""" *heading* So far you have learned quite a lo...",6,790,618,24,995,9.764563,25.75,44.490012,9.180577
1,12691,1,"""<p>JavaScript was originally developed as a l...",253.21,0.87,884.0,1.72,79.0,0.96,844.0,...,"""JavaScript was originally developed as a lang...",5,35,605,35,962,4.185289,17.285714,54.769008,9.8964
2,8112,1,"""<p>We've already learned what annotations are...",486.94,0.45,50.0,1.57,7.0,0.89,49.0,...,"""We've already learned what annotations are an...",3,1506,610,30,990,7.982623,20.333333,48.895027,9.097296
3,7892,1,"""<h5>Introduction</h5>\n\n<p>You often hear pe...",542.73,0.72,60.0,2.0,5.0,0.91,58.0,...,""" *heading* You often hear people speak about ...",5,0,1079,51,1652,5.029935,21.156863,55.834186,9.207762
4,15809,1,"""<h5>Introduction</h5>\n\n<p>When you have alr...",556.76,0.43,43.0,1.4,10.0,0.91,53.0,...,""" *heading* When you have already learned the ...",5,409,633,21,995,8.795577,30.142857,43.258957,9.721412


In [6]:
# (rows, columns) before missing values are dropped.
topic_df_statistics.shape

(1064, 22)

In [7]:
# Remove every row that has a missing value in any column.
topic_df_statistics = topic_df_statistics.dropna(axis=0, how='any')

In [8]:
topic_df_statistics.shape  # same shape as before dropna(), so there were no NaN values

(1064, 22)

In [16]:
# Average number of sentences and words per topic.
only_two = topic_df_statistics[['num_sentences', 'num_words']]

overall = len(only_two)  # number of topics

# Column-wise sums; these replace a row-by-row iterrows() accumulation
# and produce the same totals.
sents = only_two['num_sentences'].sum()
words = only_two['num_words'].sum()

# Each line shows the total followed by the per-topic mean (2 decimals).
print('sents', sents, round(sents/overall, 2))
print('words', words, round(words/overall, 2))

sents 34773 32.68
words 790823 743.25


In [13]:
# Show num_sentences for the first row only (prints nothing if the frame is empty).
first_item = next(only_two.iterrows(), None)
if first_item is not None:
    i, row = first_item
    print(row['num_sentences'])

24


In [11]:
# (rows, columns) of the two-column sentence/word frame.
only_two.shape

(1064, 2)

## Compute correlations

Spearman rank correlations between the Flesch score / Dale-Chall score and:

* normalized seconds_to_complete
* last_3_month_completion_rate
* last_3_month_avg_like (but only for topics where last_3_month_likes_count is at least 20)
* last_3_month_topic_completion_rate
* back_to_theory_times_per_user_session_avg_last_3_month
* back_to_theory_users_%_last_3_month

In [7]:
from scipy.stats import spearmanr

In [14]:
def correlate_two(df, name_row1, name_row2):
    """Spearman rank correlation between two columns of ``df``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    name_row1, name_row2 : names of the two columns to correlate.

    Returns
    -------
    (score, pvalue) : the correlation coefficient and its p-value, as
        returned by ``scipy.stats.spearmanr``.

    Raises
    ------
    ValueError
        If either column contains NaN (``nan_policy='raise'``).
    """
    first = df[name_row1].values
    second = df[name_row2].values
    rho, p_value = spearmanr(first, second, nan_policy='raise')
    return rho, p_value

In [16]:
print('FLESCH')
# Engagement metrics that the Flesch score is correlated against.
flesch_targets = (
    'norm_seconds',
    'last_3_month_completion_rate',
    'last_3_month_topic_completion_rate',
    'back_to_theory_times_per_user_session_avg_last_3_month',
    'back_to_theory_users_%_last_3_month',
)
for column in flesch_targets:
    print(column)

    # Coefficient rounded to 3 decimals, p-value to 5, then a blank line.
    score, p = correlate_two(topic_df_statistics, 'flesch_score', column)
    print(round(score, 3), round(p, 5), end='\n\n')

FLESCH
norm_seconds
-0.186 0.0

last_3_month_completion_rate
0.099 0.00119

last_3_month_topic_completion_rate
0.025 0.4138

back_to_theory_times_per_user_session_avg_last_3_month
-0.096 0.00176

back_to_theory_users_%_last_3_month
-0.116 0.00014



In [17]:
# For last_3_month_avg_like, keep only the rows where
# last_3_month_likes_count is at least 20 (rows below 20 are excluded).
# topic_df_statistics has already been through dropna(), so the mask below
# selects the same rows as dropping the "< 20" rows by index.
enough_likes = topic_df_statistics['last_3_month_likes_count'] >= 20
df_with_likes = topic_df_statistics[enough_likes]

# Flesch score vs. average like on that subset: coefficient, then p-value.
score, p = correlate_two(df_with_likes, 'flesch_score', 'last_3_month_avg_like')
print(round(score, 3), round(p, 5))

0.009 0.82518


In [18]:
print('DALE-CHALL')
# Engagement metrics that the Dale-Chall score is correlated against.
dale_chall_targets = (
    'norm_seconds',
    'last_3_month_completion_rate',
    'last_3_month_topic_completion_rate',
    'back_to_theory_times_per_user_session_avg_last_3_month',
    'back_to_theory_users_%_last_3_month',
)
for column in dale_chall_targets:
    print(column)

    # Coefficient rounded to 3 decimals, p-value to 5, then a blank line.
    score, p = correlate_two(topic_df_statistics, 'dale_chall_score', column)
    print(round(score, 3), round(p, 5), end='\n\n')

DALE-CHALL
norm_seconds
0.177 0.0

last_3_month_completion_rate
-0.074 0.01601

last_3_month_topic_completion_rate
-0.009 0.77102

back_to_theory_times_per_user_session_avg_last_3_month
0.087 0.00438

back_to_theory_users_%_last_3_month
0.088 0.00419



In [19]:
# Dale-Chall score vs. average like, on the like-filtered subset.
likes_correlation = correlate_two(df_with_likes, 'dale_chall_score', 'last_3_month_avg_like')
score, p = likes_correlation
print(round(score, 3), round(p, 5))

-0.104 0.01005


In [22]:
# How the two readability scores correlate with each other.
between_scores = correlate_two(topic_df_statistics, 'dale_chall_score', 'flesch_score')
score, p = between_scores
print(round(score, 3), round(p, 5))

-0.666 0.0


In [23]:
# Pairwise correlations among the engagement metrics themselves, computed
# on df_with_likes. Every pair is reported in both orders.
metric_columns = [
    'norm_seconds',
    'last_3_month_completion_rate',
    'last_3_month_topic_completion_rate',
    'last_3_month_avg_like',
    'back_to_theory_times_per_user_session_avg_last_3_month',
    'back_to_theory_users_%_last_3_month',
]
for column1 in metric_columns:
    for column2 in metric_columns:
        if column1 != column2:
            score, p = correlate_two(df_with_likes, column1, column2)
            print(column1, column2)
            print(round(score, 3), round(p, 5))

norm_seconds last_3_month_completion_rate
-0.494 0.0
norm_seconds last_3_month_topic_completion_rate
0.012 0.77614
norm_seconds last_3_month_avg_like
-0.042 0.29742
norm_seconds back_to_theory_times_per_user_session_avg_last_3_month
0.687 0.0
norm_seconds back_to_theory_users_%_last_3_month
0.669 0.0
last_3_month_completion_rate norm_seconds
-0.494 0.0
last_3_month_completion_rate last_3_month_topic_completion_rate
-0.101 0.01251
last_3_month_completion_rate last_3_month_avg_like
-0.023 0.57693
last_3_month_completion_rate back_to_theory_times_per_user_session_avg_last_3_month
-0.463 0.0
last_3_month_completion_rate back_to_theory_users_%_last_3_month
-0.51 0.0
last_3_month_topic_completion_rate norm_seconds
0.012 0.77614
last_3_month_topic_completion_rate last_3_month_completion_rate
-0.101 0.01251
last_3_month_topic_completion_rate last_3_month_avg_like
-0.002 0.95814
last_3_month_topic_completion_rate back_to_theory_times_per_user_session_avg_last_3_month
0.005 0.90796
last_3_month_

## Correlations with other readability scores

In [30]:
# Directory holding the prepared CSV files. Set the TOPICS_DATA_DIR
# environment variable to run the notebook on another machine; without it
# the original local directory is used.
path = os.environ.get('TOPICS_DATA_DIR', '/home/anna/Documents/Diploma/code/new_cleaned')

In [31]:
# Topic statistics together with the additional readability scores.
all_scores_csv = f'{path}/topics_20_compl_statistics_and_MORE_scores.csv'
df_with_all_scores = pd.read_csv(all_scores_csv)

In [33]:
# Readability scores to correlate; each has a '<name>_score' column.
extra_scores = ['flesch_kincaid', 'gunning_fog', 'coleman_liau', 'ari', 'linsear_write', 'spache']
# Engagement metrics each score is correlated against.
engagement_columns = [
    'norm_seconds',
    'last_3_month_completion_rate',
    'last_3_month_topic_completion_rate',
    'last_3_month_avg_like',
    'back_to_theory_times_per_user_session_avg_last_3_month',
    'back_to_theory_users_%_last_3_month',
]
for score in extra_scores:
    print('###', score, '###')
    score_column = f'{score}_score'
    for column in engagement_columns:
        print(column)

        # Coefficient rounded to 3 decimals, p-value to 5, then a blank line.
        index, p = correlate_two(df_with_all_scores, score_column, column)
        print(round(index, 3), round(p, 5), end='\n\n')
    print('##########')

### flesch_kincaid ###
norm_seconds
0.167 0.0

last_3_month_completion_rate
-0.061 0.04502

last_3_month_topic_completion_rate
-0.017 0.5832

last_3_month_avg_like
0.021 0.48733

back_to_theory_times_per_user_session_avg_last_3_month
0.128 3e-05

back_to_theory_users_%_last_3_month
0.135 1e-05

##########
### gunning_fog ###
norm_seconds
0.123 6e-05

last_3_month_completion_rate
-0.024 0.44312

last_3_month_topic_completion_rate
-0.04 0.19355

last_3_month_avg_like
0.028 0.36409

back_to_theory_times_per_user_session_avg_last_3_month
0.08 0.0088

back_to_theory_users_%_last_3_month
0.077 0.01183

##########
### coleman_liau ###
norm_seconds
0.198 0.0

last_3_month_completion_rate
-0.158 0.0

last_3_month_topic_completion_rate
-0.011 0.71649

last_3_month_avg_like
0.007 0.81527

back_to_theory_times_per_user_session_avg_last_3_month
0.046 0.13514

back_to_theory_users_%_last_3_month
0.083 0.00655

##########
### ari ###
norm_seconds
0.178 0.0

last_3_month_completion_rate
-0.075 0.01466

In [35]:
# Rows whose smog_score is exactly 0 are left out of the SMOG correlations.
nonzero_smog = df_with_all_scores['smog_score'] != 0
df_topics_for_smog = df_with_all_scores[nonzero_smog]

In [36]:
# SMOG correlations, computed on the frame without zero smog_score rows.
score = 'smog'
smog_targets = (
    'norm_seconds',
    'last_3_month_completion_rate',
    'last_3_month_topic_completion_rate',
    'last_3_month_avg_like',
    'back_to_theory_times_per_user_session_avg_last_3_month',
    'back_to_theory_users_%_last_3_month',
)
print(score)
for column in smog_targets:
    print(column)

    # Coefficient rounded to 3 decimals, p-value to 5, then a blank line.
    index, p = correlate_two(df_topics_for_smog, f'{score}_score', column)
    print(round(index, 3), round(p, 5), end='\n\n')
print('##########')

smog
norm_seconds
0.141 0.00066

last_3_month_completion_rate
-0.121 0.00362

last_3_month_topic_completion_rate
-0.118 0.00436

last_3_month_avg_like
0.028 0.49847

back_to_theory_times_per_user_session_avg_last_3_month
0.101 0.01521

back_to_theory_users_%_last_3_month
0.103 0.0134

##########
