We will prepare features for all topics, disregarding the 'target' features. Later from the DFs with features we'll take only those topics that build our training corpus.

## Take features from Quita Up

In [2]:
import pandas as pd

In [9]:
quita_features_file = './new_cleaned/all_quita_features_all_texts.csv'

In [10]:
quita_features = pd.read_csv(quita_features_file)
quita_features.head()

# from this we don't need 'VarEntropy_lemma', 'descriptivity' 
# 'Text' is step id

Unnamed: 0,Text,ATL_token,Entropy_lemma,VarEntropy_lemma,hpoint_lemma,mattr_lemma_100,mattr_lemma_25,mattr_lemma_50,TC_lemma,activity,descriptivity,verbdistance_lemma,zttr_FIClemma,topic_id
0,/10006,3.887262,6.815966,0.005011,12.0,0.603356,0.850804,0.729842,0.034531,0.728,0.272,6.5,-0.548987,905
1,/10014,4.051695,7.080815,0.003463,15.2,0.589944,0.851419,0.728612,0.027688,0.661972,0.338028,7.328571,-0.619795,906
2,/10022,4.363095,7.413,0.004039,13.5,0.656161,0.86622,0.762795,0.033896,0.537634,0.462366,8.878788,0.010507,909
3,/10027,4.055164,6.981621,0.003852,12.666667,0.62992,0.863382,0.754446,0.0,0.709677,0.290323,6.678899,-0.504511,910
4,/10035,3.815013,6.930712,0.003942,15.5,0.611667,0.845114,0.741551,0.043239,0.647368,0.352632,8.065574,-0.627875,911


#### See correlations between the features

In [11]:
from scipy import stats

In [12]:
# check whether the three MATTR variants (window sizes 100, 50, 25) are rank-correlated
mattr_100 = quita_features['mattr_lemma_100']
mattr_50 = quita_features['mattr_lemma_50']
mattr_25 = quita_features['mattr_lemma_25']

coef1, pval1 = stats.spearmanr(mattr_100, mattr_50)
coef2, pval2 = stats.spearmanr(mattr_100, mattr_25)
coef3, pval3 = stats.spearmanr(mattr_50, mattr_25)

# one "coefficient p-value" line per pair, in the order 100/50, 100/25, 50/25
for mattr_coef, mattr_pval in ((coef1, pval1), (coef2, pval2), (coef3, pval3)):
    print(mattr_coef, mattr_pval)

# they do, so we'll take only one of them (50)

0.9571951192133077 0.0
0.8484480688531248 0.0
0.9408634476661891 0.0


In [13]:
list_taking = ['hpoint_lemma', 'Entropy_lemma', 'verbdistance_lemma', 'activity', 'ATL_token', 'TC_lemma', 
               'mattr_lemma_50', 'zttr_FIClemma']

# every unordered pair of candidate features, each pair visited once
attribute_pairs = [(first, second)
                   for position, first in enumerate(list_taking)
                   for second in list_taking[position + 1:]]

for attribute1, attribute2 in attribute_pairs:
    coef, pval = stats.spearmanr(quita_features[attribute1], quita_features[attribute2])

    # report only pairs whose correlation is both strong and statistically significant
    is_strong = abs(coef) > 0.5
    is_significant = pval < 0.05
    if is_strong and is_significant:
        print(attribute1, attribute2)
        print(round(coef, 2), round(pval, 2))
        print()

Entropy_lemma mattr_lemma_50
0.6 0.0

Entropy_lemma zttr_FIClemma
0.78 0.0

verbdistance_lemma activity
-0.67 0.0

mattr_lemma_50 zttr_FIClemma
0.73 0.0



zttr_FIClemma correlates strongly with both Entropy_lemma (0.78) and mattr_lemma_50 (0.73), so we exclude it from the feature set.

#### Create a DF with quita features for topic IDs

In [21]:
only_needed_features = ['topic_id', 'hpoint_lemma', 'Entropy_lemma', 'verbdistance_lemma', 'activity', 
                        'ATL_token', 'TC_lemma', 'mattr_lemma_50']

In [27]:
quita_needed_df = quita_features[only_needed_features]
quita_needed_df.head()

Unnamed: 0,topic_id,hpoint_lemma,Entropy_lemma,verbdistance_lemma,activity,ATL_token,TC_lemma,mattr_lemma_50
0,905,12.0,6.815966,6.5,0.728,3.887262,0.034531,0.729842
1,906,15.2,7.080815,7.328571,0.661972,4.051695,0.027688,0.728612
2,909,13.5,7.413,8.878788,0.537634,4.363095,0.033896,0.762795
3,910,12.666667,6.981621,6.678899,0.709677,4.055164,0.0,0.754446
4,911,15.5,6.930712,8.065574,0.647368,3.815013,0.043239,0.741551


In [29]:
quita_needed_df.to_csv('./new_cleaned/train_corpus/quita_features.csv', index=False)

### Collect other linguistic features

In [21]:
# DC score
# Flesch score
# average sentence length

In [30]:
import pandas as pd

In [31]:
import pickle

In [32]:
# load the topic_id -> step_id mapping; the context manager closes the file handle after reading
with open('topic_id_step_id_mapping.pkl', 'br') as mapping_file:
    topic_id_step_id_mapping = pickle.load(mapping_file)

In [39]:
# load the step_id -> topic_id mapping; the context manager closes the file handle after reading
with open('step_id_topic_id_mapping.pkl', 'br') as mapping_file:
    step_id_topic_id_mapping = pickle.load(mapping_file)

In [33]:
topics_statistics = pd.read_csv('./new_cleaned/topics_all_statistics_and_scores.csv')

In [34]:
topics_statistics.head()

Unnamed: 0,step_id,is_theory,text,seconds_to_complete,last_3_month_completion_rate,last_3_month_completed_step_users_count,last_3_month_avg_like,last_3_month_likes_count,last_3_month_topic_completion_rate,last_3_month_completed_topic_users_count,...,cleaned_texts,num_headings,symbols_in_snippets,num_words,num_sentences,num_syllables,norm_seconds,ASL,flesch_score,dale_chall_score
0,12357,1,"""<h5 id=\""introduction\""> Introduction</h5>\n\...",603.45,0.56,40.0,2.0,10.0,0.9,35.0,...,""" *heading* So far you have learned quite a lo...",6,790,618,24,995,9.764563,25.75,44.490012,9.180577
1,12691,1,"""<p>JavaScript was originally developed as a l...",253.21,0.87,884.0,1.72,79.0,0.96,844.0,...,"""JavaScript was originally developed as a lang...",5,35,605,35,962,4.185289,17.285714,54.769008,9.8964
2,8112,1,"""<p>We've already learned what annotations are...",486.94,0.45,50.0,1.57,7.0,0.89,49.0,...,"""We've already learned what annotations are an...",3,1506,610,30,990,7.982623,20.333333,48.895027,9.097296
3,7892,1,"""<h5>Introduction</h5>\n\n<p>You often hear pe...",542.73,0.72,60.0,2.0,5.0,0.91,58.0,...,""" *heading* You often hear people speak about ...",5,0,1079,51,1652,5.029935,21.156863,55.834186,9.207762
4,15809,1,"""<h5>Introduction</h5>\n\n<p>When you have alr...",556.76,0.43,43.0,1.4,10.0,0.91,53.0,...,""" *heading* When you have already learned the ...",5,409,633,21,995,8.795577,30.142857,43.258957,9.721412


In [35]:
ling_features_df = quita_needed_df.copy()

In [40]:
# the ling features frame has one row more than topic statistics; remove that extra row
# (row counts are printed before and after the removal)
print(len(ling_features_df))

bad_step_id = 16324
bad_topic_id = step_id_topic_id_mapping[bad_step_id]

# index labels of the rows that belong to the unwanted topic
is_bad_row = (ling_features_df.topic_id == bad_topic_id).values
ling_features_df = ling_features_df.drop(index=ling_features_df.index[is_bad_row])

print(len(ling_features_df))

1319
1318


In [41]:
# Add the readability scores from topics_statistics to ling_features_df.
# The step id of every topic is resolved once (a topic id missing from the mapping still
# raises KeyError), and the statistics are indexed by step_id so each lookup is direct
# instead of scanning the whole frame for every row and every attribute.
step_ids = [topic_id_step_id_mapping[topic_id] for topic_id in ling_features_df.topic_id]

# keep the first row per step_id, which is the row the previous `.values[0]` lookup selected
stats_by_step_id = topics_statistics.drop_duplicates(subset='step_id').set_index('step_id')

missing_step_ids = [step_id for step_id in step_ids if step_id not in stats_by_step_id.index]
assert not missing_step_ids, f'In topic statistics there\'s no row with step_id {missing_step_ids[0]}'

for attr in ['dale_chall_score', 'flesch_score', 'ASL']:
    # `.values` drops the step_id index so the scores are assigned positionally, row by row
    ling_features_df[attr] = stats_by_step_id.loc[step_ids, attr].values

In [42]:
ling_features_df.head()

Unnamed: 0,topic_id,hpoint_lemma,Entropy_lemma,verbdistance_lemma,activity,ATL_token,TC_lemma,mattr_lemma_50,dale_chall_score,flesch_score,ASL
0,905,12.0,6.815966,6.5,0.728,3.887262,0.034531,0.729842,8.658398,63.733344,22.346154
1,906,15.2,7.080815,7.328571,0.661972,4.051695,0.027688,0.728612,9.531336,38.968934,29.84375
2,909,13.5,7.413,8.878788,0.537634,4.363095,0.033896,0.762795,9.591177,52.834292,16.62
3,910,12.666667,6.981621,6.678899,0.709677,4.055164,0.0,0.754446,8.616742,58.471407,23.375
4,911,15.5,6.930712,8.065574,0.647368,3.815013,0.043239,0.741551,8.801582,52.769228,31.6


In [43]:
ling_features_df.to_csv('./new_cleaned/train_corpus/all_ling_features.csv', index=False)

### Add probability of the text from GPT-2

#### Try with the first example

In [29]:
#!pip3 install lm-scorer

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import torch
from lm_scorer.models.auto import AutoLMScorer as LMScorer

In [2]:
import pandas as pd

In [3]:
list(LMScorer.supported_model_names())

['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'distilgpt2']

In [4]:
# Load model to cpu or cuda
available_gpu = torch.cuda.is_available()
print(available_gpu)
device = "cuda:0" if available_gpu else "cpu"

False


  return torch._C._cuda_getDeviceCount() > 0


In [28]:
#!pip3 install ipywidgets

Defaulting to user installation because normal site-packages is not writeable
Collecting ipywidgets
  Downloading ipywidgets-7.7.1-py2.py3-none-any.whl (123 kB)
     |████████████████████████████████| 123 kB 1.6 MB/s            
Collecting jupyterlab-widgets>=1.0.0
  Downloading jupyterlab_widgets-1.1.1-py3-none-any.whl (245 kB)
     |████████████████████████████████| 245 kB 1.8 MB/s            
[?25hCollecting widgetsnbextension~=3.6.0
  Downloading widgetsnbextension-3.6.1-py2.py3-none-any.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 1.5 MB/s            


Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-7.7.1 jupyterlab-widgets-1.1.1 widgetsnbextension-3.6.1


In [5]:
batch_size = 1
scorer = LMScorer.from_pretrained("gpt2", device=device, batch_size=batch_size)

In [6]:
# Return token probabilities
scorer.tokens_score("I like this package.")  # the higher, the more probable

([0.018320949748158455,
  0.006643158383667469,
  0.08063255995512009,
  0.0006074582342989743,
  0.27771326899528503,
  0.003638095688074827],
 [40, 588, 428, 5301, 13, 50256],
 ['I', 'Ġlike', 'Ġthis', 'Ġpackage', '.', '<|endoftext|>'])

In [7]:
scorer.tokens_score("I like this package.", log=True)  # the higher, the more probable

([-3.9997100830078125,
  -5.014167785644531,
  -2.517852783203125,
  -7.406227111816406,
  -1.2811660766601562,
  -5.616294860839844],
 [40, 588, 428, 5301, 13, 50256],
 ['I', 'Ġlike', 'Ġthis', 'Ġpackage', '.', '<|endoftext|>'])

In [None]:
# we don't want to use the product of the tokens' probabilities
# because 1) the results would be very small numbers, 2) it would be influenced by the number of tokens

In [8]:
# Compute sentence score as the geometric mean of tokens' probabilities
print(scorer.sentence_score("I like this package.", reduce="gmean"))

# Get the log of the sentence score
print(scorer.sentence_score("I like this package.", log=True))

# Score multiple sentences
print(scorer.sentence_score(["Sentence 1", "Sentence 2"]))
print(scorer.sentence_score(["What a wonderful day", "What a wonderful life"], reduce="gmean"))

0.013488700613379478
-25.835418701171875
[1.1507714052505502e-11, 5.66448667485564e-12]
[0.00517506618052721, 0.0036758694332093]


In [9]:
print(scorer.sentence_score(["Sentence 1", "Sentence 2"], reduce='gmean', log=True))

[-6.297000885009766, -6.474201202392578]


#### Get scores for sentences in each topic

In [20]:
import pickle

In [10]:
import spacy
en_sm_model = spacy.load("en_core_web_sm")

In [11]:
raw_text = 'Hello, world. Here are two sentences.'
doc = en_sm_model(raw_text)

In [12]:
sentences = [sent.text.strip() for sent in doc.sents]
sentences

['Hello, world.', 'Here are two sentences.']

In [13]:
def get_spacy_sents(text, model):
    """Split ``text`` into sentences with ``model`` and return them as stripped strings.

    ``model`` is called on ``text``; the result must expose a ``sents`` iterable whose
    items have a ``text`` attribute (as the spaCy pipeline used in this notebook does).
    Each sentence text has leading and trailing whitespace removed.
    """
    stripped_sentences = []
    for sentence_span in model(text).sents:
        stripped_sentences.append(sentence_span.text.strip())
    return stripped_sentences

In [None]:
# since we score sentences, we need to get sentences for each topic
# for now we have cleaned text and lemmatized text
# we'll need to go through cleaned texts again, split them into sentences and receive sentence scores

In [14]:
topics_statistics = pd.read_csv('./new_cleaned/topics_all_statistics_and_scores.csv')

In [15]:
topics_statistics.head()

Unnamed: 0,step_id,is_theory,text,seconds_to_complete,last_3_month_completion_rate,last_3_month_completed_step_users_count,last_3_month_avg_like,last_3_month_likes_count,last_3_month_topic_completion_rate,last_3_month_completed_topic_users_count,...,cleaned_texts,num_headings,symbols_in_snippets,num_words,num_sentences,num_syllables,norm_seconds,ASL,flesch_score,dale_chall_score
0,12357,1,"""<h5 id=\""introduction\""> Introduction</h5>\n\...",603.45,0.56,40.0,2.0,10.0,0.9,35.0,...,""" *heading* So far you have learned quite a lo...",6,790,618,24,995,9.764563,25.75,44.490012,9.180577
1,12691,1,"""<p>JavaScript was originally developed as a l...",253.21,0.87,884.0,1.72,79.0,0.96,844.0,...,"""JavaScript was originally developed as a lang...",5,35,605,35,962,4.185289,17.285714,54.769008,9.8964
2,8112,1,"""<p>We've already learned what annotations are...",486.94,0.45,50.0,1.57,7.0,0.89,49.0,...,"""We've already learned what annotations are an...",3,1506,610,30,990,7.982623,20.333333,48.895027,9.097296
3,7892,1,"""<h5>Introduction</h5>\n\n<p>You often hear pe...",542.73,0.72,60.0,2.0,5.0,0.91,58.0,...,""" *heading* You often hear people speak about ...",5,0,1079,51,1652,5.029935,21.156863,55.834186,9.207762
4,15809,1,"""<h5>Introduction</h5>\n\n<p>When you have alr...",556.76,0.43,43.0,1.4,10.0,0.91,53.0,...,""" *heading* When you have already learned the ...",5,409,633,21,995,8.795577,30.142857,43.258957,9.721412


In [16]:
topics_statistics.shape

(1318, 22)

In [17]:
def delete_special_tokens(text):
    """Return ``text`` with every placeholder token removed.

    The placeholders are ``*heading*``, ``*img*``, ``*code*``, ``*math*`` and ``*table*``.
    They are removed one after another in that order; surrounding whitespace is left as is.
    """
    placeholders = ('*heading*', '*img*', '*code*', '*math*', '*table*')
    without_tokens = text
    for placeholder in placeholders:
        # splitting on the token and re-joining the pieces removes every occurrence
        without_tokens = ''.join(without_tokens.split(placeholder))
    return without_tokens

In [18]:
# for each topic (step id) we get a list of probabilities of its sentences
step_id_sent_prob_mapping = dict()
total_topics = len(topics_statistics)

for position, (_, row) in enumerate(topics_statistics.iterrows(), start=1):
    # drop the placeholder tokens before sentence splitting
    cleaned = delete_special_tokens(row.cleaned_texts)

    # split into sentences
    sentences = get_spacy_sents(cleaned, en_sm_model)

    # compute probability of sentences (geometric mean of the token probabilities)
    sent_probs = scorer.sentence_score(sentences, reduce="gmean")

    step_id_sent_prob_mapping[row.step_id] = sent_probs

    # report progress every 100 topics (and at the end) instead of printing every row index
    if position % 100 == 0 or position == total_topics:
        print(f'{position}/{total_topics} topics scored')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [22]:
step_id_sent_prob_mapping[12691]

[0.01088960561901331,
 0.04942912235856056,
 0.013954082503914833,
 0.03850657492876053,
 0.021659985184669495,
 0.04389766976237297,
 0.010407773777842522,
 0.027493784204125404,
 0.02199213206768036,
 0.042614661157131195,
 0.038606010377407074,
 0.04772435873746872,
 0.010390409268438816,
 0.012639411725103855,
 0.013896314427256584,
 0.04091031849384308,
 0.0265134759247303,
 0.06715111434459686,
 0.035362791270017624,
 0.010581093840301037,
 0.02318025752902031,
 0.009649845771491528,
 0.017345214262604713,
 0.008305211551487446,
 0.06712324917316437,
 0.029575007036328316,
 0.033525500446558,
 0.024693557992577553,
 0.021297771483659744,
 0.01655518263578415,
 0.017705131322145462,
 0.007748128846287727,
 0.04461367800831795,
 0.043341416865587234,
 0.0458516851067543,
 0.0178876630961895,
 0.02358844503760338]

In [21]:
# write the sentence probabilities to disk; the context manager flushes and closes the file
with open('./new_cleaned/train_corpus/step_id_sent_prob_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(step_id_sent_prob_mapping, mapping_file)

### From sentence scores, get a score for each topic

In [52]:
import statistics

In [None]:
# now we prepare to write them into a DF

In [46]:
df_with_topic_id = pd.read_csv('./new_cleaned/train_corpus/all_ling_features.csv')
topic_ids = df_with_topic_id.topic_id.values
print(len(topic_ids))

1318


In [47]:
# load the topic_id -> step_id mapping; the context manager closes the file handle after reading
with open('./topic_id_step_id_mapping.pkl', 'rb') as mapping_file:
    topic_id_step_id_map = pickle.load(mapping_file)

In [50]:
# reload the per-sentence probabilities saved earlier; the context manager closes the file handle
with open('./new_cleaned/train_corpus/step_id_sent_prob_mapping.pkl', 'rb') as mapping_file:
    step_id_sent_prob_mapping = pickle.load(mapping_file)

In [56]:
def get_topic_score_from_sentence_scores(topic_id, topic_id_step_id_mapping, step_id_sent_prob_mapping):
    """Return the arithmetic mean of the sentence scores stored for a topic.

    ``topic_id`` is translated to a step id through ``topic_id_step_id_mapping``; the
    step id is then used to fetch the list of sentence scores from
    ``step_id_sent_prob_mapping``. ``statistics.mean`` raises ``StatisticsError`` when
    that list is empty.
    """
    sentence_scores = step_id_sent_prob_mapping[topic_id_step_id_mapping[topic_id]]
    return statistics.mean(sentence_scores)

In [57]:
# one score per topic id, in the order of topic_ids.
# The mapping loaded in this section is named `topic_id_step_id_map`; the previous code
# referenced `topic_id_step_id_mapping`, which this section never defines.
LM_scores = [
    get_topic_score_from_sentence_scores(topic_id, topic_id_step_id_map, step_id_sent_prob_mapping)
    for topic_id in topic_ids
]

In [58]:
lm_prob_df = pd.DataFrame({'topic_id': topic_ids, 'GPT2_prob': LM_scores})

In [59]:
lm_prob_df.head()

Unnamed: 0,topic_id,GPT2_prob
0,905,0.018858
1,906,0.020324
2,909,0.016106
3,910,0.022494
4,911,0.018551


In [61]:
lm_prob_df.to_csv('./new_cleaned/train_corpus/LM_feature.csv', index=False)

## Collect meta-features

### Preparations

In [55]:
# the number of overall prereqs - from topic_id_all_prereq_mapping.pkl
# the number of direct prereqs - from topic_id_direct_prereq_mapping.pkl
# the number of code snippets - the number of *code*

# the number of images - need to replace images to *img* in text extraction
# the number of sections - need to count headings in text extraction

# percent of code in the whole text (in symbols)
# the topic group -- from the comments' dump -- DO NOT TAKE

In [1]:
import pickle

In [2]:
import pandas as pd

In [3]:
def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only use it on files you produced yourself
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


topic_id_all_prereq_mapping = _load_pickle('topic_id_all_prereq_mapping.pkl')
topic_id_direct_prereq_mapping = _load_pickle('topic_id_direct_prereq_mapping.pkl')
step_id_topic_id_mapping = _load_pickle('step_id_topic_id_mapping.pkl')
topic_id_step_id_mapping = _load_pickle('topic_id_step_id_mapping.pkl')

topics_statistics = pd.read_csv('./new_cleaned/topics_all_statistics_and_scores.csv')

In [4]:
topics_statistics.columns

Index(['step_id', 'is_theory', 'text', 'seconds_to_complete',
       'last_3_month_completion_rate',
       'last_3_month_completed_step_users_count', 'last_3_month_avg_like',
       'last_3_month_likes_count', 'last_3_month_topic_completion_rate',
       'last_3_month_completed_topic_users_count',
       'back_to_theory_times_per_user_session_avg_last_3_month',
       'back_to_theory_users_%_last_3_month', 'cleaned_texts', 'num_headings',
       'symbols_in_snippets', 'num_words', 'num_sentences', 'num_syllables',
       'norm_seconds', 'ASL', 'flesch_score', 'dale_chall_score'],
      dtype='object')

In [5]:
def get_overall_prereqs(topic_id, topic_id_all_prereq_mapping):
    """Return the number of items stored for ``topic_id`` in ``topic_id_all_prereq_mapping``.

    A ``topic_id`` missing from the mapping propagates the mapping's lookup error
    (``KeyError`` for a dict).
    """
    all_prereqs = topic_id_all_prereq_mapping[topic_id]
    return len(all_prereqs)

In [6]:
def get_direct_prereqs(topic_id, topic_id_direct_prereq_mapping):
    """Return the number of items stored for ``topic_id`` in ``topic_id_direct_prereq_mapping``.

    A ``topic_id`` missing from the mapping propagates the mapping's lookup error
    (``KeyError`` for a dict).
    """
    direct_prereqs = topic_id_direct_prereq_mapping[topic_id]
    return len(direct_prereqs)

In [7]:
def get_num_code_snippets(topic_id, topic_statistics_df, topic_id_step_id_mapping):
    """Count the ``*code*`` placeholders in the cleaned text of a topic.

    The topic id is translated to a step id, and the first row of
    ``topic_statistics_df`` with that ``step_id`` supplies the ``cleaned_texts`` value.
    Raises ``IndexError`` when no row has that step id.
    """
    is_topic_row = topic_statistics_df.step_id == topic_id_step_id_mapping[topic_id]
    cleaned_text = topic_statistics_df.loc[is_topic_row, 'cleaned_texts'].iloc[0]
    return cleaned_text.count("*code*")

In [8]:
def get_num_images(topic_id, topic_statistics_df, topic_id_step_id_mapping):
    """Count the ``*img*`` placeholders in the cleaned text of a topic.

    The topic id is translated to a step id, and the first row of
    ``topic_statistics_df`` with that ``step_id`` supplies the ``cleaned_texts`` value.
    Raises ``IndexError`` when no row has that step id.
    """
    is_topic_row = topic_statistics_df.step_id == topic_id_step_id_mapping[topic_id]
    cleaned_text = topic_statistics_df.loc[is_topic_row, 'cleaned_texts'].iloc[0]
    return cleaned_text.count("*img*")

In [9]:
def get_num_sections(topic_id, topic_statistics_df, topic_id_step_id_mapping):
    """Return the ``num_headings`` value recorded for a topic.

    The topic id is translated to a step id, and the first row of
    ``topic_statistics_df`` with that ``step_id`` is used.
    Raises ``IndexError`` when no row has that step id.
    """
    is_topic_row = topic_statistics_df.step_id == topic_id_step_id_mapping[topic_id]
    return topic_statistics_df.loc[is_topic_row, 'num_headings'].iloc[0]

In [10]:
import re
pattern = re.compile(r'\s+')

In [11]:
def get_percent_code(topic_id, topic_statistics_df, topic_id_step_id_mapping):  
    """Return the share of code symbols in a topic, whitespace excluded.

    The result is ``symbols_in_snippets / (len(text without whitespace) + symbols_in_snippets)``,
    i.e. a fraction rather than a value multiplied by 100. Whitespace is removed with
    the module-level ``pattern`` regex. The first row of ``topic_statistics_df`` whose
    ``step_id`` matches the topic's step id supplies both values.
    """
    step_id = topic_id_step_id_mapping[topic_id]
    topic_row = topic_statistics_df[topic_statistics_df.step_id == step_id]

    # length of the cleaned text once all whitespace runs are removed
    text_symbols = len(pattern.sub('', topic_row.cleaned_texts.values[0]))
    code_symbols = topic_row.symbols_in_snippets.values[0]

    return code_symbols / (text_symbols + code_symbols)

In [12]:
def collect_all_features(topic_id, topic_statistics_df, topic_id_step_id_mapping, 
                        topic_id_all_prereq_mapping, topic_id_direct_prereq_mapping):
    """Compute the six meta features of one topic and return them as a tuple.

    The tuple order is: overall prerequisites, direct prerequisites, code snippets,
    images, sections, share of code. Each value comes from the matching ``get_*`` helper.
    """
    overall_prereqs = get_overall_prereqs(topic_id, topic_id_all_prereq_mapping)
    direct_prereqs = get_direct_prereqs(topic_id, topic_id_direct_prereq_mapping)
    num_snippets = get_num_code_snippets(topic_id, topic_statistics_df, topic_id_step_id_mapping)
    num_images = get_num_images(topic_id, topic_statistics_df, topic_id_step_id_mapping)
    num_sections = get_num_sections(topic_id, topic_statistics_df, topic_id_step_id_mapping)
    percent_code = get_percent_code(topic_id, topic_statistics_df, topic_id_step_id_mapping)

    return (overall_prereqs, direct_prereqs, num_snippets, num_images, num_sections, percent_code)

### Create a df with meta features

In [13]:
df_with_topic_id = pd.read_csv('./new_cleaned/train_corpus/all_ling_features.csv')
topic_ids = df_with_topic_id.topic_id.values
print(len(topic_ids))

1318


In [14]:
print(len(topic_id_direct_prereq_mapping), len(topic_id_all_prereq_mapping))

1316 1338


In [15]:
# find the topics that are absent from the prerequisite mappings
# topics without an entry in the direct-prerequisite mapping are collected and printed
ids_not_in_mapping = [t_id for t_id in topic_ids if t_id not in topic_id_direct_prereq_mapping]
for missing_id in ids_not_in_mapping:
    print(missing_id)

print('now all prereqs')

# topics without an entry in the all-prerequisites mapping are only printed
for missing_id in (t_id for t_id in topic_ids if t_id not in topic_id_all_prereq_mapping):
    print(missing_id)

1032
1075
1114
1174
350
now all prereqs


In [16]:
# Exclude the topic IDs that have no entry in the direct-prerequisite mapping, so that
# the feature extraction below does not fail on them.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second run
# simply finds nothing left to exclude rather than raising ValueError.
topic_ids = list(topic_ids)
print(len(topic_ids))

ids_to_exclude = set(ids_not_in_mapping)
topic_ids = [kept_id for kept_id in topic_ids if kept_id not in ids_to_exclude]

print(len(topic_ids))

1318
1313


In [17]:
# names of the meta features, in the order collect_all_features returns them
features_list = ['overall_prereqs', 'direct_prereqs', 'num_snippets', 'num_img', 'num_sections', 'percent_code']

# one (initially empty) list of values per feature
dict_features = {feature: [] for feature in features_list}

In [18]:
# compute the six meta features of every topic and append each value to its feature's list
for topic_id in topic_ids:
    six_values = collect_all_features(topic_id, topics_statistics, topic_id_step_id_mapping,
                                      topic_id_all_prereq_mapping, topic_id_direct_prereq_mapping)

    # dict_features keeps its insertion order, which pairs each list with the value at the same position
    for feature_values, value in zip(dict_features.values(), six_values):
        feature_values.append(value)

In [19]:
df_meta_features = pd.DataFrame(columns=['topic_id'] + features_list)

In [20]:
df_meta_features['topic_id'] = topic_ids
for feature in features_list:
    df_meta_features[feature] = dict_features[feature]

In [21]:
df_meta_features.head()

Unnamed: 0,topic_id,overall_prereqs,direct_prereqs,num_snippets,num_img,num_sections,percent_code
0,905,53,1,5,1,5,0.175722
1,906,12,2,15,0,4,0.401552
2,909,18,3,4,0,4,0.013238
3,910,3,1,0,3,4,0.0
4,911,24,1,15,0,5,0.27063


In [22]:
df_meta_features.to_csv('./new_cleaned/train_corpus/meta_features.csv', index=False)

## Add statistical features

In [73]:
# we'll need to disregard average like if it was used to extend the corpus

In [1]:
import pandas as pd

In [2]:
import pickle

In [3]:
df_with_topic_id = pd.read_csv('./new_cleaned/train_corpus/all_ling_features.csv')
topic_ids = df_with_topic_id.topic_id.values
print(len(topic_ids))

1318


In [4]:
topics_statistics = pd.read_csv('./new_cleaned/topics_all_statistics_and_scores.csv')

In [5]:
print(len(topic_ids), topics_statistics.shape[0])

1318 1318


In [6]:
# load the step_id -> topic_id mapping; the context manager closes the file handle after reading
with open('step_id_topic_id_mapping.pkl', 'br') as mapping_file:
    step_id_topic_id_mapping = pickle.load(mapping_file)

In [7]:
topics_statistics.columns

Index(['step_id', 'is_theory', 'text', 'seconds_to_complete',
       'last_3_month_completion_rate',
       'last_3_month_completed_step_users_count', 'last_3_month_avg_like',
       'last_3_month_likes_count', 'last_3_month_topic_completion_rate',
       'last_3_month_completed_topic_users_count',
       'back_to_theory_times_per_user_session_avg_last_3_month',
       'back_to_theory_users_%_last_3_month', 'cleaned_texts', 'num_headings',
       'symbols_in_snippets', 'num_words', 'num_sentences', 'num_syllables',
       'norm_seconds', 'ASL', 'flesch_score', 'dale_chall_score'],
      dtype='object')

In [8]:
# we keep only one of the two back_to_theory features because they correlate a lot (0.97)
needed_features_w_like = [
    'norm_seconds',
    'last_3_month_completion_rate',
    'last_3_month_avg_like',
    'last_3_month_topic_completion_rate',
    'back_to_theory_users_%_last_3_month',
]

# the same features, in the same order, without the average like
needed_features_wo_like = [feature for feature in needed_features_w_like
                           if feature != 'last_3_month_avg_like']

In [9]:
topics_statistics_with_completions = topics_statistics[topics_statistics.last_3_month_completed_step_users_count > 20]
topics_statistics_with_likes = topics_statistics[topics_statistics.last_3_month_likes_count > 20]

In [18]:
def fill_in_df(topic_statistics, needed_features, step_id_topic_id_mapping):
    """Build a DataFrame with a ``topic_id`` column followed by the requested features.

    Every row of ``topic_statistics`` yields one output row: its ``step_id`` is
    translated to a topic id through ``step_id_topic_id_mapping`` and the values of
    ``needed_features`` are copied over. Row order follows ``topic_statistics``; the
    result gets a fresh default index.
    """
    source_rows = [source_row for _, source_row in topic_statistics.iterrows()]

    # column name -> list of values, with topic_id first
    columns = {'topic_id': [step_id_topic_id_mapping[source_row.step_id] for source_row in source_rows]}
    for feature in needed_features:
        columns[feature] = [source_row[feature] for source_row in source_rows]

    return pd.DataFrame(columns)

In [19]:
df_with_likes = fill_in_df(topics_statistics_with_likes, needed_features_w_like, step_id_topic_id_mapping)
assert df_with_likes.shape[0] == topics_statistics_with_likes.shape[0]
df_with_likes.head()

Unnamed: 0,topic_id,norm_seconds,last_3_month_completion_rate,last_3_month_avg_like,last_3_month_topic_completion_rate,back_to_theory_users_%_last_3_month
0,1313,4.185289,0.87,1.72,0.96,24.43
1,260,1.955792,0.88,1.74,0.86,20.11
2,604,4.487571,0.83,1.67,0.9,31.5
3,215,3.803515,0.86,1.83,0.82,23.08
4,1286,10.688952,0.76,1.83,0.74,58.91


In [20]:
df_wo_likes = fill_in_df(topics_statistics_with_completions, needed_features_wo_like, step_id_topic_id_mapping)
assert df_wo_likes.shape[0] == topics_statistics_with_completions.shape[0]
df_wo_likes.head()

Unnamed: 0,topic_id,norm_seconds,last_3_month_completion_rate,last_3_month_topic_completion_rate,back_to_theory_users_%_last_3_month
0,1235,9.764563,0.56,0.9,67.21
1,1313,4.185289,0.87,0.96,24.43
2,629,7.982623,0.45,0.89,69.41
3,603,5.029935,0.72,0.91,26.39
4,1685,8.795577,0.43,0.91,50.88


In [21]:
df_with_likes.to_csv('./new_cleaned/train_corpus/statistics_with_likes.csv', index=False)
df_wo_likes.to_csv('./new_cleaned/train_corpus/statistics_without_likes.csv', index=False)