In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Directory layout; every path below is built relative to root_dir.
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'  # sub-folder of data_dir holding the input corpus
terms_dir = 'terms'    # sub-folder of data_dir (used later for the ranking output path)

# The corpus workbook name is derived from the dataset name.
dataset_name = 'stackoverflow'
filename = f'{dataset_name}_corpus.xlsx'

# <root_dir>/<data_dir>/<corpus_dir>/<filename>
corpus_folder = os.path.join(root_dir, data_dir, corpus_dir)
filepath = os.path.join(corpus_folder, filename)

In [3]:
# Load the corpus workbook; the first spreadsheet column becomes the index.
A = pd.read_excel(io=filepath, index_col=0)

In [4]:
# Preview the first rows of the loaded corpus.
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,truncate string net given string projects mult...,.net
1,1,net_windows_application automatically call pro...,.net
2,2,c_convert string unique id let assume_string_n...,.net
3,3,consolewriteline output webservice go consolew...,.net
4,4,net_applications_immune_classic_pointer_errors...,.net


In [5]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Build one "pseudo-document" per label by concatenating every chunk_doc that
# carries that label (space separated, with a leading space).
#
# Chunks are collected in lists and joined once per label: repeatedly doing
# `+=` on a string stored in a dict re-copies the growing string, and
# DataFrame.iterrows() builds a Series for every row.
chunks_by_label = defaultdict(list)
for label, chunk in zip(A['label'], A['chunk_doc']):
    # str() keeps non-string cells from breaking the join.
    # NOTE(review): a missing cell (NaN) becomes the literal token "nan" here,
    # exactly as before -- confirm whether such rows should be dropped instead.
    chunks_by_label[label].append(str(chunk))

# Labels keep their first-appearance order; missing keys still default to "".
pseudo_docs = defaultdict(str)
for label, chunks in chunks_by_label.items():
    pseudo_docs[label] = " " + " ".join(chunks)

In [7]:
# Parallel lists: entities[k] is the label whose pseudo-document is pdocs[k].
entities = [label for label in pseudo_docs]
pdocs = list(map(pseudo_docs.__getitem__, entities))

In [8]:
# Fit TF-IDF on the per-label pseudo-documents: one row per label, one column
# per vocabulary term.
vectorizer = TfidfVectorizer(min_df=1)
tfidf_sparse = vectorizer.fit_transform(pdocs)

# Dense copy, so each row can be indexed as a plain 1-D array of term weights.
X = tfidf_sparse.toarray()

In [9]:
# Vocabulary terms, aligned with the columns of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The fallback keeps the cell
# working on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

# ranking[label] -> list of (term, tf-idf weight) covering the whole vocabulary,
# ordered by decreasing weight.
ranking = {}
for row_idx, entity in enumerate(entities):
    weights = X[row_idx]
    # Stable sort on the negated weights: highest weight first, ties keep
    # vocabulary order (the same ordering a stable sorted() on -weight gives).
    order = np.argsort(-weights, kind='stable')
    ranking[entity] = [(features[col_idx], weights[col_idx]) for col_idx in order]

In [10]:
# Print the five highest-ranked terms (weight rounded to 2 decimals) per label,
# with a blank line between labels.
for entity in entities:
    top_terms = ranking[entity][:5]
    print(entity)
    for pair in top_terms:
        print(pair[0], round(pair[1], 2))
    print()

.net
net 0.21
using 0.19
like 0.17
get 0.16
cangrowtrue_cangrow 0.15

android
wrap_content 0.35
androidlayout_height 0.29
androidlayout_width 0.29
savedinstancestate 0.18
intent 0.17

angularjs
div 0.47
td_td0 0.4
angularjs 0.23
angular 0.17
td 0.15

asp.net
runat_server 0.42
id 0.35
div 0.24
itemtemplate 0.24
asptemplatefield 0.16

c
printf 0.45
int 0.31
struct 0.27
include 0.18
stdioh 0.16

c#
get 0.29
using 0.23
set 0.22
xmlelement_elementname 0.2
int 0.17

c++
cout 0.55
int 0.24
endl 0.21
include 0.21
stdbasic_ostream_elem 0.2

css
div 0.51
css 0.22
padding 0.19
ul 0.18
0px 0.15

html
div 0.45
td_tdvalue 0.35
lia_href 0.28
want 0.14
li 0.14

ios
ios 0.26
app 0.2
nsstring 0.19
get 0.17
want 0.16

iphone
iphone 0.25
want 0.18
like 0.17
app 0.12
get 0.12

java
systemoutprintln 0.23
public 0.23
public_void 0.2
static 0.18
int 0.17

javascript
_0x3eb4 0.45
var 0.32
javascript 0.2
function 0.18
plays 0.18

jquery
div 0.44
div_style_textalign 0.35
td_align 0.31
td 0.23
abbr 0.2

mysql
tin

## Build a DataFrame out of the ranking data

In [11]:
# One frame per label, holding that label's terms in ranking order.
# Each frame keeps its own 0..N-1 index, i.e. the term's position in the ranking.
df_list = []
for entity, terms_tfidf in ranking.items():
    terms = [pair[0] for pair in terms_tfidf]  # the weights are not kept
    labels = [entity] * len(terms)
    df_list.append(pd.DataFrame({'label': labels, 'term': terms}))

In [12]:
# Stack the per-label frames vertically and make sure every term is a str.
df = pd.concat(df_list, axis=0)
df = df.assign(term=df['term'].astype(str))
df.head()

Unnamed: 0,label,term
0,.net,net
1,.net,using
2,.net,like
3,.net,get
4,.net,cangrowtrue_cangrow


In [13]:
# (rows, columns) of the full label/term table.
df.shape

(1059460, 2)

---
## Sample data (keep the top-n ranked terms per label)

In [14]:
# Number of terms to keep per label.
n = 1000

# Rows inside each group keep their ranking order, so the first n rows of a
# group are that label's n top-ranked terms.
grouped_df = df.groupby('label')
sampled_data_list = [label_rows.head(n) for _, label_rows in grouped_df]

In [15]:
# Recombine the per-label slices into one table and keep `term` as str.
sampled_df = pd.concat(sampled_data_list)
sampled_df = sampled_df.assign(term=sampled_df['term'].astype(str))
sampled_df.head()

Unnamed: 0,label,term
0,.net,net
1,.net,using
2,.net,like
3,.net,get
4,.net,cangrowtrue_cangrow


In [16]:
# (rows, columns) after keeping at most n terms per label.
sampled_df.shape

(20000, 2)

In [17]:
# Distinct labels present in the reduced table.
sampled_df.label.unique()

array(['.net', 'android', 'angularjs', 'asp.net', 'c', 'c#', 'c++', 'css',
       'html', 'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby-on-rails', 'sql'],
      dtype=object)

---
## Save ranking to a file

In [18]:
# Output workbook: <root_dir>/<data_dir>/<terms_dir>/ranking_<dataset_name>_baseline.xlsx
ranking_filename = f'ranking_{dataset_name}_baseline.xlsx'
terms_folder = os.path.join(root_dir, data_dir, terms_dir)
ranking_filepath = os.path.join(terms_folder, ranking_filename)

In [19]:
# Show the resolved output path before writing.
ranking_filepath

'../../data/terms/ranking_stackoverflow_baseline.xlsx'

In [20]:
# Make sure the target folder exists, so the write does not fail just because
# the directory has not been created yet.
os.makedirs(os.path.dirname(ranking_filepath), exist_ok=True)

# Write the reduced label/term table; the index (each term's position within
# its label's ranking) is written as the first column.
sampled_df.to_excel(ranking_filepath)

---