# Dataset creation

In [None]:
from google.colab import drive 

# Mount Google Drive at /content/gdrive so the dataset files stored there can be copied.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is already mounted - confirm.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Create the local ./input working directory (-p: no error if it already exists).
!mkdir -p input

In [None]:
# Copy the dataset CSV files from the mounted Drive folder into ./input, then list the working directory.
!cp /content/gdrive/MyDrive/itmo_master/3/DT_NLP/dataset/*.csv input && ls -la

total 24
drwxr-xr-x 1 root root 4096 Jan 17 18:13 .
drwxr-xr-x 1 root root 4096 Jan 17 18:08 ..
drwxr-xr-x 1 root root 4096 Jan  8 17:11 .config
drwx------ 5 root root 4096 Jan 17 18:13 gdrive
drwxr-xr-x 2 root root 4096 Jan 17 18:13 input
drwxr-xr-x 1 root root 4096 Jan  6 18:10 sample_data


In [None]:
# Copy the zipped archives from the same Drive folder into ./input.
!cp /content/gdrive/MyDrive/itmo_master/3/DT_NLP/dataset/*.zip input

In [None]:
# Extract Questions.csv into ./input. -o overwrites an existing copy without asking, so
# re-running this cell does not stop at unzip's interactive "replace?" prompt.
!unzip -o input/Questions.zip -d ./input/

Archive:  input/Questions.zip
  inflating: ./input/Questions.csv   


In [None]:
# Recursively list ./input (including hidden entries) to check that the files arrived.
!ls -lRa ./input/

./input/:
total 2522228
drwxr-xr-x 2 root root       4096 Jan 17 18:13 .
drwxr-xr-x 1 root root       4096 Jan 17 18:13 ..
-rw-r--r-- 1 root root 1923682009 Oct  8  2019 Questions.csv
-rw------- 1 root root  593576602 Jan 17 18:13 Questions.zip
-rw------- 1 root root   65475836 Jan 17 18:13 Tags.csv


## Prepare dataset

In [None]:
import pandas as pd

In [None]:
# Load the questions table; the file is decoded as ISO-8859-1 (Latin-1).
questions_csv = "./input/Questions.csv"
df = pd.read_csv(questions_csv, encoding="ISO-8859-1")
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [None]:
# Read tags
# keep_default_na=False: by default pandas parses strings such as "null", "nan" or "NA"
# as missing values, which would turn real tags like "null" into NaN.
tags = pd.read_csv(
    "./input/Tags.csv",
    encoding="ISO-8859-1",
    dtype={'Tag': str},
    keep_default_na=False,
)
tags.head(5)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [None]:
# Collapse the one-row-per-tag table into a single '&'-separated tag string per question Id
tags['Tag'] = tags['Tag'].astype(str)
tags_by_question = tags.groupby("Id")['Tag']
grouped_tags = tags_by_question.apply(lambda group: '&'.join(group))
grouped_tags.head()

Id
80                            flex&actionscript-3&air
90       svn&tortoisesvn&branch&branching-and-merging
120                               sql&asp.net&sitemap
180    algorithm&language-agnostic&colors&color-space
260           c#&.net&scripting&compiler-construction
Name: Tag, dtype: object

In [None]:
# Turn the Id-indexed Series into a two-column DataFrame (Id, Tags) ready for merging.
# reset_index(name=...) does this in one step; the earlier bare reset_index() call only
# built a copy that was thrown away.
grouped_tags_final = grouped_tags.reset_index(name='Tags')
grouped_tags_final.head(5)

Unnamed: 0,Id,Tags
0,80,flex&actionscript-3&air
1,90,svn&tortoisesvn&branch&branching-and-merging
2,120,sql&asp.net&sitemap
3,180,algorithm&language-agnostic&colors&color-space
4,260,c#&.net&scripting&compiler-construction


In [None]:
# Merge dataframes
# Selecting the needed columns (instead of dropping the others in place) keeps this cell
# re-runnable: a second run starts again from Id/Score/Title/Body and re-attaches Tags,
# rather than failing on already-dropped columns or producing Tags_x/Tags_y.
df = df[['Id', 'Score', 'Title', 'Body']].merge(grouped_tags_final, on='Id')
df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex&actionscript-3&air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn&tortoisesvn&branch&branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql&asp.net&sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm&language-agnostic&colors&color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c#&.net&scripting&compiler-construction


In [None]:
# Remove Questions with low score (bad quality)
MIN_SCORE = 5  # a question is kept only if its score is strictly greater than this

# .copy() makes new_df an independent frame, so later column assignments neither raise
# SettingWithCopyWarning nor depend on df.
new_df = df[df['Score'] > MIN_SCORE].copy()
# .iloc[0] shows the first remaining row even if the row labelled 0 was filtered out.
new_df.shape, new_df['Body'].iloc[0]

((72950, 5),
 '<p>I\'ve written a database generation script in <a href="http://en.wikipedia.org/wiki/SQL">SQL</a> and want to execute it in my <a href="http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime">Adobe AIR</a> application:</p>\n\n<pre><code>Create Table tRole (\n      roleID integer Primary Key\n      ,roleName varchar(40)\n);\nCreate Table tFile (\n    fileID integer Primary Key\n    ,fileName varchar(50)\n    ,fileDescription varchar(500)\n    ,thumbnailID integer\n    ,fileFormatID integer\n    ,categoryID integer\n    ,isFavorite boolean\n    ,dateAdded date\n    ,globalAccessCount integer\n    ,lastAccessTime date\n    ,downloadComplete boolean\n    ,isNew boolean\n    ,isSpotlight boolean\n    ,duration varchar(30)\n);\nCreate Table tCategory (\n    categoryID integer Primary Key\n    ,categoryName varchar(50)\n    ,parent_categoryID integer\n);\n...\n</code></pre>\n\n<p>I execute this in Adobe AIR using the following methods:</p>\n\n<pre><code>public static function 

# Dataset preprocessing

In [None]:
# The PyPI distribution is named "scikit-learn"; "sklearn" is only a deprecated placeholder.
# %pip installs into the environment of the running kernel.
%pip install -q scikit-learn



In [None]:
# Drop questions that have no tags. Rebinding the name (instead of inplace=True) avoids
# mutating a frame that may be a slice of df.
new_df = new_df.dropna(subset=['Tags'])
new_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


(72950, 5)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q beautifulsoup4



In [None]:
# Remove html tags
from bs4 import BeautifulSoup
import re

def clean_text(text):
  '''
  Return the lower-cased plain text of an HTML fragment with its code blocks removed.

  text - HTML string (e.g. the body of a question)

  Every <code>...</code> element is dropped together with its content, the remaining
  markup is stripped, and newlines are replaced by spaces.
  '''
  text = text.lower()
  # Raw string: the previous '<\/code>' relied on an invalid escape sequence.
  # DOTALL lets '.' span newlines, replacing the slower '(.|\n)' alternation;
  # the match stays non-greedy so each code block is removed on its own.
  text = re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
  # Name the parser explicitly: otherwise bs4 picks whichever parser is installed
  # (results can differ between machines) and emits a warning.
  text = BeautifulSoup(text, 'html.parser').get_text()
  return text.replace('\n', ' ')

# Replace each HTML body with its cleaned text. assign() returns a new frame, so no column
# is written into a slice of another DataFrame (no SettingWithCopyWarning).
new_df = new_df.assign(Body=new_df['Body'].map(clean_text))
# .iloc[0] shows the first row regardless of which index labels survived filtering.
new_df['Body'].iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


"i've written a database generation script in sql and want to execute it in my adobe air application:  i execute this in adobe air using the following methods:  no errors are generated, however only  exists. it seems that it only looks at the first query (up to the semicolon- if i remove it, the query fails). is there a way to call multiple queries in one statement? "

In [None]:
# Merge title with body
# Vectorised string concatenation replaces the row-by-row agg(axis=1); assign() returns a
# new frame, so no column is written into a slice of another DataFrame.
# NOTE: running this cell twice prepends the title twice - re-run the cleaning cell first.
new_df = new_df.assign(Body=new_df['Title'].str.cat(new_df['Body'], sep=' '))
new_df['Body'].iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


"SQLStatement.execute() - multiple queries in one statement i've written a database generation script in sql and want to execute it in my adobe air application:  i execute this in adobe air using the following methods:  no errors are generated, however only  exists. it seems that it only looks at the first query (up to the semicolon- if i remove it, the query fails). is there a way to call multiple queries in one statement? "

# Main modeling methods

In [None]:
# %pip installs into the environment of the running kernel; -q keeps the download log out
# of the notebook. The recorded run of this notebook resolved to sentence-transformers 0.4.1.2.
%pip install -q sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d83164149778a33b6bdd1a74e1bcb59b2b2cd1b861359b339/sentence-transformers-0.4.1.2.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.8MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/40/866cbfac4601e0f74c7303d533a9c5d4a53858bd402e08e3e294dd271f25/transformers-4.2.1-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 9.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 44.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     

In [None]:
# Form n-grams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

n_gram_range = (1, 1)  # passed to CountVectorizer as ngram_range: candidates are single words
top_n = 5  # number of highest-similarity candidates predict_keywords returns

# Loaded SentenceTransformer instances keyed by model name, so each model is loaded once
# instead of on every predict_keywords call.
_MODEL_CACHE = {}

def predict_keywords(doc, model = 'distilbert-base-nli-mean-tokens'):
  '''
  Pick the words of a document whose embeddings are closest to the document's embedding.

  doc   - text to extract keywords from
  model - name of the sentence-transformers model used for the embeddings

  Returns a list of at most top_n candidate words, ordered by increasing cosine
  similarity (the best match is last). Returns an empty list when the document has
  no usable words (e.g. it is empty or consists only of stop words).
  '''
  try:
    # 'english' selects scikit-learn's built-in English stop-word list (the same words as
    # ENGLISH_STOP_WORDS); newer scikit-learn releases reject a frozenset here.
    count = CountVectorizer(ngram_range=n_gram_range, stop_words='english').fit([doc])
  except ValueError:
    # fit() raises ValueError when the vocabulary is empty; there is nothing to rank.
    return []

  # get_feature_names() was removed in newer scikit-learn releases; use its replacement
  # when available and fall back on older installations.
  if hasattr(count, 'get_feature_names_out'):
    candidates = count.get_feature_names_out().tolist()
  else:
    candidates = count.get_feature_names()

  encoder = _MODEL_CACHE.get(model)
  if encoder is None:
    encoder = _MODEL_CACHE[model] = SentenceTransformer(model)

  doc_embedding = encoder.encode([doc])
  candidate_embeddings = encoder.encode(candidates)
  distances = cosine_similarity(doc_embedding, candidate_embeddings)
  # argsort is ascending, so the last top_n indices are the most similar candidates.
  keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
  return keywords

# Evaluation

In [None]:
# Evaluation
from operator import eq 

def compute_diff(expected_keywords, actual_keywords):
  '''
  Count how the predicted keywords agree with the expert-assigned ones.

  Both collections are treated as sets, so a keyword repeated in either of them is
  counted once.

  Returns a tuple (tp, fp, fn):
    tp - true positives: keywords present in both collections
    fp - false positives: predicted keywords the experts did not assign
    fn - false negatives: expert keywords that were not predicted
  True negatives are not computed.
  '''
  expected = set(expected_keywords)
  actual = set(actual_keywords)
  tp = len(expected & actual)
  fp = len(actual) - tp
  fn = len(expected) - tp
  return (tp, fp, fn)

def evaluate(tp, fp, fn):
  '''
  Compute precision, recall and F1 from true-positive, false-positive and
  false-negative counts.

  A ratio whose denominator is zero (nothing predicted, or nothing expected) is
  reported as 0.0 instead of raising ZeroDivisionError.

  Returns a tuple (precision, recall, f1).
  '''
  predicted_total = tp + fp
  expected_total = tp + fn
  precision = tp / predicted_total if predicted_total else 0.0
  recall = tp / expected_total if expected_total else 0.0
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
  return (precision, recall, f1)

In [None]:
# Fixed seed so the evaluation sample, and therefore the reported scores, can be reproduced.
# drop() without inplace returns a new frame and leaves new_df untouched.
test_df = (
    new_df
    .sample(n=1000, random_state=42)
    .drop(columns=['Id', 'Score', 'Title'])
)
test_df.head(5)

Unnamed: 0,Body,Tags
28524,Blogging with R: easy way to embed R in a blog...,wordpress&r&blogs
9057,How to alloc a dynamic typed object i have see...,objective-c&dynamic&allocation
341542,Find rank of a decimal number based on functio...,c&performance&algorithm&bit-manipulation&time-...
345808,Why some arithmetic operations take more time ...,c&performance&time
32561,Strange Java cast exception. Why can't I cast ...,java&casting


In [None]:
# Accumulate true-positive / false-positive / false-negative counts over every sampled
# question; progress is reported every 100 rows and the first row is echoed as an example.
tp = fp = fn = 0
for i, (body, tag_string) in enumerate(zip(test_df['Body'], test_df['Tags']), start=1):
  expected_keywords = tag_string.split('&')
  predicted_keywords = predict_keywords(body)
  hits, spurious, missed = compute_diff(expected_keywords, predicted_keywords)
  tp, fp, fn = tp + hits, fp + spurious, fn + missed
  if i % 100 == 0:
    print('Executed iteration: ', i)
  if i == 1:
    print('Expected keywords: ', expected_keywords, ' Predicted keywords: ', predicted_keywords)

tp, fp, fn

100%|██████████| 245M/245M [00:13<00:00, 17.9MB/s]


Expected keywords:  ['wordpress', 'r', 'blogs']  Predicted keywords:  ['wordpress', 'mediawiki', 'wiki', 'blog', 'blogging']
Executed iteration:  100
Executed iteration:  200
Executed iteration:  300
Executed iteration:  400
Executed iteration:  500
Executed iteration:  600
Executed iteration:  700
Executed iteration:  800
Executed iteration:  900
Executed iteration:  1000


(692, 4304, 2352)

In [None]:
# Precision, recall and F1 computed from the counts summed over all sampled questions
# (i.e. micro-averaged).
evaluate(tp, fp, fn)

(0.13851080864691753, 0.22733245729303547, 0.1721393034825871)