# Dataset creation

In [1]:
from google.colab import drive 

drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
!mkdir -p input

In [3]:
!cp /content/gdrive/MyDrive/itmo_master/3/DT_NLP/dataset/*.csv input && ls -la

total 24
drwxr-xr-x 1 root root 4096 Jan 12 19:16 .
drwxr-xr-x 1 root root 4096 Jan 12 16:43 ..
drwxr-xr-x 1 root root 4096 Jan  8 17:11 .config
drwx------ 5 root root 4096 Jan 12 19:16 gdrive
drwxr-xr-x 2 root root 4096 Jan 12 16:56 input
drwxr-xr-x 1 root root 4096 Jan  6 18:10 sample_data


In [4]:
!cp /content/gdrive/MyDrive/itmo_master/3/DT_NLP/dataset/*.zip input

In [5]:
!unzip input/Questions -d ./input/ 

Archive:  input/Questions.zip
replace ./input/Questions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [6]:
!ls -lRa ./input/

./input/:
total 2522224
drwxr-xr-x 2 root root       4096 Jan 12 16:56 .
drwxr-xr-x 1 root root       4096 Jan 12 19:16 ..
-rw-r--r-- 1 root root 1923682009 Oct  8  2019 Questions.csv
-rw------- 1 root root  593576602 Jan 12 19:17 Questions.zip
-rw------- 1 root root   65475836 Jan 12 19:16 Tags.csv


Prepare dataset

In [7]:
import pandas as pd

In [8]:
# Read questions
df = pd.read_csv("./input/Questions.csv", encoding="ISO-8859-1")
df.head(5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [9]:
# Read tags
tags = pd.read_csv("./input/Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})
tags.head(5)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [10]:
selected_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']


In [11]:
df.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)

In [12]:
def get_tag(tags):
  for tag in tags:
    if tag in selected_tags:
      return tag
  return None

tags['Tag'] = tags['Tag'].astype(str)
grouped_tags = tags.groupby("Id")['Tag'].apply(lambda tags: get_tag(tags))
grouped_tags.head(5)

Id
80     None
90     None
120     sql
180    None
260      c#
Name: Tag, dtype: object

In [13]:
grouped_tags = grouped_tags.dropna()
grouped_tags.head(5), grouped_tags.shape

(Id
 120     sql
 260      c#
 330     c++
 470    .net
 650      c#
 Name: Tag, dtype: object, (850988,))

In [14]:
grouped_tags.reset_index()
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags})
grouped_tags_final.drop(columns=['Id'], inplace=True)
grouped_tags_final.head(5)

Unnamed: 0_level_0,Tags
Id,Unnamed: 1_level_1
120,sql
260,c#
330,c++
470,.net
650,c#


In [15]:
df = df.merge(grouped_tags_final, on='Id')
df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
0,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql
1,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c#
2,330,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,c++
3,470,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,.net
4,650,79,Automatically update version number,<p>I would like the version property of my app...,c#


# Preprocessing

In [18]:
!pip install nltk
!pip install beautifulsoup4



In [20]:
!nltk.download('stopwords')

/bin/bash: -c: line 0: syntax error near unexpected token `'stopwords''
/bin/bash: -c: line 0: `nltk.download('stopwords')'


In [34]:
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    text = re.sub('<code>(.|\n)*?<\/code>', '', text)
    return text


In [41]:
new_df = df
new_df.dropna(subset=['Tags'], inplace=True)
df.head(5), new_df.shape

(    Id  Score  ...                                               Body  Tags
 0  120     21  ...  <p>Has anyone got experience creating <strong>...   sql
 1  260     49  ...  <p>I have a little game written in C#. It uses...    c#
 2  330     29  ...  <p>I am working on a collection of classes use...   c++
 3  470     13  ...  <p>I've been writing a few web services for a ...  .net
 4  650     79  ...  <p>I would like the version property of my app...    c#
 
 [5 rows x 5 columns], (850988, 5))

In [42]:
new_df = new_df.sample(n=20000)
new_df['Body'] = new_df['Body'].apply(clean_text)
new_df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
409145,20811050,0,ENTITY RELATION ? How to relate when one of th...,owners owns taxis owners provide taxis cab com...,sql
414866,21078310,3,How can I style layer.control in leaflet.js?,trying change default dropdown menu icon layer...,javascript
786219,37473620,0,Proper way to destroy linked List?,destroy linked list completely instance right ...,c
626733,30689310,2,Anonymous Function Causing Problems,thing giving problems executing anonymous func...,javascript
62513,4069930,3,ASP.NET MVC State Management,working net mvc web application bunch web part...,asp.net


In [43]:
new_df['Title'] = new_df['Title'].apply(clean_text)
new_df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
409145,20811050,0,entity relation relate one component entity co...,owners owns taxis owners provide taxis cab com...,sql
414866,21078310,3,style layercontrol leafletjs,trying change default dropdown menu icon layer...,javascript
786219,37473620,0,proper way destroy linked list,destroy linked list completely instance right ...,c
626733,30689310,2,anonymous function causing problems,thing giving problems executing anonymous func...,javascript
62513,4069930,3,aspnet mvc state management,working net mvc web application bunch web part...,asp.net


In [44]:
new_df['Body'] = new_df[['Title', 'Body']].agg(' '.join, axis=1)
new_df.drop(columns=['Title'], inplace=True)
new_df.head(5)

Unnamed: 0,Id,Score,Body,Tags
409145,20811050,0,entity relation relate one component entity co...,sql
414866,21078310,3,style layercontrol leafletjs trying change def...,javascript
786219,37473620,0,proper way destroy linked list destroy linked ...,c
626733,30689310,2,anonymous function causing problems thing givi...,javascript
62513,4069930,3,aspnet mvc state management working net mvc we...,asp.net


In [48]:
from sklearn.model_selection import train_test_split

X = new_df['Body']
y = new_df['Tags']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [49]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((796,), (342,), (796,), (342,))

In [55]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=selected_tags))

accuracy 0.5847953216374269
               precision    recall  f1-score   support

         java       0.00      0.00      0.00         6
         html       0.78      0.78      0.78        36
      asp.net       0.00      0.00      0.00         4
           c#       0.00      0.00      0.00         7
ruby-on-rails       0.12      0.14      0.13         7
       jquery       0.47      0.55      0.50        51
        mysql       0.74      0.57      0.65        40
          php       0.33      0.33      0.33         3
          ios       0.00      0.00      0.00         3
   javascript       0.50      0.36      0.42        11
       python       0.36      0.29      0.32        14
            c       0.65      0.82      0.72        49
          css       0.47      0.75      0.58        28
      android       0.50      0.40      0.44         5
       iphone       0.50      0.40      0.44         5
          sql       0.00      0.00      0.00         8
  objective-c       0.72      0.54  

  _warn_prf(average, modifier, msg_start, len(result))
