# Dataset creation

In [27]:
from google.colab import drive 

drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [28]:
!mkdir -p input

In [29]:
!cp /content/gdrive/MyDrive/itmo_master/3/DT_NLP/dataset/*.csv input && ls -la

total 24
drwxr-xr-x 1 root root 4096 Jan 13 14:06 .
drwxr-xr-x 1 root root 4096 Jan 13 13:48 ..
drwxr-xr-x 1 root root 4096 Jan  8 17:11 .config
drwx------ 5 root root 4096 Jan 13 14:06 gdrive
drwxr-xr-x 2 root root 4096 Jan 13 13:51 input
drwxr-xr-x 1 root root 4096 Jan  6 18:10 sample_data


In [None]:
!cp /content/gdrive/MyDrive/itmo_master/3/DT_NLP/dataset/*.zip input

In [31]:
!unzip input/Questions -d ./input/ 

Archive:  input/Questions.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of input/Questions or
        input/Questions.zip, and cannot find input/Questions.ZIP, period.


In [32]:
!ls -lRa ./input/

./input/:
total 2213272
drwxr-xr-x 2 root root       4096 Jan 13 13:51 .
drwxr-xr-x 1 root root       4096 Jan 13 14:06 ..
-rw-r--r-- 1 root root 1923682009 Oct  8  2019 Questions.csv
-rw------- 1 root root  277217280 Jan 13 14:06 Questions.zip
-rw------- 1 root root   65475836 Jan 13 14:06 Tags.csv


In [1]:
import pandas as pd

In [88]:
# Read questions
df = pd.read_csv("./input/Questions.csv", encoding="ISO-8859-1")
df.head(5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [89]:
# Read tags
tags = pd.read_csv("./input/Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})
tags.head(5)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [90]:
# Programming languages/frameworks/etc
#selected_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']
NUM = 20
selected_tags = tags['Tag'].value_counts()[:NUM].index.tolist()
selected_tags[:10]

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios']

In [91]:
df.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)

In [92]:
def get_tag(tags):
  for tag in tags:
    if tag in selected_tags:
      return tag
  return None

tags['Tag'] = tags['Tag'].astype(str)
grouped_tags = tags.groupby("Id")['Tag'].apply(lambda tags: get_tag(tags))
grouped_tags.head(5)

Id
80     None
90     None
120     sql
180    None
260      c#
Name: Tag, dtype: object

In [93]:
grouped_tags = grouped_tags.dropna()
grouped_tags.head(5), grouped_tags.shape

(Id
 120     sql
 260      c#
 330     c++
 470    .net
 650      c#
 Name: Tag, dtype: object, (850988,))

In [94]:
grouped_tags.reset_index()
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags})
grouped_tags_final.drop(columns=['Id'], inplace=True)
grouped_tags_final.head(5)

Unnamed: 0_level_0,Tags
Id,Unnamed: 1_level_1
120,sql
260,c#
330,c++
470,.net
650,c#


In [95]:
df = df.merge(grouped_tags_final, on='Id')
df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
0,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql
1,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c#
2,330,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,c++
3,470,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,.net
4,650,79,Automatically update version number,<p>I would like the version property of my app...,c#


# Preprocessing

In [96]:
!pip install nltk
!pip install beautifulsoup4



In [97]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = re.sub(r'\d+', '', text) # remove digits
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    text = re.sub('<code>(.|\n)*?<\/code>', '', text)
    return text


In [99]:
new_df = df
new_df.dropna(subset=['Tags'], inplace=True)
df.head(5), new_df.shape

(    Id  Score  ...                                               Body  Tags
 0  120     21  ...  <p>Has anyone got experience creating <strong>...   sql
 1  260     49  ...  <p>I have a little game written in C#. It uses...    c#
 2  330     29  ...  <p>I am working on a collection of classes use...   c++
 3  470     13  ...  <p>I've been writing a few web services for a ...  .net
 4  650     79  ...  <p>I would like the version property of my app...    c#
 
 [5 rows x 5 columns], (850988, 5))

In [100]:
new_df = new_df.sample(n=40000)
new_df['Body'] = new_df['Body'].apply(clean_text)
new_df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
681327,33127350,1,How to set servlet-mapping in servlets corectl...,using servlets jspi trying delete order web ap...,java
707561,34193980,0,#import <SenTestingKit/SenTestingKit.h> not found,#importsentestingkit sentestingkithframework f...,ios
649442,31700560,0,Error getting android attriblute value in cust...,creating customview textview button change cus...,android
210420,11328320,3,"Nested Form, ""Can't mass-assign protected attr...",relevant part nested formdiv classfield ffield...,ruby-on-rails
627015,30701230,0,Angularfire facebook user_friends,using facebook authentication angularfire dont...,angularjs


In [101]:
new_df['Title'] = new_df['Title'].apply(clean_text)
new_df.head(5)

Unnamed: 0,Id,Score,Title,Body,Tags
681327,33127350,1,set servletmapping servlets corectly get id va...,using servlets jspi trying delete order web ap...,java
707561,34193980,0,#import found,#importsentestingkit sentestingkithframework f...,ios
649442,31700560,0,error getting android attriblute value customv...,creating customview textview button change cus...,android
210420,11328320,3,nested form cant massassign protected attributes,relevant part nested formdiv classfield ffield...,ruby-on-rails
627015,30701230,0,angularfire facebook user_friends,using facebook authentication angularfire dont...,angularjs


In [102]:
new_df['Body'] = new_df[['Title', 'Body']].agg(' '.join, axis=1)
new_df.drop(columns=['Title'], inplace=True)
new_df.head(5)

Unnamed: 0,Id,Score,Body,Tags
681327,33127350,1,set servletmapping servlets corectly get id va...,java
707561,34193980,0,#import found #importsentestingkit sentestingk...,ios
649442,31700560,0,error getting android attriblute value customv...,android
210420,11328320,3,nested form cant massassign protected attribut...,ruby-on-rails
627015,30701230,0,angularfire facebook user_friends using facebo...,angularjs


# Building the model + evaluation

In [103]:
from sklearn.model_selection import train_test_split

X = new_df['Body']
y = new_df['Tags']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [104]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((28000,), (12000,), (28000,), (12000,))

In [105]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

sgd = Pipeline([('vect', TfidfVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=selected_tags))

accuracy 0.7243333333333334
               precision    recall  f1-score   support

   javascript       0.40      0.02      0.04       106
         java       0.69      0.92      0.79       998
           c#       0.54      0.14      0.22       148
          php       0.46      0.12      0.19       188
      android       0.75      0.62      0.68       252
       jquery       0.74      0.76      0.75      1437
       python       0.78      0.66      0.71       608
         html       0.35      0.19      0.25       147
          c++       0.60      0.29      0.39       315
          ios       0.63      0.82      0.71       491
        mysql       0.59      0.20      0.30       229
          css       0.81      0.82      0.81      1703
          sql       0.65      0.81      0.72      1701
      asp.net       0.60      0.24      0.34       422
  objective-c       0.75      0.46      0.57       294
ruby-on-rails       0.67      0.09      0.16       151
         .net       0.73      0.91  

In [106]:
#from sklearn.externals import joblib
#joblib.dump(sgd, 'model.pkl')