# Initial Setup

In [1]:
import sys
sys.path.insert(0, "..")

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/jethro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jethro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jethro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jethro/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Data Processing
In this notebook, we will be processing the data in the ACM Dataset.

The first step is to understand the information we want to extract from our dataset.

In [5]:
PATH="/diskA/jethro/acm"

In [6]:
import glob
files = glob.glob(f'{PATH}/*.xml')
files[:5]

['/diskA/jethro/acm/PROC-PROMISE12-2012-2365324.xml',
 '/diskA/jethro/acm/PROC-SCOPES07-2007-1269843.xml',
 '/diskA/jethro/acm/PROC-WSC78-1978-800252.xml',
 '/diskA/jethro/acm/PROC-PMAM12-2012-2141702.xml',
 '/diskA/jethro/acm/PROC-SIGIR87-1987-42005.xml']

# Exploring the Data

In [7]:
from urop.xml import ACMCorpus

texts = ACMCorpus(files)

# Data Cleaning

In [8]:
from urop.nlp import clean_text
clean_texts = [clean_text(text) for text in texts]

2018-03-11 10:09:20,769 : INFO : Parsing /diskA/jethro/acm/PROC-PROMISE12-2012-2365324.xml
2018-03-11 10:09:20,831 : INFO : Parsing /diskA/jethro/acm/PROC-SCOPES07-2007-1269843.xml
2018-03-11 10:09:24,279 : INFO : Parsing /diskA/jethro/acm/PROC-WSC78-1978-800252.xml
2018-03-11 10:09:27,675 : INFO : Parsing /diskA/jethro/acm/PROC-PMAM12-2012-2141702.xml
2018-03-11 10:09:31,858 : INFO : Parsing /diskA/jethro/acm/PROC-SIGIR87-1987-42005.xml
2018-03-11 10:09:37,605 : INFO : Parsing /diskA/jethro/acm/PROC-AAMAS05-2005-1082473.xml
2018-03-11 10:10:11,665 : INFO : Parsing /diskA/jethro/acm/PROC-FORTH89-1989-73312.xml
2018-03-11 10:10:11,674 : INFO : Parsing /diskA/jethro/acm/PROC-SODA92-1992-139404.xml
2018-03-11 10:10:20,878 : INFO : Parsing /diskA/jethro/acm/PROC-SCCG07-2007-2614348.xml
2018-03-11 10:10:25,553 : INFO : Parsing /diskA/jethro/acm/PROC-ICSE01-2001-381473.xml
2018-03-11 10:10:38,841 : INFO : Parsing /diskA/jethro/acm/PROC-STOC79-1979-800135.xml
2018-03-11 10:10:44,868 : INFO : 

In [9]:
from gensim import corpora
dct = corpora.Dictionary(clean_texts)

2018-03-11 19:53:59,248 : INFO : 'pattern' package not found; tag filters are not available for English
2018-03-11 19:53:59,251 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-11 19:54:04,000 : INFO : adding document #10000 to Dictionary(471372 unique tokens: ['absence', 'absorption', 'abstraction', 'access', 'account']...)
2018-03-11 19:54:09,481 : INFO : adding document #20000 to Dictionary(843831 unique tokens: ['absence', 'absorption', 'abstraction', 'access', 'account']...)
2018-03-11 19:54:14,496 : INFO : adding document #30000 to Dictionary(1122489 unique tokens: ['absence', 'absorption', 'abstraction', 'access', 'account']...)
2018-03-11 19:54:19,644 : INFO : adding document #40000 to Dictionary(1405375 unique tokens: ['absence', 'absorption', 'abstraction', 'access', 'account']...)
2018-03-11 19:54:24,632 : INFO : adding document #50000 to Dictionary(1657547 unique tokens: ['absence', 'absorption', 'abstraction', 'access', 'account']...)
2018-03-11 19:54

In [10]:
dct.filter_extremes(no_below=10, no_above=0.5)
dct.save(f'{PATH}/dictionary.pkl')

2018-03-11 19:57:24,679 : INFO : discarding 2048255 tokens: [('achteren', 27), ('acm', 149620), ('analysis', 147456), ('application', 169694), ('approach', 168547), ('asearlyas', 2), ('aslateas', 3), ('businterface', 23), ('carinfotainment', 1), ('case', 169952)]...
2018-03-11 19:57:24,680 : INFO : keeping 100000 tokens which were in no less than 10 and no more than 133108 (=50.0%) documents
2018-03-11 19:57:25,126 : INFO : resulting dictionary: Dictionary(100000 unique tokens: ['absence', 'absorption', 'abstraction', 'access', 'account']...)
2018-03-11 19:57:25,227 : INFO : saving Dictionary object under /diskA/jethro/acm/dictionary.pkl, separately None
2018-03-11 19:57:25,346 : INFO : saved /diskA/jethro/acm/dictionary.pkl


In [11]:
corpus = [dct.doc2bow(text) for text in clean_texts]

In [12]:
from gensim.corpora.mmcorpus import MmCorpus
MmCorpus.serialize(f'{PATH}/corpus.mm', corpus)

2018-03-11 19:59:11,436 : INFO : storing corpus in Matrix Market format to /diskA/jethro/acm/corpus.mm
2018-03-11 19:59:12,288 : INFO : saving sparse matrix to /diskA/jethro/acm/corpus.mm
2018-03-11 19:59:12,288 : INFO : PROGRESS: saving document #0
2018-03-11 19:59:12,529 : INFO : PROGRESS: saving document #1000
2018-03-11 19:59:12,737 : INFO : PROGRESS: saving document #2000
2018-03-11 19:59:12,957 : INFO : PROGRESS: saving document #3000
2018-03-11 19:59:13,161 : INFO : PROGRESS: saving document #4000
2018-03-11 19:59:13,442 : INFO : PROGRESS: saving document #5000
2018-03-11 19:59:13,746 : INFO : PROGRESS: saving document #6000
2018-03-11 19:59:14,031 : INFO : PROGRESS: saving document #7000
2018-03-11 19:59:14,252 : INFO : PROGRESS: saving document #8000
2018-03-11 19:59:14,516 : INFO : PROGRESS: saving document #9000
2018-03-11 19:59:14,776 : INFO : PROGRESS: saving document #10000
2018-03-11 19:59:15,000 : INFO : PROGRESS: saving document #11000
2018-03-11 19:59:15,295 : INFO : 