# Data preparation (Text Mining)

## Library dependencies

In [1]:
import sqlite3
import nltk
import matplotlib.pyplot as plt
from collections import OrderedDict
import random

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Sqlite storage

In [3]:
conn = sqlite3.connect('reddit.db')
c = conn.cursor()

## Stopwords

In [4]:
stopwords = nltk.corpus.stopwords.words('english')
print stopwords

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

## Filtering Function

In [5]:
def wordFilter(excluded,wordrow): 
    filtered = [word for word in wordrow if word not in excluded]
    return filtered  

## LowerCasing

In [6]:
def lowerCaseArray(wordrow): 
    lowercased = [word.lower() for word in wordrow]
    return lowercased  

## Process data 1

In [8]:
def data_processing(sql):
    c.execute(sql) #1
    data = {'wordMatrix':[],'all_words':[]}
    row = c.fetchone() #2
    while row is not None:
        wordrow = nltk.tokenize.word_tokenize(row[0]+" "+row[1]) #3
        wordrow_lowercased = lowerCaseArray(wordrow)
        wordrow_nostopwords = wordFilter(stopwords,wordrow_lowercased)
        data['all_words'].extend(wordrow_nostopwords) #4
        data['wordMatrix'].append(wordrow_nostopwords) #5
        row = c.fetchone()#6 
    return data
#1 create pointer to the sqlite data
#2 fetch the data row by row
#3 row[0] is the title, row[1] is the topic text, we turn them into a single text blob
#4 we will use  data['all_words'] for some data exploration
#5 data['wordMatrix'] is a matrix comprised of wordvectors, 1 vector per document.
#6 get a new document from the SQLite database

In [9]:
subreddits = ['datascience','gameofthrones']
data = {}
for subject in subreddits:
    data[subject] = data_processing(sql='''SELECT topicTitle,topicText,topicCategory FROM topics WHERE topicCategory = '''+"'"+subject+"'")

In [10]:
print data['datascience']['wordMatrix'][0]

[u'data', u'science', u'freelancing', u"'m", u'currently', u'masters', u'program', u'studying', u'business', u'analytics', u"'m", u'trying', u'get', u'data', u'freelancing', u'.', u"'m", u'still', u'learning', u'skill', u'set', u'typically', u'see', u'right', u"'m", u'fairly', u'proficient', u'sql', u'know', u'bit', u'r.', u'freelancers', u'find', u'jobs', u'?']


## Explore data: word frequencies

### Category 1: Data Science

#### Word Frequencies

In [11]:
wordfreqs_cat1 = nltk.FreqDist(data['datascience']['all_words'])
plt.hist(wordfreqs_cat1.values(), bins = range(10))
plt.show()

#### Most frequent words within data science posts

In [12]:
print wordfreqs_cat1.most_common(20)

[(u'.', 2833), (u',', 2831), (u'data', 1882), (u'?', 1190), (u'science', 887), (u')', 812), (u'(', 739), (u"'m", 566), (u':', 548), (u'would', 427), (u"'s", 323), (u'like', 321), (u"n't", 288), (u'get', 252), (u'know', 225), (u"'ve", 213), (u'scientist', 211), (u'!', 209), (u'work', 204), (u'job', 199)]


#### Least Frequent terms within data science posts

In [13]:
print wordfreqs_cat1.hapaxes()



### Category 2: Game of Thrones

#### Word Frequencies

In [15]:
wordfreqs_cat2 = nltk.FreqDist(data['gameofthrones']['all_words'])
plt.hist(wordfreqs_cat2.values(), bins = range(10))
plt.show()

#### Most Frequent words within Game of Thrones posts

In [16]:
print wordfreqs_cat2.most_common(20)

[(u'.', 2909), (u',', 2478), (u'[', 1422), (u']', 1420), (u'?', 1139), (u"'s", 886), (u"n't", 494), (u')', 452), (u'(', 426), (u's5', 399), (u':', 380), (u'spoilers', 332), (u'show', 325), (u'would', 311), (u"''", 305), (u'``', 276), (u'think', 248), (u'season', 244), (u'like', 243), (u'one', 238)]


#### Least Frequent words terms within game of thrones posts

In [18]:
print wordfreqs_cat2.hapaxes()

[u'hordes', u'woods', u'comically', u'pack', u'seventy-seven', u"'context", u'shaving', u'kennels', u'differently', u'screaming', u'her-', u'complainers', u'sailed', u'contributed', u'payoff', u'hallucinating', u'admiral', u's05ep07', u'\u2026\u201d', u'reports', u'golden', u'magically', u'dearie', u'insecurity', u'blacksmiths', u'backing', u'90', u'93', u'v=xnvb8yu0zqk', u'household', u'v=j0muvy9fllw', u'reviewing', u'unpaid', u'copious', u'machina', u'//imgur.com/h5mzplr', u'fanaticism', u'gaming', u'outlawed', u'shores', u'destined', u'sickening', u'baggage', u'targaryans', u'wind', u'senators', u'boltona', u'1-10', u'affect', u'fix', u'sixteen', u'silver', u'dickriders', u'barton', u"'signature", u'burial', u'preceded', u'solution', u'dubstep', u'laboratory', u'rw', u'rr', u'ha', u'akinnuoye-agbaje', u're', u'outburst', u"'bad-people", u'corrects', u'\u201cdunk\u201d', u'kashmir', u'absolved', u'potions', u'shacking', u'spinning', u'krissychula', u'170', u'service', u'legs', u'gill

In [19]:
# manual list takes almost every single character on the keyboard
manual_stopwords = [',','.',')',',','(','m',"'m","n't",'e.g',"'ve",'s','#','/','``',"'s","''",'!','r',']','=','[','s','&','%','*','...','1','2','3','4','5','6','7','8','9','10','--',"''",';','-',':']

## Stemming

In [20]:
stemmer = nltk.SnowballStemmer("english")

In [21]:
def wordStemmer(wordrow): 
    stemmed = [stemmer.stem(word) for word in wordrow]
    return stemmed  

## Process data attempt 2

In [22]:
def data_processing(sql,manual_stopwords):
    c.execute(sql) #1
    data = {'wordMatrix':[],'all_words':[]}
    interWordMatrix = []
    interWordList = []
    row = c.fetchone() #2
    while row is not None:
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+') #3 
        wordrow = tokenizer.tokenize(row[0]+" "+row[1])       
        wordrow_lowercased = lowerCaseArray(wordrow)
        wordrow_nostopwords = wordFilter(stopwords,wordrow_lowercased)
        wordrow_nostopwords = wordFilter(manual_stopwords,wordrow_nostopwords)#4
        wordrow_stemmed = wordStemmer(wordrow_nostopwords)
        interWordList.extend(wordrow_stemmed)#5
        interWordMatrix.append(wordrow_stemmed)#6
        row = c.fetchone()#7
        
    wordfreqs = nltk.FreqDist(interWordList)#8
    hapaxes = wordfreqs.hapaxes()#9
    for wordvector in interWordMatrix:
        wordvector_nohapexes = wordFilter(hapaxes,wordvector) #10
        data['wordMatrix'].append(wordvector_nohapexes)
        data['all_words'].extend(wordvector_nohapexes)
        
    return data
#1 create pointer to the sqlite data
#2 fetch the data one by one
#3 row[0] is the title, row[1] is the topic text, we turn them into a single text blob
#4 remove manual stopword
#5 temporary wordlist used to remove hapaxes later on
#6 temporary wordmatrix, will become final wordmatrix after hapaxes removal
#7 get a new topic
#8 make a frequency distribution of all terms
#9 get the list of hapaxes
#10 remove the hapaxes

In [23]:
for subject in subreddits:
    data[subject] = data_processing(sql='''SELECT topicTitle,topicText,topicCategory FROM topics WHERE topicCategory = '''+"'"+subject+"'",manual_stopwords=manual_stopwords)

## Explore data: word frequencies (attempt 2)

### Category 1

### Top 20 most common "Data Science" term after more intense data cleansing  

In [36]:
wordfreqs_cat1 = nltk.FreqDist(data['datascience']['all_words'])
print wordfreqs_cat1.most_common(20)

[(u'data', 1971), (u'scienc', 955), (u'would', 418), (u'work', 368), (u'use', 347), (u'program', 343), (u'learn', 342), (u'like', 341), (u'get', 325), (u'scientist', 310), (u'job', 268), (u'cours', 265), (u'look', 257), (u'know', 239), (u'statist', 228), (u'want', 225), (u've', 223), (u'python', 205), (u'year', 204), (u'time', 196)]


### Hapaxes & distribution plot

In [34]:
hapaxes_cat1 = wordfreqs_cat1.hapaxes()
plt.hist(wordfreqs_cat1.values(), bins = range(20))
plt.show()

In [35]:
hapaxes_cat1

[]

### Category 2

### Top 20 most common "Game of Thrones" term after more intense data cleansing  

In [32]:
wordfreqs_cat2 = nltk.FreqDist(data['gameofthrones']['all_words'])
print wordfreqs_cat2.most_common(20)

[(u's5', 426), (u'spoiler', 374), (u'show', 362), (u'episod', 300), (u'think', 289), (u'would', 287), (u'season', 286), (u'like', 282), (u'book', 271), (u'one', 249), (u'get', 236), (u'sansa', 232), (u'scene', 216), (u'cersei', 213), (u'know', 192), (u'go', 188), (u'king', 183), (u'throne', 181), (u'see', 177), (u'charact', 177)]


### Hapaxes & distribution plot

In [33]:
hapaxes_cat2 = wordfreqs_cat2.hapaxes()
plt.hist(wordfreqs_cat2.values(), bins = range(50))
plt.show()

In [28]:
hapaxes_cat2

[]

## Create labeled data

In [39]:
holdoutLength  = 100
#labeled data
labeled_data1 = [(word,'datascience') for word in data['datascience']['wordMatrix'][holdoutLength:]]
labeled_data2 = [(word,'gameofthrones') for word in data['gameofthrones']['wordMatrix'][holdoutLength:]]
labeled_data = []
labeled_data.extend(labeled_data1)
labeled_data.extend(labeled_data2)
#create holdout
holdout_data = data['datascience']['wordMatrix'][:holdoutLength]
holdout_data.extend(data['gameofthrones']['wordMatrix'][:holdoutLength])
holdout_data_labels = ([('datascience') for _ in xrange(holdoutLength)] + [('gameofthrones') for _ in xrange(holdoutLength)])

In [40]:
print len(labeled_data)
print len(holdout_data_labels)
print labeled_data[20]

1647
200
([u'often', u'work', u'normal', u'data', u'vs', u'straight', u'quantiti', u'reason', u'ask', u'seem', u'blog', u'exampl', u'internet', u'see', u'data', u'analysi', u'data', u'scienc', u'usual', u'show', u'trivial', u'exampl', u'obtain', u'straight', u'quantiti', u'make', u'basic', u'visual', u'work', u'data', u'analyst', u'manufactur', u'work', u'warranti', u'claim', u'quantiti', u'howev', u'compar', u'one', u'model', u'versus', u'anoth', u'model', u'use', u'straight', u'quantiti', u'meaningless', u'compar', u'model', u'order', u'make', u'fair', u'comparison', u'normal', u'data', u'divid', u'number', u'warranti', u'claim', u'defect', u'number', u'vehicl', u'sold', u'metric', u'call', u'defect', u'rate', u'warranti', u'claim', u'rate', u'know', u'manufactur', u'facil', u'use', u'term', u'percent', u'non', u'conform', u'know', u'year', u've', u'seen', u'data', u'analyst', u'make', u'mistak', u'use', u'straight', u'quantiti', u'order', u'make', u'comparison', u'ipython', u'notebo

In [41]:
data['datascience']['all_words_dedup'] = list(OrderedDict.fromkeys(data['datascience']['all_words']))
data['gameofthrones']['all_words_dedup'] = list(OrderedDict.fromkeys(data['gameofthrones']['all_words']))

In [42]:
all_words = []
all_words.extend(data['datascience']['all_words_dedup'])
all_words.extend(data['gameofthrones']['all_words_dedup'])

In [43]:
all_words_dedup = list(OrderedDict.fromkeys(all_words))

In [44]:
prepared_data = [({word: (word in x[0]) for word in all_words_dedup}, x[1]) for x in labeled_data]
prepared_holdout_data = [({word: (word in x[0]) for word in all_words_dedup}) for x in holdout_data]

In [45]:
print prepared_data[0]

({u'sunspear': False, u'profici': False, u'pardon': False, u'selyes': False, u'four': False, u'davo': False, u'sleev': False, u'sleep': False, u'nwb': False, u'captain': False, u'hate': False, u'rhaegar': False, u'mahout': False, u'accus': False, u'accur': False, u'sorri': False, u'midseason': False, u'ozzi': False, u'illustr': False, u'concret': False, u'matlab': False, u'lore': False, u'lord': False, u'hedg': False, u'lora': False, u'worth': False, u'merchant': False, u'digit': False, u'everi': False, u'risk': False, u'tudor': False, u'void': False, u'rise': False, u'lurk': False, u'voic': False, u'stolen': False, u'govern': False, u'appar': False, u'rare': False, u'quantit': False, u'school': False, u'traumat': False, u'scholar': False, u'showcas': False, u'duncan': False, u'upload': False, u'correl': False, u'red': False, u'verif': False, u'cmu': False, u'obsess': False, u'garden': False, u'enjoy': False, u'bill': False, u'franc': False, u'naiv': False, u'direct': False, u'histor':

In [46]:
random.shuffle(prepared_data)
train_size = int(len(prepared_data) * 0.75)
train = prepared_data[:train_size]
test = prepared_data[train_size:]

# Text Analytics

## Naive Bayes

In [47]:
nbr_got = 0
nbr_data = 0
for set in train:
    if set[1]=='datascience':
        nbr_data+=1
    if set[1]=='gameofthrones':
        nbr_got+=1
print nbr_got
print nbr_data

720
762


In [48]:
classifier  = nltk.NaiveBayesClassifier.train(train)

In [49]:
nltk.classify.accuracy(classifier, test)

0.9818181818181818

In [50]:
print(classifier.show_most_informative_features(20))

Most Informative Features
                    data = True           datasc : gameof =    365.1 : 1.0
                   scene = True           gameof : datasc =     63.8 : 1.0
                  season = True           gameof : datasc =     62.4 : 1.0
                    king = True           gameof : datasc =     47.6 : 1.0
                      tv = True           gameof : datasc =     45.1 : 1.0
                    kill = True           gameof : datasc =     31.5 : 1.0
                 compani = True           datasc : gameof =     28.5 : 1.0
                 analysi = True           datasc : gameof =     27.1 : 1.0
                 process = True           datasc : gameof =     25.5 : 1.0
                   appli = True           datasc : gameof =     25.5 : 1.0
                research = True           datasc : gameof =     23.2 : 1.0
                  episod = True           gameof : datasc =     22.2 : 1.0
                  market = True           datasc : gameof =     21.7 : 1.0

In [51]:
classified_data = classifier.classify_many(prepared_holdout_data)

In [52]:
cm = nltk.ConfusionMatrix(holdout_data_labels, classified_data)
print cm

              |     g |
              |     a |
              |  d  m |
              |  a  e |
              |  t  o |
              |  a  f |
              |  s  t |
              |  c  h |
              |  i  r |
              |  e  o |
              |  n  n |
              |  c  e |
              |  e  s |
--------------+-------+
  datascience |<77>23 |
gameofthrones |  5<95>|
--------------+-------+
(row = reference; col = test)



## Decision trees

In [53]:
classifier2 = nltk.DecisionTreeClassifier.train(train)

In [54]:
nltk.classify.accuracy(classifier2, test)

0.9333333333333333

In [59]:
print(classifier2.pseudocode(depth=4))

if data == False: 
  if learn == False: 
    if python == False: 
      if tool == False: return 'gameofthrones'
      if tool == True: return 'datascience'
    if python == True: return 'datascience'
  if learn == True: 
    if go == False: 
      if wrong == False: return 'datascience'
      if wrong == True: return 'gameofthrones'
    if go == True: 
      if upload == False: return 'gameofthrones'
      if upload == True: return 'datascience'
if data == True: return 'datascience'



In [56]:
classified_data2 = classifier2.classify_many(prepared_holdout_data)

In [57]:
cm = nltk.ConfusionMatrix(holdout_data_labels, classified_data2)
print cm

              |     g |
              |     a |
              |  d  m |
              |  a  e |
              |  t  o |
              |  a  f |
              |  s  t |
              |  c  h |
              |  i  r |
              |  e  o |
              |  n  n |
              |  c  e |
              |  e  s |
--------------+-------+
  datascience |<26>74 |
gameofthrones |  2<98>|
--------------+-------+
(row = reference; col = test)

