# Dezeen Mining

## Import text and tokenize

In [7]:
import re, string
from itertools import chain
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
from nltk import bigrams

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token for token in tokens]
    return tokens

#stopwords
with open("stopwords.txt", 'rb') as f:
    twitterstop = [word.strip() for word in f.readlines()]
stop = stopwords.words('english') + list(string.punctuation) + twitterstop


#load the posts
with open("dezeentech_allposts.txt", "rb") as d:
    dezeenpoststxt = ''.join(str(d.readlines())
wordlist = dezeenpoststext.split()
wordlist_lower = [x.lower() for x in wordlist]

TypeError: sequence item 0: expected str instance, bytes found

----------------
## WORD FREQUENCY
----------------

### Count All words

In [1]:
total_words = len(wordlist_lower)
print('Total number of words in the collection: ', total_words)

NameError: name 'wordlist_lower' is not defined

### remove stop and other words (stemming)

In [None]:
porter = nltk.PorterStemmer()
meaningful_list = [porter.stem(term) for term in wordlist_lower if term not in stop and not term.startswith('http') and len(term)>2]
print('Total number of meaningful words (without stopwords): ', len(meaningful_list))

### Count terms only once, equivalent to Document Frequency

In [None]:
terms_single = set(meaningful_list)
print('Number of unique terms: ', len(terms_single))

### Word frequency for all terms

In [None]:
wordfreq = FreqDist(meaningful_list)
print('The 200 most frequent terms, including special terms: ', wordfreq.most_common(200))

### Word frequency for terms only

In [None]:
print('The 200 most frequent terms (terms only): ', termonlyfreq.most_common(200))

## PLOT RESTULTS

In [None]:
import numpy as np
import matplotlib.pyplot as plt

popularwords = termonlyfreq.most_common(100)

labels, values = zip(*popularwords.items())
# sort your values in descending order
indSort = np.argsort(values)[::-1]
# rearrange your data
labels = np.array(labels)[indSort]
values = np.array(values)[indSort]
indexes = np.arange(len(labels))
bar_width = 0.35
plt.bar(indexes, values)
# add labels
plt.xticks(indexes + bar_width, labels)
plt.show()

# author_names = counter.keys()
# author_counts = counter.values()
#
# # Plot histogram using matplotlib bar().
# indexes = np.arange(len(author_names))
# width = 0.7
# plt.bar(indexes, author_counts, width)
# plt.xticks(indexes + width * 0.5, author_names)
# plt.show()

## Keyword extraction

Using the RAKE algorithm [Python implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm] (https://github.com/zelandiya/RAKE-tutorial)

In [None]:
import rake, operator
rake_object = rake.Rake("SmartStoplist.txt", 4, 2, 4) # words of minimum length 4, in groups of maximum 2, occurring at least 3 times in the text; this happens to give the best results with the particular corpus

keywords = rake_object.run(tweets)
#write the list of tuples to a file:
outfile = open('keyword_extractedkwords.txt', 'w')
for item in keywords:
  keyword = item[0]
  relevance = item[1]
  try:
    outfile.write(str(keyword.decode('utf-8'))+' '+str(relevance)+'\n')
  except UnicodeEncodeError:
    outfile.write(str(keyword)+' '+str(relevance)+'\n')

Using the [keyword parser script](https://github.com/naushadzaman/keyword-extraction-from-tweets) (made for twitter specifically) previously used on tweets as well.

In [None]:
tweets = '' # tweets are an empty string
for item in tweets_l:
  terms_only = [term for term in preprocess(unicode(item, errors='ignore')) if term not in stop]
  for terms in terms_only:
    tweets = tweets + ' ' + terms

import keyword_extraction_w_parser
keywords2=keyword_extraction_w_parser.get_keywords(tweets)
outfile = open('keywords_tweets_extractedkwords.txt', 'w')
for item in keywords2:
  try:
    outfile.write(str(item.decode('utf-8'))+'\n')
  except UnicodeEncodeError:
    outfile.write(str(item)+'\n'

----------------
## N - GRAMS
----------------

In [None]:
def find_ngrams(input_list, n):
  return zip(*[input_list[i:] for i in range(n)])

find_ngrams(wordlist, 3)