In [None]:
"""
Last amended: 24th June, 2020
My folder: C:\Users\ashok\OneDrive\Documents\sentiment_analysis
Virtual Machine: lubuntu_machinelearning_I

Objective:
    Text clustering of wiki documents

About our text files:
    Our files are on following subjects:
            1. Quantum Mechanics (file name has: q)
            2. Religion          (file name has: r)
            3. Legal             (file name has: l)
            4. Psychology        (file name has: p)
    Total text files 12

TODO
    HASHVECTORIZER

"""

In [4]:
###################### 1. Call libraries #####################
# 1.0 Clear memory
%reset -f
# 1.1 Array and data-manipulation libraries
import numpy as np
import pandas as pd

# 1.2 sklearn modeling libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# 1.3 Text processing module
import re

# 1.4 Miscellenous
import os

In [5]:
# 1.4 IPython Notebook cell multiple outputs
#     https://stackoverflow.com/a/42476224/3282777
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# 2.0 Understanding cleaning of text using 
#     regular expressions:

## 2.1 Regular Expression usage through re module:
#      Raw string notations are generally used:
#       '\n' is a single char--newline
#       r'\n' a 2-char string of \ and n

# 2.2 Replace bracketed numbers with space
x = "[8]OK good[6] [6] [5]done"
result= re.sub(r'[\[0-9\]]',' ', x)
result

# 2.3 Remove newlines
x = "OK \n good\n  \ndone"
result= re.sub('\n',' ', x)
result= re.sub(r'\n',' ', x)
result

# 2.5 Remove apostrophe
x= "Planck's solution"
x="After that it's just a matter "
result= re.sub('\'s',' ', x)      # Either this
result= re.sub('[\'s]',' ', x)    # Or this  
result

# 2.6 Remove html tags
#     https://stackoverflow.com/a/12982689/3282777
#     https://stackoverflow.com/a/3075150/3282777
x = " <title>Cultural universal</title>      <ns> </ns>      <id>       </id>      <revision>        <id>         </id>        <parentid>         </parentid>        <timestamp>    -  -  T  :  :  Z</timestamp>"

# 2.6.1 Compiling creates a pattern object
#       A pattern object also has its own methods/attribtes
clean_greedy = re.compile('<.*>') # This object is greedy
clean = re.compile('<.*?>')       # This object is not greedy
re.sub(clean, "", x)
re.sub(clean_greedy, "", x)

'   OK good           done'

'OK   good    done'

'After that it   ju t a matter '

' Cultural universal                                                                        -  -  T  :  :  Z'

' '

In [8]:
# 3.0 Read files as text streams and also clean them
#     https://docs.python.org/3/library/io.html#text-i-o

pathToclusteringFiles="D:\\data\\OneDrive\\Documents\\GitHub\\Clustering\\text_clustering\\textclustering_datasets"

os.chdir(pathToclusteringFiles)
os.listdir()
len(os.listdir())   # 12 txt files

# 3.1 Experiment: Understanding a text-stream
#     Python creates a text-stream when reading
#     a text file. That is, rather than reading
#     a file in one go, one can read line-by-line
#     or a specific number of chars at a time

filelist = os.listdir()

# 3.1.1 Create a text stream or an iterator that outputs text
#       on demand
text_stream = open(filelist[0], "r",  encoding="utf8")
type(text_stream)      # TextIOWrapper

# 3.1.2 Some attributes/methods
text_stream.encoding
text_stream.read(10)   # Read at most 'size' characters
                       #   from stream as a single str
                       #    If size = -1, all chars are read
text_stream.read(10)   # Read next 'size' characters

# 3.1.3 It also behaves as iterable
text_stream = open(filelist[0], "r",  encoding="utf8")
t = text_stream.__iter__()
next(t)

# 3.1.4 Read few lines
text_stream = open(filelist[0], "r",  encoding="utf8")
text_stream.readlines(1)   # Read until newline or EOF and return
                           # a single str within a list
text_stream.readlines(1)   # Read next line or if nothing, return empty string

# 3.1.5 Read all lines as a list
text_stream = open(filelist[0], "r",  encoding="utf8")
out = text_stream.readlines()
out
type(out)    # list
len(out)     # 3

['l1_q.txt',
 'l2_r.txt',
 'l3_l.txt',
 'l4_p.txt',
 'l5_l.txt',
 'l6_q.txt',
 'l7_l.txt',
 'l8_r.txt',
 'q91_p.txt',
 'q92_q.txt',
 'q93_l.txt',
 'q9_q.txt']

12

_io.TextIOWrapper

'utf8'

'\ufeffThere is '

'some confu'

'\ufeffThere is some confusion regarding the relationship between the raising and lowering ladder operators and the creation and annihilation operators commonly used in quantum field theory. The creation operator ai† increments the number of particles in state i, while the corresponding annihilation operator ai decrements the number of particles in state i. This clearly satisfies the requirements of the above definition of a ladder operator: the incrementing or decrementing of the eigenvalue of another operator (in this case the particle number operator).\n'

['\ufeffThere is some confusion regarding the relationship between the raising and lowering ladder operators and the creation and annihilation operators commonly used in quantum field theory. The creation operator ai† increments the number of particles in state i, while the corresponding annihilation operator ai decrements the number of particles in state i. This clearly satisfies the requirements of the above definition of a ladder operator: the incrementing or decrementing of the eigenvalue of another operator (in this case the particle number operator).\n']

['\n',
 'Confusion arises because the term ladder operator is typically used to describe an operator that acts to increment or decrement a quantum number describing the state of a system. To change the state of a particle with the creation/annihilation operators of QFT requires the use of both an annihilation operator to remove a particle from the initial state and a creation operator to add a particle to the final state.']

['\ufeffThere is some confusion regarding the relationship between the raising and lowering ladder operators and the creation and annihilation operators commonly used in quantum field theory. The creation operator ai† increments the number of particles in state i, while the corresponding annihilation operator ai decrements the number of particles in state i. This clearly satisfies the requirements of the above definition of a ladder operator: the incrementing or decrementing of the eigenvalue of another operator (in this case the particle number operator).\n',
 '\n',
 'Confusion arises because the term ladder operator is typically used to describe an operator that acts to increment or decrement a quantum number describing the state of a system. To change the state of a particle with the creation/annihilation operators of QFT requires the use of both an annihilation operator to remove a particle from the initial state and a creation operator to add a particle to the final state.']

list

3

In [9]:
# 3.2
lines = []
for i in os.listdir():
    # 3.2.1 Create text-stream
    text_file = open(i, "r",  encoding="utf8")
    # 3.2.2 Read complete file in a list of strings
    tx = text_file.readlines()
    # 3.2.3 Join all strings in the list
    tx = " ".join(tx)
    # 3.2.4 Create pattern object to remove html tags
    clean = re.compile('<.*?>')  #
    tx = re.sub(clean, '', tx)

    # 3.2.5 Replace bracketed numbers with space
    tx= re.sub(r'[\[0-9\]]',' ', tx)

    # 3.2.6
    tx= re.sub('\n',' ', tx)    # Remove newlines
    tx= re.sub('\'s',' ', tx)   # Remove apostrophes
    tx= re.sub('\'s',' ', tx)

    # 3.2.7 Remove URLs. In MULTILINE mode also matches immediately
    #       after each newline.
    tx = re.sub(r'^https?:\/\/.*[\r\n]*', '', tx, flags=re.MULTILINE)

    # 3.2.8 Other Miscellenous
    tx = re.sub('[*|\(\)\{\}]', " ",tx)
    tx = re.sub('[=]*', "",tx)

    # 3.2.9 Tags may take such forms also
    #       < == &lt;   > == &gt;
    clean = re.compile('&lt;')
    tx = re.sub(clean, '', tx)
    clean = re.compile('&gt;')
    tx = re.sub(clean, '', tx)
    clean = re.compile('&quot;')
    tx = re.sub(clean, '', tx)

    # 3.2.10 Finally append  this file
    #        to our list as a cleaned string
    lines.append(tx)

# 3.3 So what do we have
type(lines)     # list
lines
len(lines)      # 12; same as number of files

list

['\ufeffThere is some confusion regarding the relationship between the raising and lowering ladder operators and the creation and annihilation operators commonly used in quantum field theory. The creation operator ai† increments the number of particles in state i, while the corresponding annihilation operator ai decrements the number of particles in state i. This clearly satisfies the requirements of the above definition of a ladder operator: the incrementing or decrementing of the eigenvalue of another operator  in this case the particle number operator .    Confusion arises because the term ladder operator is typically used to describe an operator that acts to increment or decrement a quantum number describing the state of a system. To change the state of a particle with the creation/annihilation operators of QFT requires the use of both an annihilation operator to remove a particle from the initial state and a creation operator to add a particle to the final state.',
 '\ufeffReligio

12

In [10]:
# 3.5 Clustering text documents
## One Way: Just use 'tf' and not 'idf'

#  3.5.1 Convert a collection of text documents to a matrix
#        of token counts. This implementation produces a
#        sparse representation of the counts using
#        scipy.sparse.csr_matrix.
#        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
vec = CountVectorizer()
matrix = vec.fit_transform(lines)
matrix
matrix.shape   # (12, 1466)

# 3.5.2 Let us see this sparse matrix in a dataframe
#       We have token-counts
pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())

<12x1466 sparse matrix of type '<class 'numpy.int64'>'
	with 2200 stored elements in Compressed Sparse Row format>

(12, 1466)

Unnamed: 0,about,above,abstract,academic,accept,accepted,access,accessible,according,accounts,...,world,worldwide,would,writing,written,yalom,year,yoga,yvonne,zero
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,0,0,0,0,0,...,4,1,0,0,0,0,0,0,0,0
2,0,1,0,0,1,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
3,0,0,0,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,0,1,0,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
8,3,0,0,0,0,0,0,0,0,0,...,3,0,1,0,0,2,0,0,1,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
## 3.6 Better way
#      Use both tf and idf
#      https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vec = TfidfVectorizer(use_idf=True,  stop_words='english')
matrix = vec.fit_transform(lines)
matrix.shape   # (12, 1311)  # Stop words have been removed

# 3.6.1 Have a look at our tf-idf values
pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())

(12, 1311)

Unnamed: 0,abstract,academic,accept,accepted,access,accessible,according,accounts,accurately,accused,...,workings,world,worldwide,writing,written,yalom,year,yoga,yvonne,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.052695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.061358,0.167124,0.061358,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.035079,0.030126,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.023887,0.0,0.030126,0.035079,0.0,0.0,0.0,0.0,0.0
3,0.0,0.106887,0.0,0.0,0.0,0.0,0.0,0.06223,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.025826,0.0,0.0,0.051652,0.0,0.0,0.030072,0.0,0.025826,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090216
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.039636,0.0,0.0,0.0,0.046152,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.037056,0.0,0.0,0.0,0.03628,0.0,0.0,0.01814,0.0
9,0.070213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070213,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# 3.7 Finally start clustering
number_of_clusters=  4   # As many as subjects of documents in our corpus

# 3.71. Instantiate KMeans object
km = KMeans(
            n_clusters=number_of_clusters,
            max_iter=500
            )

# 3.72 Train our model
km.fit(matrix)

# 3.7.3
km.labels_

# 3.7.4
km.inertia_   # 6.731

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

array([2, 1, 0, 3, 0, 2, 0, 1, 3, 2, 0, 2])

6.731785416672499

In [25]:
re.split(r'_','l1_q.txt')

# 4.0 Let us arrange our results
#     Put filenames and cluster-labels at one place

# 4.1 First, we modify filenames. Example:
re.split(r'_','l1_q.txt')    # Just see what happens

# 4.2
results = pd.DataFrame()
modified_filenames = []
for i in os.listdir():
    modified_filenames.append(re.split(r'_',i)[1])

# 4.2.1
modified_filenames

# 4.2.2
results['text_file'] = modified_filenames
results['cluster_label'] = km.labels_
results

# 4.2.3 Sort on 'text' column to
#       clearly see match of cluster
#       labels with filenames
results.sort_values('text_file')
print("Text clustering is perfect. Files on the same subject")
print("(l)aw,(p)sychology,(q)uantum-mechanics and (r)eligion")
print ("are in the same cluster")

['l1', 'q.txt']

['l1', 'q.txt']

['q.txt',
 'r.txt',
 'l.txt',
 'p.txt',
 'l.txt',
 'q.txt',
 'l.txt',
 'r.txt',
 'p.txt',
 'q.txt',
 'l.txt',
 'q.txt']

Unnamed: 0,text_file,cluster_label
0,q.txt,2
1,r.txt,1
2,l.txt,0
3,p.txt,3
4,l.txt,0
5,q.txt,2
6,l.txt,0
7,r.txt,1
8,p.txt,3
9,q.txt,2


Unnamed: 0,text_file,cluster_label
2,l.txt,0
4,l.txt,0
6,l.txt,0
10,l.txt,0
3,p.txt,3
8,p.txt,3
0,q.txt,2
5,q.txt,2
9,q.txt,2
11,q.txt,2


Text clustering is perfect. Files on the same subject
(l)aw,(p)sychology,(q)uantum-mechanics and (r)eligion
are in the same cluster


## Understanding Regular Expressions
Refer: https://docs.python.org/3/library/re.html <br>
       https://docs.python.org/3/howto/regex.html#regex-howto<br>
Regular expressions can contain both special and ordinary characters. Most ordinary characters, 
like 'A', 'a', or '0', are the simplest regular expressions; they simply match themselves. You can concatenate ordinary characters, so last matches the string 'last'.<br>

  .<br>(Dot.) Matches any character except a newline.<br>
  ^<br>(Caret.) Matches start of the string<br>
  $<br>(Dollar) Matches end of the string<br>
  \*<br>(Star) Causes the resulting RE to match 0 or more repetitions. ab* will match ‘a’, ‘ab’, or ‘a’ followed by any number of ‘b’s.<br>
  \+<br>Causes the resulting RE to match 1 or more repetitions of the preceding RE. ab+ will match ‘a’ followed by any non-zero number of ‘b’s <br>
  ?<br>Causes the resulting RE to match 0 or 1 repetitions of the preceding RE.<br>
  []<br>Used to indicate a set of characters. In a set: Characters can be listed individually, e.g. [amk] will match 'a', 'm', or 'k'. Ranges of characters can be indicated by giving two characters and separating them by a '-', for example [a-z] will match any lowercase ASCII letter, [0-5][0-9] will match all the two-digits numbers from 00 to 59, and [0-9A-Fa-f] will match any hexadecimal digit. Special characters lose their special meaning inside sets. For example, [(+*)] will match any of the literal characters  '(', '+', '*', or ')'. Character classes such as \w or \S are also accepted inside a set, Characters that are not within a range can be matched by complementing the set. If the first character of the set is '^', all the characters that are not in the set will be matched. For example, [^5] will match any character except '5'. ^ has no special meaning if it’s not the first character in the set.  To match a literal ']' inside a set, precede it with a backslash, or place it at the beginning of the set. <br>
{m,n}<br>
    Causes resulting RE to match from m to n repetitions of the preceding RE, attempting to match as many repetitions as possible. For example, a{3,5} will match from 3 to 5 'a' characters. Omitting m specifies a lower bound of zero, and omitting n specifies an infinite upper bound. <br>

Understanding Greedy vs non-greedy search<br>
Ref: https://stackoverflow.com/a/3075150/3282777<br>
  >Let your string input be: 101000000000100. <br>
  >Using 1.\*1, * is greedy - it will match all the way to the end, and then backtrack until it can match 1, leaving you with 1010000000001. <br>
  >.*? is non-greedy. * will match nothing, but then will try to match extra characters until it matches 1, eventually matching 101. <br>


In [None]:
################## Done ###############