# Basic text mining reading process

In [1]:
# Load the required libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: four short documents used to demonstrate text vectorization.
# NOTE: "documnet" in the second entry is misspelled; it is kept as-is because
# the recorded outputs in this notebook treat it as its own vocabulary term.
corpus = [
    "This is the first document.",
    "This is the second documnet.",
    "Third document. Document number three",
    "Number four. To repeat, number four",
]

In [3]:
# Initialize a CountVectorizer with its default settings (no arguments passed)
vectorizer = CountVectorizer()

In [4]:
# Fit the vectorizer on the corpus and transform the corpus into a sparse
# document-term count matrix in a single step
bag_of_words = vectorizer.fit_transform(corpus)

In [5]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
bag_of_words

<4x13 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

The matrix is 4x13: 4 documents (rows) and a vocabulary of 13 unique words (columns).

In [6]:
# Print the stored (non-zero) entries of "bag_of_words";
# each line reads: (document index, word ID)  count
print(bag_of_words)

  (0, 10)	1
  (0, 4)	1
  (0, 8)	1
  (0, 2)	1
  (0, 0)	1
  (1, 10)	1
  (1, 4)	1
  (1, 8)	1
  (1, 7)	1
  (1, 1)	1
  (2, 0)	2
  (2, 9)	1
  (2, 5)	1
  (2, 11)	1
  (3, 5)	2
  (3, 3)	2
  (3, 12)	1
  (3, 6)	1


In [7]:
# Look up the ID that corresponds to a particular word in the fitted
# vectorizer's vocabulary; here the word "document" corresponds to ID = 0
vectorizer.vocabulary_.get('document')

0

In [8]:
# Show the whole vocabulary: a dict mapping every word to its ID
vectorizer.vocabulary_

{'this': 10,
 'is': 4,
 'the': 8,
 'first': 2,
 'document': 0,
 'second': 7,
 'documnet': 1,
 'third': 9,
 'number': 5,
 'three': 11,
 'four': 3,
 'to': 12,
 'repeat': 6}

In [9]:
# Convert the sparse bag_of_words matrix into a dense DataFrame with one row
# per document and one column per vocabulary word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and yields the same column labels.
pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,document,documnet,first,four,is,number,repeat,second,the,third,this,three,to
0,1,0,1,0,1,0,0,0,1,0,1,0,0
1,0,1,0,0,1,0,0,1,1,0,1,0,0
2,2,0,0,0,0,1,0,0,0,1,0,1,0
3,0,0,0,2,0,2,1,0,0,0,0,0,1


#### Here each row represents one document in the corpus and each column represents one unique word in the vocabulary; each cell holds the number of times that word occurs in that document

### TF-IDF (Term Frequency - Inverse Document Frequency)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Initialize a TfidfVectorizer with its default settings (no arguments passed)
vectorizer_tf = TfidfVectorizer()

In [12]:
# Fit the TF-IDF vectorizer on the corpus and transform the corpus into a
# sparse matrix of TF-IDF scores in a single step
bag_of_words_tf = vectorizer_tf.fit_transform(corpus)

In [13]:
# Print the stored (non-zero) entries; each line reads:
# (document index, word ID)  TF-IDF score
print(bag_of_words_tf)

  (0, 0)	0.4222466008506261
  (0, 2)	0.5355662725381126
  (0, 8)	0.4222466008506261
  (0, 4)	0.4222466008506261
  (0, 10)	0.4222466008506261
  (1, 1)	0.5086718718935652
  (1, 7)	0.5086718718935652
  (1, 8)	0.401042746469996
  (1, 4)	0.401042746469996
  (1, 10)	0.401042746469996
  (2, 11)	0.4424621378947393
  (2, 5)	0.348842231691988
  (2, 9)	0.4424621378947393
  (2, 0)	0.697684463383976
  (3, 6)	0.3432724906138499
  (3, 12)	0.3432724906138499
  (3, 3)	0.6865449812276998
  (3, 5)	0.5412799489419371


###### Each (document index, word ID) pair is associated with its TF-IDF score

In [14]:
# Convert the sparse bag_of_words_tf matrix into a dense DataFrame with one
# row per document and one column per vocabulary word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and yields the same column labels.
pd.DataFrame(bag_of_words_tf.toarray(), columns=vectorizer_tf.get_feature_names_out())

Unnamed: 0,document,documnet,first,four,is,number,repeat,second,the,third,this,three,to
0,0.422247,0.0,0.535566,0.0,0.422247,0.0,0.0,0.0,0.422247,0.0,0.422247,0.0,0.0
1,0.0,0.508672,0.0,0.0,0.401043,0.0,0.0,0.508672,0.401043,0.0,0.401043,0.0,0.0
2,0.697684,0.0,0.0,0.0,0.0,0.348842,0.0,0.0,0.0,0.442462,0.0,0.442462,0.0
3,0.0,0.0,0.0,0.686545,0.0,0.54128,0.343272,0.0,0.0,0.0,0.0,0.0,0.343272
