In [8]:
import re # regular expression processing
import nltk # nltk for conversion of raw Texts
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer #TF-IDF library

In [9]:
# Toy training text: the same two sentences repeated twice, mixing number
# words with their digits. It is split into sentences and vectorised below.
paragraph_train = """One 1 two 2 three 3 four 4 five 5. Six 6 Seven 7 Eight 8 Nine 9 Ten 10. 
One 1 two 2 three 3 four 4 five 5. Six 6 Seven 7 Eight 8 Nine 9 Ten 10."""

In [10]:
# Build two parallel, per-sentence views of the training paragraph:
#   sentences[i] - the sentence re-joined from NLTK word tokens (casing kept)
#   corpus[i]    - the normalised text fed to TF-IDF: non-alphanumerics replaced
#                  by spaces, lower-cased, English stop words removed, and each
#                  remaining word Porter-stemmed
ps = PorterStemmer()
# Build the stop-word set once; rebuilding it for every word is wasted work.
stop_words = set(stopwords.words('english'))
sentences = nltk.sent_tokenize(paragraph_train)
corpus = []
for i, sentence in enumerate(sentences):
    review = re.sub('[^a-zA-Z0-9]', ' ', sentence)  # keep only letters and digits
    review = review.lower().split()  # lower-case, then split into a list of words
    words = nltk.word_tokenize(sentence)
    review = [ps.stem(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))
    # Overwrite the raw sentence with its space-joined tokens (same list length,
    # so updating in place while enumerating is safe).
    sentences[i] = ' '.join(words)

In [11]:
# Display the training sentences after word-tokenising and re-joining with spaces.
sentences

['One 1 two 2 three 3 four 4 five 5 .',
 'Six 6 Seven 7 Eight 8 Nine 9 Ten 10 .',
 'One 1 two 2 three 3 four 4 five 5 .',
 'Six 6 Seven 7 Eight 8 Nine 9 Ten 10 .']

In [12]:
# Display the cleaned, lower-cased, stemmed training sentences used as TF-IDF input.
corpus

['one 1 two 2 three 3 four 4 five 5',
 'six 6 seven 7 eight 8 nine 9 ten 10',
 'one 1 two 2 three 3 four 4 five 5',
 'six 6 seven 7 eight 8 nine 9 ten 10']

# TF-IDF creation for training data

In [13]:
# Fit a TF-IDF vectorizer with default settings on the training corpus and
# encode that corpus in one step; `cv` is reused later to transform test text.
# NOTE: the default token_pattern only keeps tokens of two or more characters,
# so the single digits 1-9 in the corpus do not become features (only '10' does).
cv = TfidfVectorizer()
X = cv.fit_transform(corpus)

In [14]:
# Vocabulary learned during fitting, in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, so wrap it in
# list() to keep the plain-list display.
list(cv.get_feature_names_out())

['10',
 'eight',
 'five',
 'four',
 'nine',
 'one',
 'seven',
 'six',
 'ten',
 'three',
 'two']

In [15]:
# Convert the TF-IDF result to a dense array for display
# (one row per training sentence, one column per feature name).
X.toarray()

array([[0.        , 0.        , 0.4472136 , 0.4472136 , 0.        ,
        0.4472136 , 0.        , 0.        , 0.        , 0.4472136 ,
        0.4472136 ],
       [0.40824829, 0.40824829, 0.        , 0.        , 0.40824829,
        0.        , 0.40824829, 0.40824829, 0.40824829, 0.        ,
        0.        ],
       [0.        , 0.        , 0.4472136 , 0.4472136 , 0.        ,
        0.4472136 , 0.        , 0.        , 0.        , 0.4472136 ,
        0.4472136 ],
       [0.40824829, 0.40824829, 0.        , 0.        , 0.40824829,
        0.        , 0.40824829, 0.40824829, 0.40824829, 0.        ,
        0.        ]])

# TF-IDF creation for testing data

In [16]:
# Test text: reuses words from the training paragraph plus "Tenth", which is
# not in the fitted vocabulary.
paragraph_test = "One 1 two 2 three 3 Tenth"

In [17]:
# Preprocess the test paragraph exactly as the training paragraph was
# preprocessed, so its tokens line up with the fitted vocabulary.
# NOTE: this rebinds `sentences` and `corpus`, replacing the training values;
# the cells that follow read the test data from these names.
ps = PorterStemmer()
# Build the stop-word set once; rebuilding it for every word is wasted work.
stop_words = set(stopwords.words('english'))
sentences = nltk.sent_tokenize(paragraph_test)
corpus = []
for i, sentence in enumerate(sentences):
    # Keep letters and ALL digits. The character class was previously 1-9,
    # which stripped '0' (turning e.g. "10" into "1") and diverged from the
    # 0-9 class used when preparing the training corpus.
    review = re.sub('[^a-zA-Z0-9]', ' ', sentence)
    review = review.lower().split()  # lower-case, then split into a list of words
    words = nltk.word_tokenize(sentence)
    review = [ps.stem(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))
    # Overwrite the raw sentence with its space-joined tokens (same list length,
    # so updating in place while enumerating is safe).
    sentences[i] = ' '.join(words)

In [18]:
# Display the cleaned, lower-cased, stemmed test text (now bound to `corpus`).
corpus

['one 1 two 2 three 3 tenth']

In [19]:
# Display the test sentence after word-tokenising and re-joining with spaces.
sentences

['One 1 two 2 three 3 Tenth']

In [20]:
# Encode the test corpus with the vectorizer fitted on the training corpus
# (transform only, no refit), so the columns match the training feature names.
X_test = cv.transform(corpus)

In [21]:
# Convert the test TF-IDF result to a dense array for display; only columns
# for words present in the fitted vocabulary can be non-zero.
X_test.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.57735027]])

In [22]:
# Feature names are unchanged by transform(): the vocabulary is fixed at fit time.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, so wrap it in
# list() to keep the plain-list display.
list(cv.get_feature_names_out())

['10',
 'eight',
 'five',
 'four',
 'nine',
 'one',
 'seven',
 'six',
 'ten',
 'three',
 'two']