In [1]:
import re
import time

import pandas as pd
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords as stp
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


def text_normalize(samples: pd.Series) -> pd.Series:
    samples = samples.apply(lambda x: x.lower()) # convert all samples to lower case

    samples = samples.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x)) # replace any character which is not string or space with blank

    return samples

In [2]:

def text_tokenize(samples: pd.Series) -> pd.Series:
    toktok = ToktokTokenizer() # Much better speed than word_tokenize
    samples = samples.apply(lambda x: toktok.tokenize(x))

    return samples

In [3]:

def remove_stopwords(samples: pd.Series) -> pd.Series:
    stopwords = set(stp.words('english'))
    
    clean_samples = samples.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

    return clean_samples

In [4]:

def text_lemmatize(samples: pd.Series) -> pd.Series:
    wnl = WordNetLemmatizer()

    samples = samples.apply(lambda x: wnl.lemmatize(x))

    return samples

In [5]:

def tfidf(samples: pd.Series) -> pd.Series:
    return TfidfVectorizer().fit_transform(samples)

In [6]:
df = pd.read_csv('fakeReviewData.csv')

df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [7]:
df['text_'] = text_normalize(df['text_'])

df['text_'][0]

'love this  well made sturdy and very comfortable  i love itvery pretty'

In [8]:
df['text_'] = remove_stopwords(df['text_'])

In [9]:
df['text_'] = text_lemmatize(df['text_'])

In [10]:
df2 = pd.DataFrame(tfidf(df['text_']))

df2.head()

Unnamed: 0,0
0,<Compressed Sparse Row sparse matrix of dtype ...
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
3,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
