In [None]:
import sys
sys.path.append('../preprocessing/')

import pandas as pd
import numpy as np
from preprocess import *

pd.set_option('display.max_colwidth', None)
df = pd.read_csv('../datasets/B2W-Reviews01.csv')

preprocessing_functions = ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords']

'''
In this list, you can add: 

'transform_to_lowercase'
'remove_special_characters' 
'remove_stopwords'
'remove_specific_phrases'
'perform_lemmatization'
'perform_stemming'
'''

processed_df = preprocess_data(data=df, 
                               preprocessing_funcs=preprocessing_functions, 
                               language='portuguese', 
                               column='review_text', target='overall_rating', verbose=1)

display(processed_df[['review_text', 'prep']])

vector, processed_df2 = vectorization(data=processed_df, verbose=1, target='overall_rating', vectorizer='tf-idf')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...


# Using a Pipeline

In [1]:
import sys
sys.path.append('../preprocessing/')

import pandas as pd
import numpy as np
from preprocess import *

pd.set_option('display.max_colwidth', None)
df = pd.read_csv('../datasets/B2W-Reviews01.csv')

from NLPipeline import *

# Define your arguments
preprocessing_funcs = ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords',
                       'remove_specific_phrases']
vectorizer = 'word2vec'#'tf-idf'

# Create the pipeline
pipeline = NLP_helper(preprocessing_funcs=preprocessing_funcs, vectorizer=vectorizer, 
                      verbose=1, column='review_text', target='overall_rating')

df = pd.read_csv('../datasets/B2W-Reviews01.csv')

pipeline = pipeline.fit(df)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords', 'remove_specific_phrases'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!

Preprocess --> remove_specific_phrases
Removing phrases from text...
Done!



Applying word2vec vectorizer...
Done!




In [2]:
pipeline

## Testing with a phrase (string):

In [5]:
pipeline.transform('Produto muito bom, gostei demais')


Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords', 'remove_specific_phrases'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!

Preprocess --> remove_specific_phrases
Removing phrases from text...
Done!



Applying word2vec vectorizer...
Done!




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,overall_rating
0,-0.495393,0.069200,-0.339179,-1.150147,0.384039,0.048096,0.334231,0.725881,0.529838,-0.419440,...,-0.258522,-0.397066,0.512122,0.428364,0.086858,-0.593617,-0.053689,-0.373857,0.140283,4
1,0.011293,0.280860,0.221437,-0.522343,0.210306,0.503963,0.409176,0.609754,-0.529044,-0.481166,...,0.297650,-0.464808,0.749755,0.210763,-0.015296,-0.197727,0.472738,-0.562463,-0.395864,4
2,-0.164217,0.448492,0.370639,0.097375,-0.349748,0.342414,0.583837,0.777439,-0.493861,-0.706168,...,0.144320,-0.163106,0.227909,0.255446,0.147902,-0.372484,0.098430,-0.062856,-0.086091,4
3,-0.393505,0.484699,-0.168094,-0.245215,0.208134,0.656831,0.911300,0.404897,-0.448410,-0.512638,...,0.840570,-0.667037,0.261023,0.615252,0.231175,-0.159496,0.243101,-0.106011,-0.049069,4
4,-0.223330,0.700668,0.100287,0.014698,-0.042949,0.013154,0.585475,0.899310,-0.076746,-0.452328,...,0.041951,-0.370409,0.049185,0.318019,0.100965,-0.573394,0.329070,0.125443,0.403637,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128987,-0.637712,0.655857,0.145270,-0.417383,-0.229299,0.253417,0.805816,0.285061,-0.047969,-0.443910,...,-0.035383,-0.416738,0.298230,0.108174,0.628186,-0.190050,-0.172946,-0.383705,-0.324342,5
128988,-0.406448,0.237701,-0.435656,-0.358214,-0.264523,0.275055,0.637916,1.156187,-0.048185,-0.138278,...,0.191495,0.411113,0.226898,0.831734,-0.216767,-0.387393,0.921502,-0.017488,0.272637,5
128989,-0.948896,0.532830,-0.417644,-0.360181,0.999682,0.563821,-0.593463,0.789793,0.302131,-0.148779,...,0.393547,0.725575,0.636854,1.269922,-0.632139,-0.940542,1.669040,-0.007492,0.258987,4
128990,-0.519471,0.719077,-0.278444,0.069418,0.468856,0.319932,1.191733,1.159829,-0.300604,-0.541649,...,-0.054234,-0.422115,0.009589,0.691641,0.269204,-0.314029,0.552153,-0.259369,0.482899,1


## Testing with a list:

In [11]:
pipeline.transform(
    ['Produto muito bom, gostei demais', 
     'Odiei péssimo atendimento e oferta']
)


Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords', 'remove_specific_phrases'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!

Preprocess --> remove_specific_phrases
Removing phrases from text...
Done!



Applying tf-idf vectorizer...
Done!




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
result

NameError: name 'result' is not defined

## Testing with a DataFrame:

In [6]:
df = pd.DataFrame(
    {'review_text': ['Produto muito bom, gostei demais', 
                     'Odiei péssimo atendimento e oferta']
    }
)

pipeline.transform(df)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.906216,0.403895,-0.702325,-0.147299,0.303203,0.527698,0.421332,1.18418,-1.562565,-0.779761,...,-0.877641,1.716837,0.242468,0.398737,1.406891,0.631482,-1.705007,0.071202,0.964499,0.564619
1,0.245077,0.799019,0.012044,0.160703,0.845049,-0.330357,0.273403,0.887222,0.029977,0.297038,...,0.072823,-0.348972,0.456879,0.896156,0.511712,0.718133,-0.33694,-0.198802,0.355038,0.055089


### Using the Pipeline methods separately:

In [7]:
only_preprocess = pipeline.named_steps['preprocess'].transform('Produto muito bom, gostei demais')
display(only_preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review_text,prep
0,"Produto muito bom, gostei demais",produto muito bom gostei demais


In [8]:
only_vectorize = pipeline.named_steps['vectorize'].transform(only_preprocess)
only_vectorize

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.906216,0.403895,-0.702325,-0.147299,0.303203,0.527698,0.421332,1.18418,-1.562565,-0.779761,...,-0.877641,1.716837,0.242468,0.398737,1.406891,0.631482,-1.705007,0.071202,0.964499,0.564619


### Using vector-specific functions:
* Here, we are using a Word2Vec internal function.

In [9]:
vector_pipeline = pipeline.named_steps['vectorize'].vector
similar_words = vector_pipeline.wv.most_similar('comprei')
print(similar_words)

[('adquiri', 0.7346892952919006), ('compramos', 0.6598737239837646), ('encomendei', 0.6003533005714417), ('escolhi', 0.5846759080886841), ('compro', 0.5716974139213562), ('pedi', 0.5635183453559875), ('dei', 0.5160222053527832), ('comprado', 0.5080247521400452), ('suqueiras', 0.5001016855239868), ('rezando', 0.4896581470966339)]
