# Text Mining Example - TextMiner

## Installation

In [1]:
## Install Library (one-time setup)
## Uncomment the next line to install the pinned release used by this notebook.
#!pip install textmining-module==0.1.4
## Package page: https://pypi.org/project/textmining-module/

### Package

In [2]:
## Load up Library
from textmining_module import TextMiner
from textmining_module import KeywordsExtractor


### Data Exploration

In [3]:
## Sample Data

import os
from pathlib import Path

import pandas as pd
import numpy as np

# Directory holding the example files. It defaults to the location the notebook
# was originally written against; set the TEXTMINER_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "TEXTMINER_DATA_DIR",
    "C:/Users/kwadw.DESKTOP-T9BSTPE/OneDrive/Desktop/My Python Packages/backwardsreg",
))

# Load the Sephora product reviews used throughout this example.
data = pd.read_csv(DATA_DIR / "sephora_reviews.csv")


In [4]:
## Preview the first 10 rows of the raw reviews table
data.head(10)

Unnamed: 0,url,name,pid,brand,breadcrumbs,user_name,review_title,rating,review_text,helpful_count,images,reviewed_at,uniq_id,scraped_at
0,https://www.sephora.com/ca/en/product/k-by-dol...,K by Dolce & Gabbana,P449367,DOLCE & GABBANA,Men,wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03
1,https://www.sephora.com/ca/en/product/full-cov...,Full Cover Concealer,P151107,MAKE UP FOR EVER,"Makeup, Face, Concealer",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03
2,https://www.sephora.com/ca/en/product/pat-mcgr...,Dark Star Volumizing Mascara,P460745,PAT McGRATH LABS,"Makeup, Eye, Mascara",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03
3,https://www.sephora.com/ca/en/product/teint-co...,Teint Couture Everwear 24H Wear & Comfort,P443301,Givenchy,"Makeup, Face, Foundation",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03
4,https://www.sephora.com/ca/en/product/k-by-dol...,K by Dolce & Gabbana,P449367,DOLCE & GABBANA,Men,regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03
5,https://www.sephora.com/ca/en/product/versace-...,Versace Man Eau Fraiche,P169301,Versace,"Fragrance, Men, Cologne",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03
6,https://www.sephora.com/ca/en/product/full-cov...,Full Cover Concealer,P151107,MAKE UP FOR EVER,"Makeup, Face, Concealer",regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03
7,https://www.sephora.com/ca/en/product/pat-mcgr...,Dark Star Volumizing Mascara,P460745,PAT McGRATH LABS,"Makeup, Eye, Mascara",regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03
8,https://www.sephora.com/ca/en/product/teint-co...,Teint Couture Everwear 24H Wear & Comfort,P443301,Givenchy,"Makeup, Face, Foundation",regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03
9,https://www.sephora.com/ca/en/product/k-by-dol...,K by Dolce & Gabbana,P449367,DOLCE & GABBANA,Men,ErinEngraved,Like Cigarette Smoke,2,This smells great in the container. I lot this...,0.714286,,2021-05-15,e3343544-6a6d-5e9b-b4de-e0f4bb12fc1f,2022-08-03


In [5]:
## Derive the target column: a rating of exactly 5 is "Perfect",
## every other rating is "Not-Perfect".
data['status'] = np.where(data['rating'] == 5, "Perfect", "Not-Perfect")

### Format of Code - TextMiner

```python

text_modeling = TextMiner(data, 
                 comment_variable='Cleaned_text_column', target_variable='Target_column', 
                 strata_variable='LoB_column', keywords_variable=None, clean_words=None,  # data
                 search_mode='micro', n=3, top=1, stop_words=None, truncation_words=None, truncation_mode='right', # YAKE
                 fpg_min_support=1E-3, keep_strongest_association=False, removeOutersection=False, # FPG
                 req_len_complexity=False, req_importance_score=False, # Random Forest
                 verbose=True, preprocessing_only=False) # class use
```

Here's a detailed look at each of its arguments:

-   First positional argument (required) : (`pandas` dataframe) the dataset to analyse;
-   `comment_variable` : (str) name of the comment variable in `pandas`
    dataframe;
-   `target_variable` : (str) name of the target variable in `pandas`
    dataframe;
-   `truncation_words` : (str list) words at which a message is split
    and truncated to the left/right - e.g. when a French copy comes
    before/after an English message;
-   `truncation_mode` : (str) {'right' : remove rhs of message at
    truncation_word, 'left' : remove lhs of message at truncation_word};
-   `preprocessing_only` : (bool) if True, only clean (opt.), format,
    stratify (opt.) and truncate (opt.) given dataset;
-   `verbose` : (bool) if True, show a progress bar.
-   `strata_variable` : (str) name of the strata variable in `pandas`
    dataframe, for a stratified analysis - i.e. break down by LoB;
-   `req_len_complexity` : (bool) if True, include message length
    quartiles in analysis as a new qualitative attribute;
-   `removeOutersection` : (bool) if True, exclude keywords that contain
    other fetched keywords;
-   `search_mode` : (str) {'macro' : (for each strata) concatenate all
    rows in one chunk before extracting keywords, 'micro' : extract
    keywords row-wise};
-   `n` : (int) maximal number of grams (words excluding `stop_words`)
    that can form a `keyword`;
-   `top` : (int) how many n-grams to fetch;
-   `stop_words` : (str list) words to disregard in generation of
    n-grams;
-   `fpg_min_support` : (float) minimal support for FP Growth - try
    higher value if FPG takes too long;
-   `keep_strongest_association` : (bool) filter One Hot Data to keep
    highest supported bits before fetching association.
-   `req_importance_score`: (bool) find importance score for all bags of
    relevant keywords

In [6]:
# Example usage: preprocessing-only pass

## Build a TextMiner with preprocessing_only=True so that it only
## prepares (cleans/formats) the raw `review_text` column.
Cleaner = TextMiner(
    data,
    comment_variable='review_text',
    target_variable='status',
    truncation_words=None,
    truncation_mode='right',
    preprocessing_only=True,
    verbose=True,
)

## Store the prepared text alongside the original reviews; the mining
## step below reads it from the `Cleaned_review_text` column.
data['Cleaned_review_text'] = Cleaner.reqCleanedData()['review_text']

FloatProgress(value=0.0, description='Init...', style=ProgressStyle(bar_color='#00AEFF', description_width='in…

In [7]:
## Preview the first 10 rows again, now including the `status` target
## and the `Cleaned_review_text` column added above
data.head(10)

Unnamed: 0,url,name,pid,brand,breadcrumbs,user_name,review_title,rating,review_text,helpful_count,images,reviewed_at,uniq_id,scraped_at,status,Cleaned_review_text
0,https://www.sephora.com/ca/en/product/k-by-dol...,K by Dolce & Gabbana,P449367,DOLCE & GABBANA,Men,wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03,Perfect,love the scent of this candle and the beautifu...
1,https://www.sephora.com/ca/en/product/full-cov...,Full Cover Concealer,P151107,MAKE UP FOR EVER,"Makeup, Face, Concealer",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03,Perfect,love the scent of this candle and the beautifu...
2,https://www.sephora.com/ca/en/product/pat-mcgr...,Dark Star Volumizing Mascara,P460745,PAT McGRATH LABS,"Makeup, Eye, Mascara",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03,Perfect,love the scent of this candle and the beautifu...
3,https://www.sephora.com/ca/en/product/teint-co...,Teint Couture Everwear 24H Wear & Comfort,P443301,Givenchy,"Makeup, Face, Foundation",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03,Perfect,love the scent of this candle and the beautifu...
4,https://www.sephora.com/ca/en/product/k-by-dol...,K by Dolce & Gabbana,P449367,DOLCE & GABBANA,Men,regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03,Not-Perfect,"lovely scent in the jar, kind of smells like b..."
5,https://www.sephora.com/ca/en/product/versace-...,Versace Man Eau Fraiche,P169301,Versace,"Fragrance, Men, Cologne",wildflower80,Worth the purchasr,5,Love the scent of this candle and the beautifu...,1.0,,2021-09-01,e2bdc95f-a007-59e7-a615-662f5a1bc965,2022-08-03,Perfect,love the scent of this candle and the beautifu...
6,https://www.sephora.com/ca/en/product/full-cov...,Full Cover Concealer,P151107,MAKE UP FOR EVER,"Makeup, Face, Concealer",regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03,Not-Perfect,"lovely scent in the jar, kind of smells like b..."
7,https://www.sephora.com/ca/en/product/pat-mcgr...,Dark Star Volumizing Mascara,P460745,PAT McGRATH LABS,"Makeup, Eye, Mascara",regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03,Not-Perfect,"lovely scent in the jar, kind of smells like b..."
8,https://www.sephora.com/ca/en/product/teint-co...,Teint Couture Everwear 24H Wear & Comfort,P443301,Givenchy,"Makeup, Face, Foundation",regina881,,2,"Lovely scent in the jar, kind of smells like b...",0.5,,2021-06-12,24464c43-6264-5030-91e1-39687c5a4d53,2022-08-03,Not-Perfect,"lovely scent in the jar, kind of smells like b..."
9,https://www.sephora.com/ca/en/product/k-by-dol...,K by Dolce & Gabbana,P449367,DOLCE & GABBANA,Men,ErinEngraved,Like Cigarette Smoke,2,This smells great in the container. I lot this...,0.714286,,2021-05-15,e3343544-6a6d-5e9b-b4de-e0f4bb12fc1f,2022-08-03,Not-Perfect,this smells great in the container. i lot this...


In [8]:
## Stopwords to be removed
path_to_stopwords = "C:/Users/kwadw.DESKTOP-T9BSTPE/OneDrive/Desktop/My Python Packages/backwardsreg/stop_keywords.txt"

# One stop word per line. The `with` block guarantees the file handle is
# closed, and blank lines (e.g. the empty entry produced by a trailing
# newline) are skipped so no empty string ends up in the stop-word list.
with open(path_to_stopwords, 'r') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file if line.strip()]

## Mining Stage
# Keyword extraction on the cleaned text, stratified by brand, with
# importance scoring enabled (req_importance_score=True).
text_modeling = TextMiner(data, 
                 comment_variable='Cleaned_review_text', target_variable='status', 
                 strata_variable='brand', keywords_variable=None, clean_words=None,  # data
                 search_mode='micro', n=3, top=3, stop_words=stopwords, truncation_words=None, truncation_mode='right', # YAKE
                 fpg_min_support=1E-3, keep_strongest_association=True, removeOutersection=False, # FPG
                 req_len_complexity=False, req_importance_score=True, # Random Forest
                 verbose=True, preprocessing_only=False) # class use

FloatProgress(value=0.0, description='Init...', style=ProgressStyle(bar_color='#00AEFF', description_width='in…

In [9]:
## List the unique strata found in the data.
## With strata_variable='brand' (mining cell above) these are the brand names.

text_modeling.reqUniqueStratas()

['DOLCE & GABBANA',
 'MAKE UP FOR EVER',
 'PAT McGRATH LABS',
 'Givenchy',
 'Versace',
 'LAWLESS',
 'IT Cosmetics',
 'Tower 28 Beauty',
 'HERMÈS',
 'SEPHORA COLLECTION',
 'The INKEY List',
 'Laura Mercier',
 'GUERLAIN',
 'shu uemura',
 'Josie Maran',
 'Kate Somerville',
 'Anastasia Beverly Hills',
 'Juice Beauty',
 'Montblanc',
 'MILK MAKEUP',
 'Danessa Myricks Beauty',
 'CLINIQUE',
 'Dr. Dennis Gross Skincare',
 "Kiehl's Since 1851",
 'Acqua di Parma',
 'Lancôme',
 'Too Faced',
 'Dyson',
 'Living Proof',
 'Glow Recipe',
 'Issey Miyake',
 'Rare Beauty by Selena Gomez',
 'La Mer',
 'Bobbi Brown',
 'HUDA BEAUTY',
 'Jo Malone London',
 'KVD Beauty',
 'BeautyBio',
 'NEST New York',
 'By Rosie Jane',
 'fresh',
 'MAC Cosmetics',
 'Mizani',
 'tarte',
 'Comptoir Sud Pacifique',
 'StriVectin',
 'NUDESTIX',
 'Summer Fridays']

In [10]:
## List the unique values of the target variable (`status`).

text_modeling.reqUniqueTargets()

['Perfect', 'Not-Perfect']

In [11]:
## Keywords extracted from each review, requested per strata.
## The result is indexed by strata name; here we look at one brand.

text_modeling.reqYAKEKeywords()['DOLCE & GABBANA']

Unnamed: 0,Cleaned_review_text,status,keywords_yake
0,love the scent of this candle and the beautifu...,Perfect,"[candle, beautiful jar, top makes, scent, love..."
4,"lovely scent in the jar, kind of smells like b...",Not-Perfect,"[burnt chocolate, lovely scent, kind of smells..."
9,this smells great in the container. i lot this...,Not-Perfect,"[smells great, container, smells, scent]"
14,this is my favorite scent from voluspa so far ...,Perfect,"[favorite scent, coconut vanilla, tahtian coco..."
19,smells good but isn’t the strongest fragrance ...,Not-Perfect,"[smells good, candle, n’t the strongest, smell..."
24,"i was very excited to see a new, fresh scent f...",Not-Perfect,"[candle, perfect, apple blue clover, scent]"
51,this is really perfect for spring. it’s very r...,Perfect,"[perfect for spring, perfect, spring, scent]"
59,"love this scent, it is surprisingly refreshing...",Perfect,"[perfect for spring, candle, container, surpri..."
67,lovely scent! smells like spring time and not ...,Perfect,"[lovely scent, spring, lovely, smells, scent]"


In [12]:
## Similarity matrix for one strata.
## Rows and columns are labelled with the index of each observation in the
## cleaned data; cell (i, j) compares the keywords of observation i with the
## keywords of observation j.
## Example: row 0 / column 4 compares the keywords of observation 0 with those of observation 4.
text_modeling.reqSimilarityMatrix()['DOLCE & GABBANA']

Unnamed: 0,0,4,9,14,19,24,51,59,67
0,1.0,0.090909,0.125,0.125,0.111111,0.285714,0.125,0.166667,0.111111
4,0.090909,1.0,0.222222,0.1,0.090909,0.1,0.1,0.142857,0.5
9,0.125,0.222222,1.0,0.142857,0.125,0.142857,0.142857,0.181818,0.285714
14,0.125,0.1,0.142857,1.0,0.0,0.142857,0.142857,0.083333,0.125
19,0.111111,0.090909,0.125,0.0,1.0,0.125,0.0,0.076923,0.111111
24,0.285714,0.1,0.142857,0.142857,0.125,1.0,0.333333,0.3,0.125
51,0.125,0.1,0.142857,0.142857,0.0,0.333333,1.0,0.444444,0.285714
59,0.166667,0.142857,0.181818,0.083333,0.076923,0.3,0.444444,1.0,0.272727
67,0.111111,0.5,0.285714,0.125,0.111111,0.125,0.285714,0.272727,1.0


In [13]:
## Clusterize
## Returns two per-strata results: the cluster of each observation and the
## cluster of each keyword (both are displayed in the cells below).
## NOTE(review): the keyword is spelled `treshold` in this call; presumably
## that is the library's parameter name — confirm before "correcting" it.
cluster_observations, cluster_keywords = text_modeling.clusterize(treshold=0.7)

In [14]:
## Cluster assigned to each observation (row index) of the 'DOLCE & GABBANA' strata
cluster_observations['DOLCE & GABBANA']

Unnamed: 0,Cluster
0,4
4,1
9,2
14,5
19,6
24,3
51,3
59,3
67,1


In [15]:
## Cluster assigned to each keyword of the 'DOLCE & GABBANA' strata
cluster_keywords['DOLCE & GABBANA']

Unnamed: 0,Cluster
burnt chocolate,4
lovely scent,4
smells good,3
perfect for spring,6
smells great,5
candle,7
container,5
surprisingly refreshing,6
kind of smells,4
perfect,6


In [16]:
## Cleaned data for one strata, with the cluster attached.
## After clusterize, the prepared dataframe shows a `Cluster` column next to
## the cleaned text, the target and the extracted keywords.
text_modeling.reqCleanedData()['DOLCE & GABBANA']

Unnamed: 0,Cleaned_review_text,status,keywords_yake,Cluster
0,love the scent of this candle and the beautifu...,Perfect,"[candle, beautiful jar, top makes, scent, love...",4
4,"lovely scent in the jar, kind of smells like b...",Not-Perfect,"[burnt chocolate, lovely scent, kind of smells...",1
9,this smells great in the container. i lot this...,Not-Perfect,"[smells great, container, smells, scent]",2
14,this is my favorite scent from voluspa so far ...,Perfect,"[favorite scent, coconut vanilla, tahtian coco...",5
19,smells good but isn’t the strongest fragrance ...,Not-Perfect,"[smells good, candle, n’t the strongest, smell...",6
24,"i was very excited to see a new, fresh scent f...",Not-Perfect,"[candle, perfect, apple blue clover, scent]",3
51,this is really perfect for spring. it’s very r...,Perfect,"[perfect for spring, perfect, spring, scent]",3
59,"love this scent, it is surprisingly refreshing...",Perfect,"[perfect for spring, candle, container, surpri...",3
67,lovely scent! smells like spring time and not ...,Perfect,"[lovely scent, spring, lovely, smells, scent]",1


In [17]:
## Scoring texts or keywords
## NB: importance scores are only available when req_importance_score=True.
## This rebinds `text_modeling` to a new, non-stratified miner
## (strata_variable=None) that keeps a single one-word keyword per review
## (n=1, top=1); the model is fitted on it in the next cell.
text_modeling = TextMiner(
    data,
    comment_variable='Cleaned_review_text',
    target_variable='status',
    keywords_variable=None,
    strata_variable=None,
    search_mode='micro',
    n=1,
    top=1,
    stop_words=stopwords,
    fpg_min_support=1E-3,
    keep_strongest_association=False,
    removeOutersection=False,
    preprocessing_only=False,
    req_len_complexity=False,
    req_importance_score=True,
    verbose=True,
)

FloatProgress(value=0.0, description='Init...', style=ProgressStyle(bar_color='#00AEFF', description_width='in…

In [18]:
## Fit the scoring model on the miner built in the previous cell.
## NOTE(review): no random seed is passed here, so results may differ between
## runs — check whether `fit` accepts one.
model = text_modeling.fit(
    k_neighbors=3,
    n_round=10,
    n_fold=2,
    train_ratio=0.8,
    optim_metric='accuracy',
    n_jobs=1,
    skl_verbose=0,
    verbose=False,
)

In [19]:
## Tabulate the `mdi_importances` mapping of the fitted miner:
## one row per keyword (Feature) with its score (Importance).
## NOTE(review): presumably mean-decrease-in-impurity scores — confirm.
mdi_importances_df = pd.DataFrame(
    list(text_modeling.mdi_importances.items()),
    columns=['Feature', 'Importance'],
)
mdi_importances_df

Unnamed: 0,Feature,Importance
0,lovely,0.097086
1,smells,0.249514
2,vanilla,0.061522
3,candle,0.095939
4,love,0.106091
5,container,0.125884
6,scent,0.064893
7,perfect,0.19907


In [20]:
## Permutation importances
## Tabulate the `perm_importances` mapping of the fitted miner:
## one row per keyword (Feature) with its score (Importance).
perm_importances_df = pd.DataFrame(
    list(text_modeling.perm_importances.items()),
    columns=['Feature', 'Importance'],
)
perm_importances_df

Unnamed: 0,Feature,Importance
0,lovely,0.075472
1,smells,0.235849
2,vanilla,0.00566
3,candle,0.041509
4,love,0.050943
5,container,0.0
6,scent,0.0
7,perfect,0.126415
