# Scrap Using snscrape (cli)

In [None]:
# Run the pip install command below if you don't already have the library
# !pip install git+https://github.com/JustAnotherArchivist/snscrape.git

# Run the below command if you don't already have Pandas
# !pip install pandas

# Imports
import os
import pandas as pd

# Scrap sport data

In [None]:
# Setting variables to be used in format string command below
tweet_count = 5000
text_query = "sport"
since_date = "1967-11-01"
until_date = "2022-11-01"

# Using OS library to call CLI commands in Python
os.system('snscrape --jsonl --max-results {} --since {} twitter-search "{} until:{}"> sport.json'.format(tweet_count, since_date, text_query, until_date))

In [None]:
# Reads the json generated from the CLI command above and creates a pandas dataframe
tweets_df1 = pd.read_json('sport.json', lines=True)

# Displays first 5 entries from dataframe
tweets_df1.head()

In [None]:
# Export dataframe into a CSV
tweets_df1.to_csv('sports.csv', sep=',', index=False)

# Scrap food data

In [None]:
# Setting variables to be used in format string command below
tweet_count = 5000
text_query = "food"
since_date = "1967-11-01"
until_date = "2022-11-01"

# Using OS library to call CLI commands in Python
os.system('snscrape --jsonl --max-results {} --since {} twitter-search "{} until:{}"> food.json'.format(tweet_count, since_date, text_query, until_date))

In [None]:
# Reads the json generated from the CLI command above and creates a pandas dataframe
tweets_df2 = pd.read_json('food.json', lines=True)

# Displays first 5 entries from dataframe
tweets_df2.head()

In [None]:
# Export dataframe into a CSV
tweets_df2.to_csv('food.csv', sep=',', index=False)

# Scrap car data

In [None]:
# Setting variables to be used in format string command below
tweet_count = 5000
text_query = "car"
since_date = "2021-11-01"
until_date = "2022-11-01"

# Using OS library to call CLI commands in Python
os.system('snscrape --jsonl --max-results {} --since {} twitter-search "{} until:{}"> car.json'.format(tweet_count, since_date, text_query, until_date))

In [None]:
# Reads the json generated from the CLI command above and creates a pandas dataframe
tweets_df3 = pd.read_json('car.json', lines=True)

# Displays first 5 entries from dataframe
tweets_df3.head()

In [None]:
# Export dataframe into a CSV
tweets_df3.to_csv('car.csv', sep=',', index=False)

## Step 1: Load the dataset

In [1]:
'''
Load the dataset from the CSV and save it to 'data_text'
'''
import pandas as pd

data = pd.read_csv('car.csv', error_bad_lines=False);

# We only need the Headlines text column from the data
data_text = data[:5000][['Text']];

data_text['index'] = data_text.index

documents = data_text

In [2]:
'''
Load the dataset from the CSV and save it to 'data_text1'
'''
import pandas as pd

data1 = pd.read_csv('food.csv',
                 lineterminator='\n')

# We only need the Headlines text column from the data
data_text1 = data1[:5000][['Text']];



data_text1['index'] = data_text1.index



In [3]:
'''
Load the dataset from the CSV and save it to 'data_text2'
'''
import pandas as pd

data2 = pd.read_csv('sports.csv',
                 lineterminator='\n')

# We only need the Headlines text column from the data
data_text2 = data2[:5000][['Text']];

frames = [data_text, data_text1, data_text2]

data_text2['index'] = data_text2.index

documents = pd.concat(frames)

Let's look at the dataset:

In [4]:
'''
Get the total number of documents
'''
print(len(documents))

15000


In [5]:
documents[:5]

Unnamed: 0,Text,index
0,My '06 Chevy Silverado 1500HD with 6.0 liter ...,0
1,"I have owned 5 Silverado's since 1999, would ...",1
2,I am a line driver for a local trucking compa...,2
3,We purchased this thruck to pull a 33 ft Amer...,3
4,This has been the best truck I've ever owned....,4


## Step 2: Data Preprocessing ##
For example, tokenization, stopwords removal, lemmatized and stemmed words


In [6]:
'''
Loading Gensim and nltk libraries
'''
# !pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk

In [7]:
#nltk.download('wordnet')

In [8]:
'''
Function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    
    result = []
    
    for token in gensim.utils.simple_preprocess(text) :
        
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            
            # TODO: Apply lemmatize_stemming() on the token, then add to the results list
            result.append(lemmatize_stemming(token))
    
    return result



In [9]:
'''
Preview a document after preprocessing
'''
stemmer = SnowballStemmer("english")
document_num = 4310
doc_sample = documents[documents['index'] == document_num].values[0][0]

print("Original document: ")

words = []

for word in doc_sample.split(' '):
    words.append(word)
    
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['', 'Corvette', 'performance', 'with', 'a', 'cavaliers', '\rquality.', '', 'a/c', 'went', 'out', 'at', '37K', 'miles,', '', '\rthanks', 'GM,', '', 'power', 'windows', 'roll', 'up', 'and', '\rdown', 'like', 'molasses.', '', '', 'motors', 'replaced', '\rtwice.', '', '', 'ttops', 'are', 'starting', 'to', 'leak.', '', '\rradio', 'speakers', 'blew.', '\r', 'handles', 'like', 'a', 'dream,', '', 'fast', 'as', 'hell.', '', '\rbrakes', 'stop', 'on', 'a', 'dime.\r', 'i', 'love', 'this', 'car', 'lol']


Tokenized and lemmatized document: 
['corvett', 'perform', 'cavali', 'qualiti', 'go', 'mile', 'thank', 'power', 'window', 'roll', 'like', 'molass', 'motor', 'replac', 'twice', 'ttop', 'start', 'leak', 'radio', 'speaker', 'blow', 'handl', 'like', 'dream', 'fast', 'hell', 'brake', 'stop', 'dime', 'love']


In [10]:
documents

Unnamed: 0,Text,index
0,My '06 Chevy Silverado 1500HD with 6.0 liter ...,0
1,"I have owned 5 Silverado's since 1999, would ...",1
2,I am a line driver for a local trucking compa...,2
3,We purchased this thruck to pull a 33 ft Amer...,3
4,This has been the best truck I've ever owned....,4
...,...,...
4995,Smaller than I expected which is my fault but ...,4995
4996,Bought this for my 5 year old grandson and he ...,4996
4997,Excellent. Very good material.,4997
4998,this product made me a daredevil. removed my f...,4998


Preprocess all the text

**Note**: This may take a few minutes

In [11]:
'''
Preprocess all the texts, saving the list of results as 'processed_docs'
'''
processed_docs = documents['Text'].map(preprocess)

In [12]:
'''
Preview 'processed_docs'
'''
processed_docs[:10]

0    [chevi, silverado, liter, awesom, truck, wheel...
1    [own, silverado, consid, truck, allison, trans...
2    [line, driver, local, truck, compani, yard, wa...
3    [purchas, thruck, pull, americamp, trailer, pl...
4    [best, truck, own, alaska, florida, tow, trave...
5    [purchas, truck, truck, year, camper, haulin, ...
6    [buy, truck, februari, year, mile, mile, run, ...
7    [haul, pallet, pave, stone, sweat, tow, packag...
8    [recent, purchas, truck, know, test, drive, go...
9    [buy, truck, septemb, tow, travel, trailer, pl...
Name: Text, dtype: object

## Step 3.1: Bag of words on the dataset

In [13]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [14]:
'''
Checking dictionary created
'''
count = 0

for k, v in dictionary.iteritems():
    
    print(k, v)
    
    count += 1
    
    if count > 10:
        break

0 awesom
1 camper
2 chevi
3 countri
4 economi
5 fall
6 fuel
7 liter
8 long
9 mile
10 mileag


** Gensim filter_extremes **

[`filter_extremes(no_below=5, no_above=0.5, keep_n=100000)`](https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes)

Filter out tokens that appear in

* less than no_below documents (absolute number) or
* more than no_above documents (fraction of total corpus size, not absolute number).
* after (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).

In [15]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 5 times
- words appearing in more than 50% of all documents
'''
# TODO: apply dictionary.filter_extremes() with the parameters mentioned above
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

** Gensim doc2bow **

[`doc2bow(document)`](https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2bow)

* Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [16]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [17]:
'''
Checking Bag of Words corpus for our sample document --> (token_id, token_count)
'''
bow_corpus[document_num]

[(9, 1),
 (22, 1),
 (36, 1),
 (43, 2),
 (47, 1),
 (63, 1),
 (84, 1),
 (97, 1),
 (101, 1),
 (117, 1),
 (124, 1),
 (165, 1),
 (189, 1),
 (224, 1),
 (262, 1),
 (382, 1),
 (409, 1),
 (464, 1),
 (513, 1),
 (607, 1),
 (741, 1),
 (754, 1),
 (815, 1),
 (994, 1),
 (1180, 1),
 (1704, 1),
 (2796, 1),
 (3480, 1)]

In [18]:
'''
Preview Bag-of-words for our sample preprocessed document
'''
# Here document_num is document number 4310 which we have checked in Step 2
bow_doc_4310 = bow_corpus[document_num]

for i in range(len(bow_doc_4310)):
    
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 9 ("mile") appears 1 time.
Word 22 ("power") appears 1 time.
Word 36 ("dream") appears 1 time.
Word 43 ("like") appears 2 time.
Word 47 ("love") appears 1 time.
Word 63 ("perform") appears 1 time.
Word 84 ("start") appears 1 time.
Word 97 ("motor") appears 1 time.
Word 101 ("replac") appears 1 time.
Word 117 ("hell") appears 1 time.
Word 124 ("thank") appears 1 time.
Word 165 ("go") appears 1 time.
Word 189 ("brake") appears 1 time.
Word 224 ("handl") appears 1 time.
Word 262 ("blow") appears 1 time.
Word 382 ("fast") appears 1 time.
Word 409 ("leak") appears 1 time.
Word 464 ("corvett") appears 1 time.
Word 513 ("twice") appears 1 time.
Word 607 ("qualiti") appears 1 time.
Word 741 ("stop") appears 1 time.
Word 754 ("dime") appears 1 time.
Word 815 ("radio") appears 1 time.
Word 994 ("roll") appears 1 time.
Word 1180 ("window") appears 1 time.
Word 1704 ("speaker") appears 1 time.
Word 2796 ("cavali") appears 1 time.
Word 3480 ("molass") appears 1 time.


## Step 3.2: TF-IDF on our document set ##

In [19]:
'''
Create tf-idf model object using models.TfidfModel on 'bow_corpus' and save it to 'tfidf'
'''
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
print(tfidf)

TfidfModel<num_docs=15000, num_nnz=327505>


In [20]:
'''
Apply transformation to the entire corpus and call it 'corpus_tfidf'
'''
corpus_tfidf = tfidf[bow_corpus]
print(corpus_tfidf[1])

[(0, 0.138133469731731), (11, 0.23409870433971797), (13, 0.44015955521596106), (14, 0.1395938935085208), (16, 0.584844002494709), (17, 0.15750453475695322), (18, 0.26891911964944715), (19, 0.1703065596103802), (20, 0.11740536660872368), (21, 0.11665888454858028), (22, 0.10679116858701652), (23, 0.08931462024018134), (24, 0.14249660210251763), (25, 0.1456266410619934), (26, 0.2423297799076739), (27, 0.3125814392663522)]


In [21]:
'''
Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
'''
from pprint import pprint

for doc in corpus_tfidf:
    
    pprint(doc)
    
    break

[(0, 0.19078924041872244),
 (1, 0.3511211447021249),
 (2, 0.17841896683468647),
 (3, 0.28254428149856087),
 (4, 0.2427502310046791),
 (5, 0.2511145439465789),
 (6, 0.19464950977579026),
 (7, 0.27509300844171564),
 (8, 0.16011651229993046),
 (9, 0.1294591660932731),
 (10, 0.16574250447273267),
 (11, 0.32333593060916294),
 (12, 0.36561089906305133),
 (13, 0.15198653042920923),
 (14, 0.3856127405082445),
 (15, 0.1149605557115203)]


## Step 4.1: Running LDA using Bag of Words ##

We are going for 3 topics in the document corpus.

Number of requested latent themes to be retrieved from the training corpus is indicated by the variable **num topics**.

Word ids (integers) are mapped to words in **id2word** (strings). It is used for topic printing, debugging, and determining the vocabulary size.

The quantity of additional processes to use for parallelization is **workers**. use all of the CPU cores by default.

The number of training passes through the corpus is **passes**.

In [22]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore  2 and 3 ,4,5passes
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=3, 
                                       id2word = dictionary, 
                                       passes = 4, 
                                       workers=2)

In [23]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.019*"drive" + 0.012*"mile" + 0.011*"problem" + 0.010*"great" + 0.010*"year" + 0.009*"buy" + 0.008*"replac" + 0.008*"truck" + 0.008*"like" + 0.007*"time"


Topic: 1 
Words: 0.058*"band" + 0.024*"great" + 0.022*"resist" + 0.019*"product" + 0.016*"work" + 0.012*"exercis" + 0.012*"good" + 0.011*"handl" + 0.011*"qualiti" + 0.011*"like"


Topic: 2 
Words: 0.017*"like" + 0.015*"good" + 0.014*"tast" + 0.012*"product" + 0.012*"flavor" + 0.012*"love" + 0.010*"great" + 0.009*"coffe" + 0.008*"food" + 0.007*"chip"




# Save lda model

In [24]:
from gensim.test.utils import datapath



#saving model to disk.

temp_file = datapath(r"C:\Users\60169\Documents\Unsupervised-Text-Clustering\lda_model")

lda_model.save(temp_file)



#loading model from disk

from gensim import  models

lda = models.ldamodel.LdaModel.load(temp_file)

## Step 4.2 Running LDA using TF-IDF ##

In [26]:
'''
Define lda model using corpus_tfidf, again using gensim.models.LdaMulticore()
'''

# 3 and 4 passes
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=3, 
                                             id2word = dictionary, 
                                             passes = 5, 
                                             workers=2)

In [27]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model_tfidf.print_topics(-1):
    
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.010*"love" + 0.009*"tast" + 0.008*"flavor" + 0.007*"coffe" + 0.007*"chip" + 0.006*"like" + 0.006*"food" + 0.006*"good" + 0.005*"product" + 0.005*"chocol"


Topic: 1 Word: 0.010*"drive" + 0.008*"truck" + 0.007*"mile" + 0.006*"problem" + 0.005*"look" + 0.005*"year" + 0.005*"great" + 0.005*"power" + 0.004*"vehicl" + 0.004*"replac"


Topic: 2 Word: 0.030*"band" + 0.015*"resist" + 0.013*"product" + 0.013*"great" + 0.012*"work" + 0.010*"workout" + 0.010*"qualiti" + 0.009*"exercis" + 0.008*"good" + 0.008*"easi"




# Save lda model TF-IDF

In [28]:
from gensim.test.utils import datapath

#saving model to disk.

temp_file1 = datapath(r"C:\Users\60169\Documents\Unsupervised-Text-Clustering\lda_model_tfidf")

lda_model_tfidf.save(temp_file1)



#loading model from disk

from gensim import  models

lda = models.ldamodel.LdaModel.load(temp_file)

## Step 5.1: Performance evaluation by classifying sample document using LDA Bag of Words model

We will check to see where our test document would be classified. 

In [29]:
'''
Text of sample document 4310
'''
processed_docs[4310]

4310    [corvett, perform, cavali, qualiti, go, mile, ...
4310    [regular, purchas, item, bulk, love, dairi, fr...
4310                                              [great]
Name: Text, dtype: object

In [30]:
'''
Check which topic our test document belongs to using the LDA Bag of Words model.
'''
document_num = 4310
# Our test document is document number 4310

for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9149332046508789	 
Topic: 0.019*"drive" + 0.012*"mile" + 0.011*"problem" + 0.010*"great" + 0.010*"year" + 0.009*"buy" + 0.008*"replac" + 0.008*"truck" + 0.008*"like" + 0.007*"time"

Score: 0.0727204903960228	 
Topic: 0.017*"like" + 0.015*"good" + 0.014*"tast" + 0.012*"product" + 0.012*"flavor" + 0.012*"love" + 0.010*"great" + 0.009*"coffe" + 0.008*"food" + 0.007*"chip"

Score: 0.012346294708549976	 
Topic: 0.058*"band" + 0.024*"great" + 0.022*"resist" + 0.019*"product" + 0.016*"work" + 0.012*"exercis" + 0.012*"good" + 0.011*"handl" + 0.011*"qualiti" + 0.011*"like"


### It has the highest probability (`x`) to be  part of the topic that we assigned as X, which is the accurate classification. ###

## Step 5.2: Performance evaluation by classifying sample document using LDA TF-IDF model

In [31]:
'''
Check which topic our test document belongs to using the LDA TF-IDF model.
'''
# Our test document is document number 4310
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9743115901947021	 
Topic: 0.010*"drive" + 0.008*"truck" + 0.007*"mile" + 0.006*"problem" + 0.005*"look" + 0.005*"year" + 0.005*"great" + 0.005*"power" + 0.004*"vehicl" + 0.004*"replac"

Score: 0.013053097762167454	 
Topic: 0.010*"love" + 0.009*"tast" + 0.008*"flavor" + 0.007*"coffe" + 0.007*"chip" + 0.006*"like" + 0.006*"food" + 0.006*"good" + 0.005*"product" + 0.005*"chocol"

Score: 0.012635283172130585	 
Topic: 0.030*"band" + 0.015*"resist" + 0.013*"product" + 0.013*"great" + 0.012*"work" + 0.010*"workout" + 0.010*"qualiti" + 0.009*"exercis" + 0.008*"good" + 0.008*"easi"


### It has the highest probability (`x%`) to be  part of the topic that we assigned as X. ###

## Step 6: Testing model on unseen document ##

In [36]:
unseen_document = "My Chevrolet Cavelier is tops in gas milage and would be an excellent car for a young person or old.  However, I think GM needs to improve on the sturdiness of the vehicle."

# Data preprocessingAbu likes every sport step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):

    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.815620481967926	 Topic: 0.019*"drive" + 0.012*"mile" + 0.011*"problem" + 0.010*"great" + 0.010*"year"
Score: 0.15214847028255463	 Topic: 0.058*"band" + 0.024*"great" + 0.022*"resist" + 0.019*"product" + 0.016*"work"
Score: 0.03223102167248726	 Topic: 0.017*"like" + 0.015*"good" + 0.014*"tast" + 0.012*"product" + 0.012*"flavor"


In [48]:
unseen_document = "I've been drinking International Coffee for 35 years now.  My elder sister drank Suisse Mocha when I went to her house, and the instant I hit adulthood, she offered me a cup. I really liked it, but later, my budget did not allow for it, so I tried making my own from a combination of cocoa mix and instant coffee."
# Data preprocessingAbu likes every sport step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):

    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8352952003479004	 Topic: 0.017*"like" + 0.015*"good" + 0.014*"tast" + 0.012*"product" + 0.012*"flavor"
Score: 0.1474548876285553	 Topic: 0.058*"band" + 0.024*"great" + 0.022*"resist" + 0.019*"product" + 0.016*"work"
Score: 0.017249947413802147	 Topic: 0.019*"drive" + 0.012*"mile" + 0.011*"problem" + 0.010*"great" + 0.010*"year"


# Recommendation Insurance System (Main Program)

- We will make recommendations to the user based on the comments that they posted.

In [11]:
'''
Loading Gensim and nltk libraries
'''
# !pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
from gensim.test.utils import datapath
from gensim import corpora
from gensim import  models
import pandas as pd
import time

#loading model and dictionary from disk
model_file = datapath(r"C:\\Users\\60169\\Documents\\FYP\\NoidAI\\NoidAI\\TopicClustering\\lda_model")
dict_file = datapath(r"C:\Users\60169\Documents\FYP\NoidAI\NoidAI\TopicClustering\\lda_model.id2word")

lda_model = models.ldamodel.LdaModel.load(model_file)
dictionary = corpora.Dictionary.load(dict_file)

#load lda model tfidf
model_file_tfidf = datapath(r"C:\\Users\\60169\\Documents\\FYP\\NoidAI\\NoidAI\\TopicClustering\\lda_model_tfidf")
dict_file_tfidf = datapath(r"C:\Users\60169\Documents\FYP\NoidAI\NoidAI\TopicClustering\\lda_model_tfidf.id2word")

lda_model_tfidf = models.ldamodel.LdaModel.load(model_file_tfidf)
dictionary_tfidf = corpora.Dictionary.load(dict_file_tfidf)

# NOTE: Path may differ in differeny machines, please configure accordingly.

'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    
    result = []
    
    for token in gensim.utils.simple_preprocess(text) :
        
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            
            # TODO: Apply lemmatize_stemming() on the token, then add to the results list
            result.append(lemmatize_stemming(token))
    
    return result


def get_cluster(document):
    # Data preprocessingAbu likes every sport step for the unseen document
    bow_vector = dictionary.doc2bow(preprocess(document))
    cluster = lda_model[bow_vector]
    df_cluster = pd.DataFrame(cluster, columns = ['index', 'score'])
    max_score = df_cluster.iloc[df_cluster['score'].idxmax()] 
    document_topic=int(max_score['index'])
    
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
    # topic 0 - car, topic 1 - sports, topic 2 - food
    if document_topic == 0:
        print("\nThis comment belongs to topic: CAR\n\n")
    elif document_topic == 1:
        print("\nThis comment belongs to topic: SPORTS\n\n")
    else :
        print("\nThis comment belongs to topic: FOOD\n\n")
        
    return document_topic

def get_cluster_tfidf(document):
    bow_vector_tfidf = dictionary_tfidf.doc2bow(preprocess(document))
    cluster_tfidf = lda_model_tfidf[bow_vector_tfidf]
    df_cluster_tfidf = pd.DataFrame(cluster_tfidf, columns = ['index', 'score'])
    max_score_tfidf = df_cluster_tfidf.iloc[df_cluster_tfidf['score'].idxmax()] 
    topic_tfidf=int(max_score_tfidf['index'])
    
    for index, score in sorted(lda_model_tfidf[bow_vector_tfidf], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
    
    # topic 0: Food, topic 1: Car, topic 2: Sports
    if topic_tfidf == 0:
        print("\nThis comment belongs to topic: FOOD")
    elif topic_tfidf == 1:
        print("\nThis comment belongs to topic: CAR")
    else :
        print("\nThis comment belongs to topic: SPORTS")
        
    return topic_tfidf

stemmer = SnowballStemmer("english")

# Insurance Products
# 0 - Life Insurance
# 1 - Health Insurance
# 2 - Car Insurance
# 3 - Accident Insurance
insurance_product = ['Life Insurance','Health Insurance','Car Insurance', 'Accident Insurance']

In [13]:
def show_menu():
    print ("\nMachine Learning Based Algorithm in Identifying Potential Insurance Customer in Social Media with Text")
    print ("--------------------------------------------------------------------------------------------------------")
    print ("1) Input Text")
    print ("Q) Exit\n")
 
def menu():
    while True:
        show_menu()
        choice = input('Enter your choice: ').lower()
        comment_recommendation = []
        if choice == '1':
            comment = input("\nPlease enter a comment:\n")
            print("\nLDA Bag of Words model")
            print("-------------------")
            get_cluster(comment)
            print("LDA TF-IDF model")
            print("-------------------")
            comment_cluster = get_cluster_tfidf(comment)
            # Recommender System based on text
            # Car person
            if comment_cluster == 1:
                comment_recommendation.append(insurance_product[0])
                comment_recommendation.append(insurance_product[2])
                comment_recommendation.append(insurance_product[3])
            # Exercise person
            elif comment_cluster == 2:
                comment_recommendation.append(insurance_product[0])
                comment_recommendation.append(insurance_product[3])
            else :
                comment_recommendation.append(insurance_product[1]) 
            # Display the Recommendation
            print("\n                              Insurance Recommendation ")
            print("--------------------------------------------------------------------------------")
            if len(comment_recommendation) != 0:
                print("Based on the comment you posted, we recommend you purchase :")
                cmm_rec_str = ''.join(comment_recommendation)
                cmm_rec_str
                
                for i in range(0,len(comment_recommendation), 1):
                    print('{}. {}'.format(i+1, comment_recommendation[i]))
                    
            time.sleep(5)
        elif choice == 'q':
            return
        else:
            print(f'Not a correct choice: <{choice}>,try again')
 
if __name__ == '__main__':
    import cv2
    import numpy as np
 
    
    menu()


Machine Learning Based Algorithm in Identifying Potential Insurance Customer in Social Media with Text
--------------------------------------------------------------------------------------------------------
1) Input Text
Q) Exit

Enter your choice: 1

Please enter a comment:
Weights with the Junior Youth Academy , who doesn’t like learning how to hold a bar :)

LDA Bag of Words model
-------------------
Score: 0.8499441742897034	 Topic: 0.058*"band" + 0.024*"great" + 0.022*"resist" + 0.019*"product" + 0.016*"work"
Score: 0.0765596255660057	 Topic: 0.017*"like" + 0.015*"good" + 0.014*"tast" + 0.012*"product" + 0.012*"flavor"
Score: 0.07349622994661331	 Topic: 0.019*"drive" + 0.012*"mile" + 0.011*"problem" + 0.010*"great" + 0.010*"year"

This comment belongs to topic: SPORTS


LDA TF-IDF model
-------------------
Score: 0.8528741002082825	 Topic: 0.030*"band" + 0.015*"resist" + 0.013*"product" + 0.013*"great" + 0.012*"work"
Score: 0.07384490221738815	 Topic: 0.010*"love" + 0.009*"tast"