# Importing libraries

In [None]:
import nltk
from nltk import FreqDist
nltk.download('stopwords')

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth',200)
import numpy as np
import re
import spacy
import gensim
from gensim import corpora
import json


In [None]:
# Libraries for visualisation

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Reading data

In [None]:
# Load the review data; error_bad_lines=False asks pandas to drop malformed rows instead of raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where the
# equivalent is `on_bad_lines='skip'` — confirm which pandas version this notebook targets.
df = pd.read_csv('reviews_data.csv',error_bad_lines=False)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

# Data(Text) preprocessing 

In this step, we will remove punctuation and stopwords and normalise the reviews as much as possible. After every preprocessing step, it is good practice to check the most frequent words in the data.

In [None]:
# function to plot most frequent terms
def freq_words(x, terms = 30):
  """Plot a bar chart of the most frequent whitespace-separated words in ``x``.

  Parameters
  ----------
  x : iterable of str
      Documents to count words in (e.g. a pandas Series or a list). Entries
      that are not strings, such as NaN for a missing review, are skipped.
  terms : int, default 30
      Number of most frequent words to show.

  Returns
  -------
  None
      The chart is drawn with seaborn/matplotlib as a side effect.
  """
  # Skip non-string entries: ' '.join() raises TypeError on NaN/None.
  all_words = ' '.join(text for text in x if isinstance(text, str)).split()
  if not all_words:
    # nothing to count, so there is nothing to plot
    print('No words to plot.')
    return

  fdist = FreqDist(all_words)
  words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

  # selecting the `terms` most frequent words
  d = words_df.nlargest(columns="count", n = terms)
  plt.figure(figsize=(20,5))
  ax = sns.barplot(data=d, x= "word", y = "count")
  ax.set(ylabel = 'Count')
  plt.show()

In [None]:
# Baseline: most frequent words in the review text as loaded, before any cleaning.
freq_words(df['reviewText'])


 - Looking at the plot above, it's clear that the most commonly used words are 'the', 'and', 'to' and so forth.
 - Such words carry little meaning on their own, so we need to remove them.
 - It's good practice to keep checking after every pre-processing step.

## Removing punctuations

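We also need to remove punctuation. The pattern below replaces every character that is not a letter or `#` with a space, so digits and other special characters are removed as well.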

In [None]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged.
df['reviewText'] = df['reviewText'].str.replace("[^a-zA-Z#]", " ", regex=True)

## Removing stopwords

In [None]:
# Load NLTK's English stopword list (the corpus is fetched by nltk.download('stopwords') in the first cell).
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [None]:
# peek at the first ten stopwords
stop_words[:10]

In [None]:
# function to remove stopwords
def remove_stopwords(rev, stopword_list=None):
    """Return the tokens of ``rev`` joined by spaces, with stopwords removed.

    Matching is case-insensitive, so capitalised stopwords such as 'The' or
    'This' are dropped as well; the tokens that are kept retain their
    original casing.

    Parameters
    ----------
    rev : iterable of str
        Tokens of a single review.
    stopword_list : iterable of str, optional
        Stopwords to remove. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # a set gives O(1) membership tests instead of scanning the list per token
    stopword_set = {word.lower() for word in stopword_list}
    rev_new = " ".join([i for i in rev if i.lower() not in stopword_set])
    return rev_new


In [None]:
# drop words of one or two characters (keep only words with length >= 3)
df['reviewText'] = df['reviewText'].apply(
    lambda review: ' '.join(word for word in review.split() if len(word) >= 3)
)

In [None]:
# tokenise each review on whitespace and strip its stopwords
reviews = []
for review_text in df['reviewText']:
    reviews.append(remove_stopwords(review_text.split()))

In [None]:
# lowercase every review string
reviews = list(map(str.lower, reviews))

In [None]:
# re-check the 35 most frequent words after cleaning
freq_words(reviews, terms=35)


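 - We can see that more meaningful words such as 'car', 'battery' and 'oil' have now become the most frequent words.
 - However, words such as 'the', 'this' and 'they' may still appear: stopwords that are capitalised in the raw text (e.g. 'The') are only removed when the stopword matching ignores case.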

## Lemmatisation

Lemmatisation is the process of reducing a word to its base form (lemma), so that multiple forms of the same word are counted as a single term.

In [None]:
# One-time setup: uncomment and run to download the spaCy English model loaded in the next cell.
# !python -m spacy download en

In [None]:
# Load the spaCy English pipeline with the parser and NER components disabled
# (only lemmas and part-of-speech tags are used by lemmatization() below).
# NOTE(review): the 'en' shortcut name was removed in spaCy v3, where the model is loaded
# by its package name, e.g. 'en_core_web_sm' — confirm the installed spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner'])


In [None]:
def lemmatization(texts, tags=('NOUN', 'ADJ')): # filter noun and adjective
    """Lemmatise tokenised documents, keeping only tokens with selected POS tags.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents; each document's tokens are re-joined with spaces
        before being passed to the notebook-level spaCy pipeline ``nlp``.
    tags : collection of str, default ('NOUN', 'ADJ')
        Part-of-speech tags (compared against ``token.pos_``) to keep.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    # nlp.pipe streams the documents through the pipeline in batches, which is
    # faster than calling nlp() once per document; Docs come back in input order.
    docs = nlp.pipe(" ".join(sent) for sent in texts)
    return [[token.lemma_ for token in doc if token.pos_ in tags] for doc in docs]

## Tokenise reviews and then lemmatise the tokenised review

In [None]:
# split every cleaned review into a list of word tokens
tokenized_reviews = pd.Series(reviews).map(str.split)
print(tokenized_reviews[1])

In [None]:
# Lemmatise every tokenised review (default tags keep nouns and adjectives only).
reviews_2 = lemmatization(tokenized_reviews)
print(reviews_2[1]) # print lemmatized review

### We have not just lemmatized the words but also filtered only nouns and adjectives. Let’s de-tokenize the lemmatized reviews and plot the most common words.

In [None]:
# join each review's lemmas back into a single space-separated string
reviews_3 = [' '.join(lemmas) for lemmas in reviews_2]

df['reviews'] = reviews_3

# most frequent words after lemmatisation and POS filtering
freq_words(df['reviews'], 35)

# Building the LDA model

We will start by creating the term dictionary of our corpus, where every unique term is assigned an index



In [None]:
# Build the gensim dictionary that maps every unique lemma in the corpus to an integer id.
dictionary = corpora.Dictionary(reviews_2)


Then we will convert the list of reviews (reviews_2) into a Document Term Matrix using the dictionary prepared above.



In [None]:
# bag-of-words representation of each lemmatised review
doc_term_matrix = list(map(dictionary.doc2bow, reviews_2))


### Creating the object for LDA model using gensim library


In [None]:
# Short alias for gensim's LDA model class.
LDA = gensim.models.ldamodel.LdaModel


In [None]:
# Build LDA model

In [None]:
# Train a 7-topic LDA model; random_state is fixed so repeated runs give the same topics
lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=7,
    random_state=100,
    chunksize=1000,
    passes=50,
)

In [None]:
# Show the most significant words of each learned topic.
lda_model.print_topics()


# Topics visualisation

In [None]:
# Render the interactive pyLDAvis topic browser inline in the notebook.
# NOTE(review): newer pyLDAvis releases renamed the `pyLDAvis.gensim` module to
# `pyLDAvis.gensim_models` — confirm the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
vis