# Topic Modelling on Car Reviews

In [1]:
# Fetch the repository that ships cars_reviews.csv (read by the data-loading cell below).
!git clone https://github.com/ketanmewara/Topic-Modelling.git

Cloning into 'Topic-Modelling'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 3 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.


In [None]:
# pip install pyLDAvis

In [4]:
# Data handling and text-cleaning utilities
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

# WordNet lemmatizer plus the NLTK corpora it is downloaded with
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmaObj = WordNetLemmatizer()  # shared instance used by the lemmatization cell

# Topic model, train/test split and text vectorizers
from sklearn import decomposition
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# libraries for visualization
# NOTE(review): the visualization cell further down uses pyLDAvis.sklearn;
# pyLDAvis.gensim_models does not appear to be used in this notebook - confirm.
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries above.
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  from collections import Iterable


In [5]:
# Location of the reviews CSV inside the cloned repository.
reviews_csv_path = '/content/Topic-Modelling/cars_reviews.csv'
data = pd.read_csv(reviews_csv_path)
data.head()

Unnamed: 0,Review
0,Not for everyone. Fun to drive-- a tall man's...
1,
2,"This truck is the best truck you could \rbuy,..."
3,I love the car so far!
4,I have owned the 300H now for two years. Gas...


In [6]:
# Summarize column dtypes and non-null counts to spot missing reviews.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85127 entries, 0 to 85126
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  67958 non-null  object
dtypes: object(1)
memory usage: 665.2+ KB


In [7]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(85127, 1)

## Data Preprocessing

### Remove NaN Values

In [8]:
# Count the missing entries in each column.
data.isna().sum()

Review    17169
dtype: int64

In [9]:
# Drop rows whose review is missing and renumber the index from 0.
# The chained form builds a new frame instead of mutating in place, so the
# cell does the whole step in one assignment and gives the same result when re-run.
data = data.dropna().reset_index(drop=True)

In [10]:
# NOTE(review): redundant - the previous cell already resets the index after
# dropping nulls, so this is a harmless no-op.
data = data.reset_index(drop=True)

In [11]:
# Preview the frame after the null reviews were removed.
data.head()

Unnamed: 0,Review
0,Not for everyone. Fun to drive-- a tall man's...
1,"This truck is the best truck you could \rbuy,..."
2,I love the car so far!
3,I have owned the 300H now for two years. Gas...
4,I had a 2007 Chrysler Mini Van that was runni...


In [12]:
# (rows, columns) after dropping the null reviews.
data.shape

(67958, 1)

### Data Cleaning

In [13]:
# Contraction -> expansion lookup. Kept at module level so it is built once,
# not once per review.
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Punctuation that may be glued to a token ("can't," / "(don't)"). The
# apostrophe is deliberately absent because it is part of the contractions.
_EDGE_PUNCT = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

# Raw string: '\W' in a plain string literal is an invalid escape sequence.
_NON_WORD = re.compile(r'\W')


def _expand_contraction(token):
    """Expand `token` if, ignoring surrounding punctuation, it is a known contraction."""
    core = token.strip(_EDGE_PUNCT)
    expanded = _CONTRACTIONS.get(core)
    if expanded is None:
        return token
    # Swap only the contraction itself so the surrounding punctuation is kept
    # for the later non-word clean-up.
    return token.replace(core, expanded, 1)


def data_preprocess_func(data):
    """Normalize one raw review into lower-case text with punctuation removed.

    Steps, in order:
      1. strip HTML markup and decode entities (done first so that removing
         '#' below cannot corrupt numeric entities such as ``&#39;``);
      2. lower-case, trim, and map typographic apostrophes to ``'``;
      3. spell out ``% $ ₹ € @`` and drop ``#``;
      4. expand English contractions, including ones next to punctuation;
      5. replace every remaining non-word character with a space.

    Parameters
    ----------
    data : object
        The review; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned review. Runs of spaces may remain where punctuation was
        removed; downstream cells split on whitespace.
    """
    # Explicit parser: omitting it makes BeautifulSoup pick whichever parser is
    # installed and emit a warning.
    data = BeautifulSoup(str(data), 'html.parser').get_text()

    data = data.lower().strip()

    # Curly apostrophes would otherwise bypass every contraction rule below.
    data = data.replace('’', "'")

    # Replace special characters with their string
    data = data.replace('%', ' percent ')
    data = data.replace('$', ' dollar ')
    data = data.replace('₹', ' rupee ')
    data = data.replace('€', ' euro ')
    data = data.replace('@', ' at ')
    data = data.replace('#', '')

    data = ' '.join(_expand_contraction(word) for word in data.split())

    # Generic fallbacks for contractions that are not in the lookup table.
    data = data.replace("'ve", " have")
    data = data.replace("n't", " not")
    data = data.replace("'re", " are")
    data = data.replace("'ll", " will")

    # Remove punctuations
    return _NON_WORD.sub(' ', data).strip()

In [14]:
# Run the cleaning function on every raw review and keep the result in a new column.
data['clean_reviews'] = data['Review'].apply(data_preprocess_func)

In [15]:
# Remove Short Words
def _keep_long_tokens(text):
    """Return `text` with every whitespace-separated token of length <= 2 removed."""
    kept = []
    for token in text.split():
        if len(token) > 2:
            kept.append(token)
    return ' '.join(kept)

data['clean_reviews'] = data['clean_reviews'].apply(_keep_long_tokens)

In [16]:
# Compare the raw reviews with their cleaned counterparts.
data.head()

Unnamed: 0,Review,clean_reviews
0,Not for everyone. Fun to drive-- a tall man's...,not for everyone fun drive tall man sports car...
1,"This truck is the best truck you could \rbuy,...",this truck the best truck you could buy its de...
2,I love the car so far!,love the car far
3,I have owned the 300H now for two years. Gas...,have owned the 300h now for two years gas mile...
4,I had a 2007 Chrysler Mini Van that was runni...,had 2007 chrysler mini van that was running fi...


## Tokenization

In [17]:
# Tokenizing: split each cleaned review on whitespace into a list of words.
tokenized_reviews = data['clean_reviews'].str.split()
tokenized_reviews.head()

0    [not, for, everyone, fun, drive, tall, man, sp...
1    [this, truck, the, best, truck, you, could, bu...
2                                [love, the, car, far]
3    [have, owned, the, 300h, now, for, two, years,...
4    [had, 2007, chrysler, mini, van, that, was, ru...
Name: clean_reviews, dtype: object

## Lemmatization

In [18]:
def _lemmatize_as_verbs(tokens):
    """Lemmatize every token in `tokens` with the WordNet lemmatizer, using pos='v'."""
    return [lemmaObj.lemmatize(token, pos='v') for token in tokens]

lemmatized_reviews = tokenized_reviews.apply(_lemmatize_as_verbs)
lemmatized_reviews.head()

0    [not, for, everyone, fun, drive, tall, man, sp...
1    [this, truck, the, best, truck, you, could, bu...
2                                [love, the, car, far]
3    [have, own, the, 300h, now, for, two, years, g...
4    [have, 2007, chrysler, mini, van, that, be, ru...
Name: clean_reviews, dtype: object

In [19]:
# Store the lemmatized tokens back on the frame as space-separated strings.
data['lemmatized_reviews'] = lemmatized_reviews.apply(' '.join)

In [20]:
# Preview the raw, cleaned and lemmatized columns side by side.
data.head()

Unnamed: 0,Review,clean_reviews,lemmatized_reviews
0,Not for everyone. Fun to drive-- a tall man's...,not for everyone fun drive tall man sports car...,not for everyone fun drive tall man sport car ...
1,"This truck is the best truck you could \rbuy,...",this truck the best truck you could buy its de...,this truck the best truck you could buy its de...
2,I love the car so far!,love the car far,love the car far
3,I have owned the 300H now for two years. Gas...,have owned the 300h now for two years gas mile...,have own the 300h now for two years gas mileag...
4,I had a 2007 Chrysler Mini Van that was runni...,had 2007 chrysler mini van that was running fi...,have 2007 chrysler mini van that be run fine w...


## Splitting the data into Train and Test

In [21]:
# Hold out 30% of the lemmatized reviews; the fixed seed keeps the split repeatable.
X_train, X_test = train_test_split(
    data['lemmatized_reviews'],
    test_size=0.30,
    random_state=1,
)

### TFIDF Vectorizer

In [22]:
# NOTE: with use_idf=False and norm=None no IDF weighting or normalization is
# applied, so despite the name the matrix holds plain term counts.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.80,           # ignore terms found in more than 80% of the reviews
    min_df=20,             # ignore terms found in fewer than 20 reviews
    max_features=20000,    # cap on vocabulary size
    stop_words='english',
    use_idf=False,
    norm=None,
)
# Learn the vocabulary on the training reviews only and vectorize them.
tfidf = tfidf_vectorizer.fit_transform(X_train)

In [26]:
# Densify only the first few rows: converting the whole sparse matrix
# (47570 x 4907 in the recorded run) to a dense float array costs well over a
# gigabyte just to print a truncated preview.
tfidf[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# (number of training reviews, vocabulary size)
tfidf.shape

(47570, 4907)

## Model Building

### LDA with 20 topics

In [27]:
# Online variational LDA with 20 topics. random_state is fixed because the fit
# is stochastic: without it the topics, and their numbering, differ on every
# run, so saved outputs and the pickled model cannot be reproduced.
lda = decomposition.LatentDirichletAllocation(
    n_components=20,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    n_jobs=1,
    random_state=1,
)
lda_ = lda.fit_transform(tfidf)   # document-topic matrix for the training reviews
lda_comp = lda.components_        # topic-term matrix, one row per topic

In [34]:
# Document-topic matrix returned by fit_transform (one row per training review).
lda_

array([[0.00113636, 0.00113636, 0.00113636, ..., 0.00113636, 0.17681355,
        0.00113636],
       [0.0125    , 0.0125    , 0.0125    , ..., 0.0125    , 0.29159875,
        0.0125    ],
       [0.00102041, 0.00102041, 0.00102041, ..., 0.00102041, 0.00102041,
        0.00102041],
       ...,
       [0.00217391, 0.00217391, 0.00217391, ..., 0.00217391, 0.00217391,
        0.00217391],
       [0.005     , 0.005     , 0.005     , ..., 0.005     , 0.53613149,
        0.005     ],
       [0.00384615, 0.00384615, 0.00384615, ..., 0.00384615, 0.00384615,
        0.00384615]])

In [33]:
# Topic-term matrix of the fitted model (one row per topic).
lda.components_

array([[ 0.05525949,  0.05      ,  0.05      , ...,  0.05      ,
         0.05      ,  0.05000006],
       [ 0.05002546,  0.05      , 31.4324638 , ...,  0.05      ,
         0.05      ,  0.05      ],
       [ 0.05      ,  0.05      ,  0.05      , ...,  0.05000016,
         0.05      ,  0.05      ],
       ...,
       [ 0.05      ,  0.05      ,  0.05      , ...,  0.05      ,
         0.05      ,  0.05      ],
       [ 0.05089809,  0.05      ,  0.05      , ...,  0.05      ,
         0.05000001,  0.05      ],
       [ 0.05      ,  0.05      ,  0.05      , ...,  0.05000001,
         0.05      ,  0.05      ]])

### Get topics

In [40]:
num_words = 20

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and fall back for older installations.
_feature_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
                  or tfidf_vectorizer.get_feature_names)
vocab = np.array(_feature_names())


def top_words(words):
    """Return the `num_words` vocabulary terms with the largest weights in `words`.

    `words` is one row of the topic-term matrix; the terms come back ordered
    from highest to lowest weight.
    """
    # argsort is ascending, so walk the last `num_words` indices backwards.
    return [vocab[i] for i in np.argsort(words)[:-num_words-1:-1]]


topic_words = [top_words(c) for c in lda_comp]
# One space-separated string of top terms per topic.
topics = [' '.join(words) for words in topic_words]

In [41]:
# top 20 topics and 20 words
for topic_number, topic_terms in enumerate(topics, start=1):
    print("Topic {}: ".format(topic_number), topic_terms)

Topic 1:  toyota vehicle van new purchase subaru honda quality buy trade mazda better 2001 model price build make mini previous minivan
Topic 2:  kia hyundai guess near learn golf failure land warranty gti cruiser elantra sonata santa various volkswagen accent sorento present afraid
Topic 3:  winter fast cold weather air summer heat hot loose mount start live warm cool tend turn head heater slow condition
Topic 4:  seat control steer wheel rear tire power like road noise interior driver make turn light brake radio door sound hard
Topic 5:  car drive ride great smooth handle interior comfortable feel power like quiet look nice good engine excellent love sound fun
Topic 6:  turn fit finish tranny machine older radius 4runner smile face cloth roof model black taurus glass wrangler head awesome 70k
Topic 7:  shift transmission gear actually manual key park step traffic clutch 2nd middle 3rd rpm auto stick 1st easier bag trans
Topic 8:  car drive great fun love handle snow recommend look ro

### Graph Representation

In [42]:
# pyLDAvis 3.4 renamed the scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new name first so the cell runs on both.
try:
    import pyLDAvis.lda_model as lda_vis
except ImportError:
    import pyLDAvis.sklearn as lda_vis

pyLDAvis.enable_notebook()
# Interactive topic map built from the fitted model, the training matrix and the vectorizer.
lda_vis.prepare(lda, tfidf, tfidf_vectorizer, mds='tsne')

## Prediction on Test data

In [35]:
# None means "do not truncate". The old sentinel -1 was deprecated in
# pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
# The single held-out review used for the prediction demo below.
X_test[6:7].values

array(['the mercury mountaineer great suv the ride and handle excellent the power superb and the comfort great and from what can see the leather and other interior materials be great quality and the navigation system cool and the power moon roof and the sirius satellite radio be awesome and the rear park sensors come handy when parallel park and just really happy with mountaineer its be great far not problem and everything work great and its great price for what you get mine be only dollar 500 and the honda pilot with all the same equipment be over dollar 000 get great vehicle and great deal highly recommend the mercury mountaineer'],
      dtype=object)

In [36]:
# Vectorize the same held-out review with the fitted vectorizer and infer its topic mixture.
lda_pred = lda.transform(tfidf_vectorizer.transform(X_test[6:7]))

In [37]:
# Topic mixture of the selected review (one value per topic).
lda_pred

array([[0.00084746, 0.00084746, 0.00084746, 0.00084746, 0.25245536,
        0.00084746, 0.02371018, 0.08487382, 0.00084746, 0.00084746,
        0.07437433, 0.00084746, 0.00084746, 0.10751529, 0.00084746,
        0.00084746, 0.08409975, 0.00084746, 0.36195432, 0.00084746]])

In [38]:
def topic_prediction(text):
  """Return the top-word string of the dominant topic for each review in `text`.

  The reviews are vectorized with the fitted `tfidf_vectorizer` and scored
  with the fitted `lda` model. The earlier version ignored `text` and always
  reported on the global `lda_pred`, whatever was passed in.

  Parameters
  ----------
  text : str or iterable of str
      One preprocessed review, or several (e.g. ``X_test[6:7].values``).

  Returns
  -------
  list of str
      One entry of `topics` per input review, in input order; an empty list
      when no reviews are given.
  """
  # A bare string is one review, not an iterable of characters.
  docs = [text] if isinstance(text, str) else list(text)
  if not docs:
    return []

  doc_topic = lda.transform(tfidf_vectorizer.transform(docs))
  # Rounded to two decimals before the argmax, as before, so ties resolve to
  # the same (lowest-index) topic.
  significant_topic = np.argmax(np.round(doc_topic, 2), axis=1)
  return [topics[i] for i in significant_topic]

In [43]:
# Look up the dominant topic for the held-out review displayed earlier.
topic = topic_prediction(X_test[6:7].values)

In [44]:
# Top words of the predicted topic.
topic

['seat car room space like use feature need drive really great lot vehicle fit small comfortable easy good suv cargo']

## Save the model

In [45]:
# Persist the fitted vectorizer so new text can be transformed with the same vocabulary.
# NOTE: pickle files should only ever be loaded from trusted sources.
import pickle
with open('tfidf_vectorizer.pkl', 'wb') as f:
  pickle.dump(tfidf_vectorizer, f)

In [46]:
# Persist the fitted LDA model alongside the vectorizer.
with open('lda_model.pkl', 'wb') as f:
  pickle.dump(lda, f)