# **Compare Topics between Twitter and TikTok descriptions for the same brand: Gucci**

In this notebook I apply NLP to Gucci's TikTok descriptions and Twitter tweets, more specifically LDA topic modelling:

https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05

https://lazarinastoy.com/topic-modelling-lda/

https://radimrehurek.com/gensim/models/ldamodel.html

Code-along video: https://www.youtube.com/watch?v=TKjjlp5_r7o


## **Installing and importing the libraries**

In [None]:
%%capture

# Install all needed libraries

!pip install pyLDAvis==2.1.2 -qq
!pip install gensim -qq

In [None]:
# import necessary libraries
import pandas as pd
from datetime import datetime
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  from collections import Iterable


## **Importing and Processing TikTok JSON Data**

In [None]:
# Load the TikTok descriptions exported as JSON and preview the first rows
TIKTOK_JSON_PATH = '/content/drive/MyDrive/Final Project/4. Natural Language Processing/GucciTikTokDesc.json'
df = pd.read_json(TIKTOK_JSON_PATH)
df.head()

Unnamed: 0,id,desc,createTime
0,6906535387636780032,Moving with the #GucciGrip watch,1608053082
1,6904284575254531072,Get it with #GucciGift,1607529023
2,6904224653649153024,Let’s do this one together. #GucciGift,1607515072
3,6903903579703086080,Moving into the holidays. #GucciGift,1607440316
4,6903809411899264000,Dancing at the Gucci Party. #GucciGift,1607418391


In [None]:
# Inspect data: print the frame summary (columns, dtypes, non-null counts)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          254 non-null    int64 
 1   desc        254 non-null    object
 2   createTime  254 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 6.1+ KB


In [None]:
# Drop fully duplicated rows, then print the summary again to see the new row count
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 0 to 171
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          142 non-null    int64 
 1   desc        142 non-null    object
 2   createTime  142 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.4+ KB


In [None]:
# Convert the Unix epoch seconds in `createTime` to calendar dates.
# pd.to_datetime(..., unit='s') interprets the epoch values as UTC, so the
# resulting dates do not depend on the time zone of the machine running the
# notebook (datetime.fromtimestamp converts to local time instead), and the
# conversion is vectorised rather than a Python-level loop.
df['createTime'] = pd.to_datetime(df['createTime'], unit='s').dt.date
df.head()

Unnamed: 0,id,desc,createTime
0,6906535387636780032,Moving with the #GucciGrip watch,2020-12-15
1,6904284575254531072,Get it with #GucciGift,2020-12-09
2,6904224653649153024,Let’s do this one together. #GucciGift,2020-12-09
3,6903903579703086080,Moving into the holidays. #GucciGift,2020-12-08
4,6903809411899264000,Dancing at the Gucci Party. #GucciGift,2020-12-08


In [None]:
# Time window covered by the TikTok posts; the twitter data is restricted to
# this same window further down so both sources are analysed over the same scope
created_dates = df['createTime']
date_min, date_max = created_dates.min(), created_dates.max()

print(date_min)
print(date_max)

2020-02-07
2021-10-30


## **Preparing TikTok Data**


In [None]:
# Set stopwords.
# The list is stored under its own name so the imported `stopwords` corpus
# reader is not overwritten: rebinding `stopwords` to the list made this cell
# fail when run a second time (a list has no `.words`) and hid the reader
# from every later cell.
stop_words = stopwords.words("english")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Lemmatize texts (used for the TikTok descriptions and for the tweets)
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for `model_name`, loading it only once."""
    if model_name not in _SPACY_PIPELINES:
        # Parser and NER stay disabled, as before: only POS tags and lemmas
        # are read from the processed documents.
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=["parser", "ner"])
    return _SPACY_PIPELINES[model_name]


def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each text, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of descriptions).
        allowed_postags: Coarse POS tags (`token.pos_`) to keep; tokens with
            any other tag are dropped. The default list is never mutated.

    Returns:
        A list with one string per input text: the lemmas of the kept
        tokens joined by single spaces (an empty string if none are kept).
    """
    # Reuse the cached pipeline instead of reloading the model on every call.
    nlp = _get_spacy_pipeline()
    allowed = set(allowed_postags)
    # nlp.pipe streams the texts through the pipeline in batches rather than
    # invoking the pipeline once per text.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed)
        for doc in nlp.pipe(texts)
    ]


lemmatized_texts = lemmatization(df['desc'])
# Preview (at most 90 characters of) the first lemmatized description
first_lemmatized_text = lemmatized_texts[0]
print(first_lemmatized_text[:90])

move watch


In [None]:
# Gensim processing: turn each lemmatized text into a list of tokens
def gen_words(texts):
    """Tokenize every text with gensim's `simple_preprocess`.

    `deacc=True` is passed through to `simple_preprocess` for each text.
    Returns a list of token lists, one per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

data_words = gen_words(lemmatized_texts)

# Preview (at most 20 tokens of) the first tokenized description
first_doc_words = data_words[0]
print(first_doc_words[:20])

['move', 'watch']


In [None]:
# Build the gensim dictionary for the TikTok tokens and the bag-of-words corpus
id2word = corpora.Dictionary(data_words)

# One doc2bow result (a list of (token_id, count) pairs) per document
corpus = [id2word.doc2bow(text) for text in data_words]

print(corpus[0][0:20])

# The former index expression `[0][:1][0]` always evaluates to 0, so this
# simply looks up the word stored under token id 0.
word = id2word[0]
print(word)

[(0, 1), (1, 1)]
move


In [None]:
# Create LDA Model for the TikTok descriptions
tiktok_lda_params = dict(
    num_topics=30,
    random_state=100,  # fixed seed value passed to the model
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    **tiktok_lda_params,
)

## **Visualizing TikTok Data**

In [None]:
# TikTok Topic Modeling Visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis

## **Compare with Tweet Data**

In [None]:
# Load the tweets and keep only the Gucci ones
dftweets = pd.read_csv('/content/drive/MyDrive/Final Project/2. Sentiment Analysis/Tweet_Posts.csv')

# Keep the Gucci rows and only the relevant columns, then replace NaN with ''.
# Columns are selected by name rather than by position so a reordered CSV
# cannot silently pick the wrong data, and fillna builds a new frame instead
# of running an in-place replace on a filtered slice.
dftweets = (
    dftweets.loc[dftweets['brand'] == 'Gucci', ['id', 'created_at', 'text']]
    .fillna('')
)

# show data
dftweets.head(3)

Unnamed: 0,id,created_at,text
39217,1458706068046065664,2021-11-11 08:00:02,minashin joins as new gucci global brand ambas...
39218,1458675864183484419,2021-11-11 06:00:01,joining gucci as new global brand ambassador i...
39219,1458226659363106816,2021-11-10 00:15:02,at the london premiere of and pinault all wo...


In [None]:
# Reformat time variable: parse `created_at` and keep only the calendar date
dftweets['created_at'] = pd.to_datetime(dftweets['created_at']).dt.date
dftweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3250 entries, 39217 to 42466
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          3250 non-null   int64 
 1   created_at  3250 non-null   object
 2   text        3250 non-null   object
dtypes: int64(1), object(2)
memory usage: 101.6+ KB


In [None]:
# Filter data between the two dates of the scope (both bounds included)
in_scope = dftweets['created_at'].between(date_min, date_max)
dftweets = dftweets.loc[in_scope]

dftweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1601 entries, 39282 to 40882
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1601 non-null   int64 
 1   created_at  1601 non-null   object
 2   text        1601 non-null   object
dtypes: int64(1), object(2)
memory usage: 50.0+ KB


In [None]:
# Lemmatize tweets data and preview (at most 90 characters of) the first result
lemmatized_tweets = lemmatization(dftweets['text'])
first_lemmatized_tweet = lemmatized_tweets[0]
print(first_lemmatized_tweet[:90])

come soon boulevard city stud


In [None]:
# Gensim processing tweets data and preview (at most 20 tokens of) the first tweet
tweets_words = gen_words(lemmatized_tweets)
first_tweet_words = tweets_words[0]
print(first_tweet_words[:20])

['come', 'soon', 'boulevard', 'city', 'stud']


In [None]:
# Build the gensim dictionary for the tweet tokens and the bag-of-words corpus
id2word_tweets = corpora.Dictionary(tweets_words)

# One doc2bow result (a list of (token_id, count) pairs) per tweet
corpus_tweets = [id2word_tweets.doc2bow(text) for text in tweets_words]

print(corpus_tweets[0][0:20])

# The former index expression `[0][:1][0]` always evaluates to 0, so this
# simply looks up the word stored under token id 0.
word_tweets = id2word_tweets[0]
print(word_tweets)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
boulevard


In [None]:
# Create LDA model for the tweets (same hyper-parameter values as the TikTok model)
tweets_lda_params = dict(
    num_topics=30,
    random_state=100,  # fixed seed value passed to the model
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)
lda_model_tweets = gensim.models.ldamodel.LdaModel(
    corpus=corpus_tweets,
    id2word=id2word_tweets,
    **tweets_lda_params,
)

In [None]:
# Tweets Topic Modeling visualization
pyLDAvis.enable_notebook()
vis_tweets = pyLDAvis.gensim.prepare(
    lda_model_tweets,
    corpus_tweets,
    id2word_tweets,
    mds="mmds",
    R=30,
)
vis_tweets