In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# Make sure the NLTK stop-word corpus is available locally before it is imported below
# (when the package is already installed, the call just reports it as up-to-date).
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Load the first 20 rows of the comments file. read_csv already returns a
# DataFrame, so it does not need to be wrapped in pd.DataFrame() again.
data = pd.read_csv('comments4.csv', nrows=20)

# Derive the working frame without mutating `data`:
#   * copy the raw comment text into a 'documents' column that later cells clean
#   * drop the video metadata columns that the topic model does not use
df = (
    data
    .assign(documents=data['Comment'])
    .drop(columns=['Video ID', 'Title'])
)
df

Unnamed: 0,Comment,documents
0,Swifties LANG MALAKAS!! Frankiana,Swifties LANG MALAKAS!! Frankiana
1,pa shout out po idol..,pa shout out po idol..
2,sana makita kita sa personal yan ang pangarap ...,sana makita kita sa personal yan ang pangarap ...
3,Millennial superstars...,Millennial superstars...
4,Hi myx if ever you will consider them to be re...,Hi myx if ever you will consider them to be re...
5,FRANKIANA lang,FRANKIANA lang
6,So nyc to watch fankiana.. they r so fun to wa...,So nyc to watch fankiana.. they r so fun to wa...
7,💜💜💜,💜💜💜
8,You're my my my......Lover!Aayyyieee💞👩‍❤️‍👩💞Fr...,You're my my my......Lover!Aayyyieee💞👩‍❤️‍👩💞Fr...
9,LOVERR💞💞💞,LOVERR💞💞💞


In [14]:
# Preprocessing
# 1. Replace every character that is not an ASCII letter or '#' with a space.
#    regex=True is passed explicitly: recent pandas versions treat the pattern
#    as a literal string by default, which would leave the text uncleaned.
letters_only = (
    df['documents']
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .fillna('')  # missing comments become empty documents
)

# 2. Drop words of one or two characters, then 3. lower-case what is left.
df['clean_documents'] = letters_only.apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2).lower()
)

df.head()

Unnamed: 0,Comment,documents,clean_documents
0,Swifties LANG MALAKAS!! Frankiana,Swifties LANG MALAKAS!! Frankiana,swifties lang malakas frankiana
1,pa shout out po idol..,pa shout out po idol..,shout out idol
2,sana makita kita sa personal yan ang pangarap ...,sana makita kita sa personal yan ang pangarap ...,sana makita kita personal yan ang pangarap mak...
3,Millennial superstars...,Millennial superstars...,millennial superstars
4,Hi myx if ever you will consider them to be re...,Hi myx if ever you will consider them to be re...,myx ever you will consider them regular vjs ku...


In [15]:
from nltk.corpus import stopwords
# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# remove stop-words
# Build the English stop-word set once; the list was previously re-evaluated
# and scanned linearly for every single token.
english_stopwords = set(stopwords.words('english'))
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
)

# de-tokenization
# Iterate over the Series values in order instead of looking rows up by the
# labels 0..len(df)-1, so this also works when df has a non-default index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['clean_documents'] = detokenized_doc

In [9]:
from nltk.corpus import stopwords
# NOTE(review): this cell repeats the stop-word removal of the preceding cell.
# Running it again is harmless (text that is already filtered contains no
# stop words), but it is a candidate for deletion.

# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# remove stop-words
# Build the English stop-word set once; the list was previously re-evaluated
# and scanned linearly for every single token.
english_stopwords = set(stopwords.words('english'))
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
)

# de-tokenization
# Iterate over the Series values in order instead of looking rows up by the
# labels 0..len(df)-1, so this also works when df has a non-default index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['clean_documents'] = detokenized_doc

In [16]:
# Preview the first five rows after stop-word removal
df.head(n=5)

Unnamed: 0,Comment,documents,clean_documents
0,Swifties LANG MALAKAS!! Frankiana,Swifties LANG MALAKAS!! Frankiana,swifties lang malakas frankiana
1,pa shout out po idol..,pa shout out po idol..,shout idol
2,sana makita kita sa personal yan ang pangarap ...,sana makita kita sa personal yan ang pangarap ...,sana makita kita personal yan ang pangarap mak...
3,Millennial superstars...,Millennial superstars...,millennial superstars
4,Hi myx if ever you will consider them to be re...,Hi myx if ever you will consider them to be re...,myx ever consider regular vjs kung pwede pleas...


In [17]:
# TF-IDF vector
# Vectorizer settings are collected in one place so they are easy to tune.
tfidf_options = {
    'stop_words': 'english',
    'smooth_idf': True,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the cleaned comments and encode them in one pass.
X = vectorizer.fit_transform(df['clean_documents'])

# Dense view of the document-term matrix, for inspection only.
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.24002129, ..., 0.        , 0.33033819,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.22062635, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [18]:
# Shape of the TF-IDF matrix: (number of documents, number of vocabulary terms).
X.shape 
# NOTE(review): the original note here read "A56   U(5,5). S()" — presumably a
# reminder of the SVD factor shapes (A = U * S * V^T); confirm or remove.

(20, 87)

In [19]:
# SVD represent documents and terms in vectors
# Two components are kept, i.e. every document is projected onto two topics;
# the fixed random_state keeps the randomized solver reproducible.
svd_settings = {
    'n_components': 2,
    'algorithm': 'randomized',
    'n_iter': 100,
    'random_state': 122,
}
svd_model = TruncatedSVD(**svd_settings)

# Fit on the TF-IDF matrix and return the per-document topic coordinates.
lsa = svd_model.fit_transform(X)

In [20]:
# Documents - Topic vector
# Show floats with 16 decimals so near-zero loadings remain visible.
pd.set_option('display.float_format', '{:,.16f}'.format)

# One row per document: its two topic coordinates plus the cleaned text.
topic_columns = ["topic_1", "topic_2"]
topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns).assign(
    documents=df['clean_documents']
)

# Put the text first so each row reads "document -> topic weights".
display(topic_encoded_df[["documents"] + topic_columns])

Unnamed: 0,documents,topic_1,topic_2
0,swifties lang malakas frankiana,0.6671442812894507,-0.0798442497132017
1,shout idol,0.0,4e-16
2,sana makita kita personal yan ang pangarap mak...,0.0366319126327491,0.48858466894818
3,millennial superstars,-0.0,-6e-16
4,myx ever consider regular vjs kung pwede pleas...,0.0396971714353398,0.4460576741796546
5,frankiana lang,0.7804455952291368,-0.0701217325040987
6,nyc watch fankiana fun watch smile wth lov fra...,0.2803467947204996,-0.0416249338896491
7,,0.0,-0.0
8,lover aayyyieee frankiana,0.4995980637049758,0.1282267357289708
9,loverr,0.0,8e-16


In [21]:
# Features or words used as features
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); prefer the new method and fall back to the old
# one so the cell runs on either side of the change. The result is kept a
# plain list of str in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()

In [22]:
# Display the full vocabulary learned by the TF-IDF vectorizer
dictionary

['aayyyieee',
 'amazing',
 'ang',
 'anong',
 'araw',
 'bagay',
 'bakit',
 'bonding',
 'challenge',
 'chemistry',
 'cite',
 'consider',
 'cover',
 'cute',
 'dahil',
 'dalawa',
 'diana',
 'faces',
 'fankiana',
 'forever',
 'frakiana',
 'franki',
 'frankiana',
 'fresh',
 'fun',
 'girls',
 'gusto',
 'idol',
 'interesting',
 'isa',
 'isat',
 'kahit',
 'kanila',
 'kanta',
 'kapag',
 'kayo',
 'kilig',
 'kita',
 'kung',
 'lakas',
 'lalo',
 'lang',
 'lov',
 'love',
 'lovely',
 'lover',
 'loverr',
 'luv',
 'mag',
 'magkasama',
 'makita',
 'malakas',
 'masaya',
 'mellow',
 'mga',
 'millennial',
 'myx',
 'nakakaaliw',
 'nakakakilig',
 'naming',
 'nyc',
 'nyo',
 'pangarap',
 'personal',
 'presence',
 'promote',
 'puso',
 'pwede',
 'regular',
 'rewatch',
 'sana',
 'saturday',
 'screen',
 'shout',
 'sila',
 'smile',
 'sobrang',
 'straight',
 'superstars',
 'swifties',
 'talaga',
 'vjs',
 'watch',
 'wow',
 'wth',
 'yan',
 'yung']

In [24]:
# Term-Topic matrix
# components_ holds one row per topic; transpose it up front so the frame is
# built directly with one row per vocabulary term and one column per topic.
term_loadings = svd_model.components_.T
encoding_matrix = pd.DataFrame(
    term_loadings,
    index=dictionary,
    columns=["topic_1", "topic_2"],
)
encoding_matrix

Unnamed: 0,topic_1,topic_2
aayyyieee,0.1559828586169760,0.0644307365997948
amazing,0.0078219385984458,0.1130820533732904
ang,0.0665511638013911,0.2669432382425440
anong,0.0048703173540813,0.1058942701217250
araw,0.0061135102476403,0.1105555319158239
...,...,...
watch,0.0909311997395240,-0.0314524590923924
wow,0.0078219385984458,0.1130820533732903
wth,0.0418206952405248,-0.0099932693064342
yan,0.0055588136806931,0.1193220591718466
