# Latent Dirichlet Allocation

In [18]:
# Import libraries
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Load dataset
# The file has no header row, so the single column is named explicitly.
data = pd.read_csv('data', sep=",", header=None).set_axis(['text'], axis=1)
data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


The data is a collection of emails that are not labelled. Let's try to extract topics from them!

## Preprocessing 

👇 You're used to it by now... Clean up! Store the cleaned text in a new dataframe column "clean_text".

In [20]:
# Build "clean_text" from the raw text in one chained pass:
#   1. delete every ASCII punctuation character,
#   2. lowercase the result,
#   3. delete every run of digits.
data["clean_text"] = (
    data['text']
    .str.translate(str.maketrans('', '', string.punctuation))
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
)
# Remove stop words
# Filled on first use so the stop-word corpus is read once, not once per row.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with ``word_tokenize``; tokens that appear in
    NLTK's English stop-word list are dropped and the remaining tokens are
    joined back together with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Space-joined tokens that are not stop words.
    """
    if 'english' not in _STOP_WORDS_CACHE:
        # A frozenset gives constant-time membership tests; the list returned
        # by stopwords.words() would be scanned linearly for every token.
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOP_WORDS_CACHE['english']

    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

# Filter stop words out of every document; the result overwrites "clean_text".
data['clean_text'] = data['clean_text'].apply(remove_stopwords)
# Lemmatize
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its WordNet lemma.

    The text is tokenized with ``word_tokenize``, each token is passed
    through ``WordNetLemmatizer.lemmatize`` and the results are joined back
    together with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join(map(lemmatize, word_tokenize(text)))

# Lemmatize every document; the result overwrites "clean_text".
data['clean_text'] = data['clean_text'].apply(lemmatize_text)

In [21]:
# Display the frame to compare the raw text with its cleaned version.
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,gldcunixbcccolumbiaedu gary l dare subject sta...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,minerkuhubccukansedu subject ancient book orga...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,vzhivovsuperiorcarletonca vladimir zhivov subj...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,jerrybeskimocom jerry kaufman subject prayer a...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,golchowyalchemychemutorontoca gerald olchowy s...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,jaynemmaltguildorg jayne kulikauskas subject q...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,sclarkepasutorontoca susan clark subject pick ...


## Latent Dirichlet Allocation model

👇 Train an LDA model to extract potential topics.

In [22]:
from sklearn.decomposition import LatentDirichletAllocation

In [31]:
# Turn the cleaned documents into a TF-IDF document-term matrix.
# NOTE(review): LDA is formulated over word counts and scikit-learn's own
# examples feed it CountVectorizer output -- confirm TF-IDF weights are intended.
vectorizer = TfidfVectorizer()
data_vectorized = vectorizer.fit_transform(data['clean_text'])

# LDA starts from a random initialisation; pinning the seed makes the
# extracted topics reproducible across "Restart & Run All".
lda_model = LatentDirichletAllocation(n_components=2, random_state=42).fit(data_vectorized)

## Visualize potential topics

👇 The function to print the words associated with the potential topics is already made for you. You just have to pass the correct arguments!

In [32]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` this prints a ``Topic <idx>:``
    header followed by a list of ``(word, weight)`` tuples, ordered from the
    largest weight to the smallest.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, one row per topic.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``, used to map
        column indices back to words.
    n_top_words : int, default 10
        Number of words to print per topic.
    """
    # Look the vocabulary up once; calling get_feature_names_out() inside the
    # comprehension would repeat the lookup for every printed word.
    feature_names = vectorizer.get_feature_names_out()

    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i]) for i in top_indices])
        
# Show the ten highest-weighted words of each topic learned by the fitted LDA model.
print_topics(lda_model, vectorizer)

Topic 0:
[('chi', 2.6685094495086075), ('det', 2.0674976863375814), ('bos', 1.919179264652361), ('pit', 1.8939816719292795), ('buf', 1.6899258831579502), ('cal', 1.638765576776278), ('tor', 1.5347142518262127), ('que', 1.4389783971997983), ('mtl', 1.3937268987685103), ('phi', 1.3898478751231562)]
Topic 1:
[('god', 35.57194016445812), ('game', 26.72206700935704), ('would', 25.95737534746873), ('team', 25.45007461494964), ('one', 24.15667959549748), ('line', 22.907337896952814), ('subject', 22.71345239589592), ('christian', 22.33605603202051), ('organization', 21.805878830091654), ('university', 21.68758293574393)]


## Predict topic of new text

👇 You can now use your LDA model to predict the topic of a new text. First, use your vectorizer to vectorize the example. Then, use your LDA model to predict the topic of the vectorized example.

In [33]:
# Vectorize the example with the already-fitted vectorizer, then let the LDA
# model estimate how strongly the example belongs to each topic.
# NOTE(review): the example is passed in as-is, without the cleaning steps
# applied to the training text -- confirm this is intended.
new_text = ["i love play video game since i was young"]
new_text_vectorized = vectorizer.transform(new_text)
lda_vectors = lda_model.transform(new_text_vectorized)

# One line per topic, whatever the number of topics in the model.
for topic_idx, topic_weight in enumerate(lda_vectors[0]):
    print("topic %d :" % topic_idx, topic_weight)


topic 0 : 0.15537712719337737
topic 1 : 0.8446228728066226
