# Latent Dirichlet Allocation

In [52]:
# Import libraries
import os
import re
import string

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer


In [61]:
# Load dataset. header=None makes pandas treat the first line of the file as
# data, so the single column is labelled 'text' explicitly.
data = pd.read_csv('data', sep=",", header=None).set_axis(['text'], axis=1)
data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


The data is a collection of emails that are not labelled. Let's try to extract topics from them!

## Preprocessing 

👇 You're used to it by now... Clean up! Store the cleaned text in a new dataframe column "clean_text".

In [62]:
# We start by removing e-mail addresses.
# The top-level-domain class is [A-Za-z]: a '|' inside a character class is a
# literal pipe, not an alternation, so it must not appear there.
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def remove_mail(text):
    """Return `text` with every e-mail address removed.

    Parameters
    ----------
    text : str
        Raw text that may contain e-mail addresses.

    Returns
    -------
    str
        `text` with each matched address replaced by an empty string; the
        surrounding whitespace is left untouched.
    """
    return _EMAIL_PATTERN.sub('', text)

# Strip e-mail addresses from the raw text into a new 'clean_text' column
data['clean_text'] = data['text'].map(remove_mail)
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,"From: (Gary L Dare)\nSubject: Stan Fischler, ..."
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,From: (Cardinal Ximenez)\nSubject: Re: The ar...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,From: \nSubject: Re: Ancient Books\nOrganizati...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,From: (Cardinal Ximenez)\nSubject: Atheists a...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,From: (Vladimir Zhivov)\nSubject: Flames Trul...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,From: (Jerry Kaufman)\nSubject: Re: prayers a...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,From: (Gerald Olchowy)\nSubject: Re: If You W...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,From: (Jayne Kulikauskas)\nSubject: quality o...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,From: (Susan Clark)\nSubject: Who picks first...


In [63]:
def cleanning_text(df, column_name):
    """Normalise the text of one DataFrame column in place.

    Three steps are applied in order to `df[column_name]`: ASCII punctuation
    (`string.punctuation`) is removed, the text is lower-cased, and every run
    of digits is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean; it is modified in place.
    column_name : str
        Name of the string column to clean.

    Returns
    -------
    None
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    # Use the requested column rather than a hard-coded 'clean_text'
    cleaned = df[column_name].str.translate(punctuation_table)
    cleaned = cleaned.str.lower()
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    df[column_name] = cleaned
    
# Normalise the 'clean_text' column, then display the frame
cleanning_text(df=data, column_name='clean_text')
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,from gary l dare\nsubject stan fischler \nsum...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,from cardinal ximenez\nsubject re the arrogan...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,from \nsubject re ancient books\norganization ...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,from cardinal ximenez\nsubject atheists and h...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,from vladimir zhivov\nsubject flames truly br...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,from jerry kaufman\nsubject re prayers and ad...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,from gerald olchowy\nsubject re if you were p...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,from jayne kulikauskas\nsubject quality of ca...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,from susan clark\nsubject who picks first\nor...


In [64]:
# Cache of stop-word sets, filled on first use so the list is built once
# instead of once per document.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is tokenized with `word_tokenize`; tokens present in
    `stopwords.words('english')` are dropped and the remaining tokens are
    joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        # A frozenset gives constant-time membership tests, unlike the list
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

# Drop English stop words from every cleaned document
data['clean_text'] = data['clean_text'].map(remove_stopwords)

def lemmatize_text(text):
    """Lemmatize every token of `text` and return them joined by single spaces.

    The text is split with `word_tokenize` and each token is passed through
    `WordNetLemmatizer.lemmatize` with its default arguments.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wnl.lemmatize(token))
    return " ".join(lemmas)

# Replace every token of the cleaned documents by its lemma
data['clean_text'] = data['clean_text'].map(lemmatize_text)

## Latent Dirichlet Allocation model

👇 Train an LDA model to extract potential topics.

In [65]:
# Fit the vectorizer and vectorize the cleaned documents in one pass; the fitted
# vectorizer is kept because later cells reuse it on new text.
vectorizer = TfidfVectorizer()
data_vectorized = vectorizer.fit_transform(data['clean_text'])

# random_state pins the model's random initialisation so the extracted topics
# are the same on every run.
# NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer);
# TF-IDF weights are kept here to stay close to the original - confirm intent.
lda_model = LatentDirichletAllocation(n_components=3, random_state=42).fit(data_vectorized)

## Visualize potential topics

👇 The function to print the words associated with the potential topics is already made for you. You just have to pass the correct arguments!

In [66]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic of a fitted model.

    For each row of `model.components_` a "Topic <idx>:" line is printed,
    followed by a list of `(word, weight)` tuples sorted by decreasing weight.

    Parameters
    ----------
    model : object
        Fitted model exposing `components_`, iterated one row per topic;
        each row must support `argsort()` and integer indexing.
    vectorizer : object
        Fitted vectorizer exposing `get_feature_names_out()`; the returned
        names are indexed with the positions taken from each topic row.
    n_top_words : int, default 10
        Number of words printed per topic.

    Returns
    -------
    None
    """
    # Fetch the vocabulary once instead of once per printed word
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed tail holds the largest weights
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i]) for i in top_indices])
        
# Show the top words of each topic found by the fitted LDA model
print_topics(model=lda_model, vectorizer=vectorizer)

Topic 0:
[('chi', 3.1156675388893116), ('det', 2.4602528123836698), ('bos', 2.4074926378698027), ('cal', 2.259741459932837), ('pit', 1.9683514427461555), ('buf', 1.8196143303836076), ('tor', 1.6799844335908265), ('pitt', 1.6425889817195267), ('que', 1.513827505930599), ('howl', 1.499809133633879)]
Topic 1:
[('colon', 1.5840097290729214), ('testing', 1.5028500593861436), ('finalswho', 1.2958009269330877), ('statemaine', 1.2958009269215156), ('holger', 1.2762750513916064), ('finalswinner', 1.2079691957198726), ('tennessee', 1.0484661962480024), ('rfl', 1.0484661962480017), ('singapore', 1.026188873997567), ('ohlwein', 1.0202204328574698)]
Topic 2:
[('god', 35.97529628701285), ('game', 27.256114341175987), ('would', 26.323827070382105), ('team', 25.83993565386872), ('one', 24.438249212382697), ('line', 23.519485530952966), ('subject', 23.331137156674544), ('christian', 22.560833050126604), ('organization', 22.380166395060044), ('university', 22.22939473776321)]


## Predict topic of new text

👇 You can now use your LDA model to predict the topic of a new text. First, use your vectorizer to vectorize the example. Then, use your LDA model to predict the topic of the vectorized example.

In [67]:
# Vectorize the example with the already-fitted vectorizer, then let the LDA
# model compute its topic mixture.
# NOTE(review): the example is passed to the vectorizer as-is, without the
# cleaning steps applied to the training column - confirm that is intended.
new_text = ["i love play video game since i was young"]
new_text_vectorized = vectorizer.transform(new_text)
lda_vectors = lda_model.transform(new_text_vectorized)

# One line per topic, however many components the model was trained with
for topic_idx, topic_weight in enumerate(lda_vectors[0]):
    print("topic {} :".format(topic_idx), topic_weight)


topic 0 : 0.1009225064191745
topic 1 : 0.10092578996849774
topic 2 : 0.7981517036123278


# Analyse with Azure Cognitive Services

In [56]:
# The key is read from the environment so the secret never lives in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be regenerated.
cog_key = os.environ.get('COG_KEY')
if not cog_key:
    raise RuntimeError('Set the COG_KEY environment variable to your Cognitive Services key.')

# The endpoint can be overridden through COG_ENDPOINT; the previous value is the default.
cog_endpoint = os.environ.get(
    'COG_ENDPOINT',
    'https://azure-ml-ai900-justine-31012023.cognitiveservices.azure.com/',
)

# Only the endpoint is echoed: printing the key would store it in the cell output.
print('Ready to use cognitive services at {}'.format(cog_endpoint))

Ready to use cognitive services at https://azure-ml-ai900-justine-31012023.cognitiveservices.azure.com/ using key <redacted>


In [57]:
# Build the collection of reviews from the first five cleaned documents;
# each entry carries an "id" and the "text" to analyse.
reviews = [{"id": idx, "text": data['clean_text'][idx]} for idx in range(5)]

# Print each review id followed by its text
for review in reviews:
    print('{}\n{}\n'.format(review['id'], review['text']))

0
gary l dare subject stan fischler summary devil pregame show prior hosting penguin nntppostinghost cunixbcccolumbiaedu replyto gary l dare organization phd hall line lester patrick award lunch bill torrey mentioned one option next season president miami team bob clarke working dinner clarke said worst mistake philadelphia letting mike keenan go retrospect almost player came realize keenan knew took win rumour circulating keenan back flyer nick polano sick scapegoat schedule made red wing bryan murray approved gerry meehan john muckler worried sabre prospect assistant lever say sabre get share quebec dynasty emerging mighty duck declared throw money around loosely buy team oiler coach ted green remarked guy around fill tie domis skate none fill helmet senator andrew mcbain told security guard chicago stadium warned stair leading locker room mcbain mouthed seasoned professional tumbled entire steep flight gld je souviens gary l dare go winnipeg jet go selanne domi stanley

1
cardinal x

## Detect language

In [58]:
# Get a client for the text analytics cognitive service resource.
# NOTE(review): TextAnalyticsClient and CognitiveServicesCredentials are not
# imported anywhere in the visible notebook, so this cell raises NameError on a
# fresh kernel. Presumably they come from
# azure.cognitiveservices.language.textanalytics and msrest.authentication
# respectively - confirm and add them to the imports cell.
text_analytics_client = TextAnalyticsClient(
    endpoint=cog_endpoint,
    credentials=CognitiveServicesCredentials(cog_key),
)

# Submit the `reviews` collection built from the dataframe for language detection
language_analysis = text_analytics_client.detect_language(documents=reviews)

# Print the detected language details for each review
for review_num, review in enumerate(reviews):
    print(review['id'])

    # Only the first detected language of the matching result document is used
    lang = language_analysis.documents[review_num].detected_languages[0]
    print(' - Language: {}\n - Code: {}\n - Score: {}\n'.format(lang.name, lang.iso6391_name, lang.score))

    # Store the language code on the review so later analyses can use it
    review["language"] = lang.iso6391_name

0
 - Language: English
 - Code: en
 - Score: 0.9864864945411682

1
 - Language: English
 - Code: en
 - Score: 1.0

2
 - Language: English
 - Code: en
 - Score: 1.0

3
 - Language: English
 - Code: en
 - Score: 1.0

4
 - Language: English
 - Code: en
 - Score: 0.9785714149475098



## Extract Key Phrases

In [59]:
# Ask the existing client for the key phrases of every review
key_phrase_analysis = text_analytics_client.key_phrases(documents=reviews)

# Print the id of each review followed by its key phrases, one per line
for review_num, review in enumerate(reviews):
    print(review['id'])

    print('\nKey Phrases:')
    for key_phrase in key_phrase_analysis.documents[review_num].key_phrases:
        print('\t', key_phrase)
    print('\n')

0

Key Phrases:
	 mike keenan
	 cunixbcccolumbiaedu replyto gary
	 dinner clarke
	 season president miami team bob clarke
	 helmet senator andrew mcbain
	 stair leading locker room mcbain
	 dare organization phd hall line lester patrick award lunch bill torrey
	 team oiler coach ted green remarked guy
	 red wing bryan murray approved gerry meehan john muckler worried sabre prospect assistant lever
	 security guard chicago stadium
	 seasoned professional tumbled entire steep flight gld je
	 prior hosting penguin
	 flyer nick polano sick scapegoat schedule
	 option
	 tie domis skate
	 share quebec dynasty
	 subject stan fischler summary devil pregame
	 worst mistake philadelphia
	 throw money
	 mighty duck
	 winnipeg jet
	 retrospect
	 win rumour
	 player
	 selanne domi stanley


1

Key Phrases:
	 question god
	 use faith
	 picture god
	 logical argument nonexistence god id
	 ability reason
	 selfawareness reason
	 true athiests position proof existence god
	 right science reason
	 ignor

In [60]:
# Ask the existing client for a sentiment score for every review
sentiment_analysis = text_analytics_client.sentiment(documents=reviews)

# Print one "<id> : <label> (<score>)" line per review
for review_num, review in enumerate(reviews):
    sentiment_score = sentiment_analysis.documents[review_num].score

    # Scores below 0.5 are labelled negative; 0.5 and above are positive
    sentiment = 'negative' if sentiment_score < 0.5 else 'positive'

    print('{} : {} ({})'.format(review['id'], sentiment, sentiment_score))

0 : positive (0.9802271127700806)
1 : positive (0.9999222755432129)
2 : negative (0.13061869144439697)
3 : negative (0.0009417235851287842)
4 : negative (0.00018808245658874512)
