In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from string import punctuation
import spacy
# you need to run python -m spacy download en
import nltk
from nltk import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import warnings, sys, os
from pathlib import Path


# Put the parent of the current working directory on the import path.
path = Path(os.getcwd())
sys.path.append(str(path.parent))
# Suppress every warning category (this also hides deprecation warnings).
warnings.filterwarnings('ignore')  
# IPython magic: render matplotlib figures inside the notebook.
%matplotlib inline
# Directory holding the input CSV files, relative to the working directory.
data_directory = '../data'


#pd.set_option('display.max_colwidth', -1)

In [2]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop-word list from NLTK.
# NOTE(review): `stop_words` is not referenced in the cells visible here; the
# vectorizers below pass stop_words='english' instead -- confirm whether it is
# still needed.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/zero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the per-group summary table and the cleaned dream reports from the
# configured data directory (both paths now go through `data_directory`).
summary = pd.read_csv(f'{data_directory}/dreamers_summary.csv', sep='|')
dream = pd.read_csv(f'{data_directory}/dreams_clean.csv', sep=';')

# Groups removed from the analysis. The original note describes groups 18, 26
# and 27 as German-language; 79 and 80 are dropped as well.
# NOTE(review): the reason for excluding 79 and 80 is not recorded -- confirm.
excluded_group_ids = [18, 26, 27, 79, 80]

# Keep only dreams that have a word count and do not belong to an excluded
# group. A single boolean mask is used instead of dropna().drop(index): the
# labels passed to drop() were computed on the unfiltered frame, so any
# excluded-group row already removed by dropna() made drop() raise KeyError.
has_words = dream['words'].notna()
is_excluded = dream['group_id'].isin(excluded_group_ids)
dream = dream.loc[has_words & ~is_excluded]


In [4]:
# Attach the group-level summary columns to every dream: inner join of
# dream.group_id against summary.id.
df = dream.merge(summary, left_on='group_id', right_on='id')
df

Unnamed: 0,code,note,description,words,group_id,group,dreamer sex,dreamer age,dream years,numbers of dreams,summary,id,total_words
0,1,1957,"The one at the Meads's house, where it's bigge...",154.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
1,2,8/11/67,I'm at a family reunion in a large fine house ...,248.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
2,3,8/1/85,I watch a plane fly past and shortly realize i...,303.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
3,4,1985?,Me pulling the green leaves and berries off so...,468.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
4,5,1985?,I'm in a room that reminds me of (but definite...,561.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36197,85,"F, age 18",The dream was about me and my boyfriend going ...,138.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0
36198,86,"F, age 18",Two weeks ago this guy asked me to Senior Ball...,96.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0
36199,87,"F, age 18",My boyfriend just broke up with me so he was o...,139.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0
36200,88,"F, age 18",I was in my backyard and I was flying. I would...,104.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0


In [5]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic of a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays; each array must support ``.argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to list per topic.

    Writes one ``Topic #<k>: term term ...`` line per topic to stdout,
    followed by a single blank line. Returns None.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered from heaviest to lightest, cut to the top N.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in ranked)
        print("Topic #%d: " % topic_idx + top_terms)
    print()

# Settings shared by every topic model fitted below.
n_components = 10    # number of topics to extract
n_features = 1000    # passed to the vectorizers as max_features
n_top_words = 20     # terms printed per topic

# Dream texts as a plain Python list, in the row order of `df`.
data_samples = df['description'].tolist()

In [6]:
# Build the tf-idf document-term matrix that the NMF models are fitted on.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
# Only the fit/transform step is timed, not the vectorizer construction.
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Extracting tf-idf features for NMF...
done in 2.910s.


In [7]:
# Build the raw term-count document-term matrix that the LDA model is fitted on.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
# Only the fit/transform step is timed, not the vectorizer construction.
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)
print()

Extracting tf features for LDA...
done in 2.835s.



In [31]:
# Fit an NMF topic model on the tf-idf matrix (no beta_loss is passed, so the
# estimator's default objective is used) and print the top terms per topic.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_features=%d..."
      % (n_features))
t0 = time()
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and later removed in
# favour of alpha_W / alpha_H, which are scaled differently. It is left as-is
# so the fitted topics do not change; it needs re-tuning before upgrading.
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5)

nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when the installed version provides it and fall back
# to the old accessor otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_features=1000...
done in 3.552s.

Topics in NMF model (Frobenius norm):
Topic #0: said asked didn talking told wanted did thought looked came saying gave like called lady later know liked wasn walked
Topic #1: man say says woman room look feel walk want comes tell ask ll don try small talk men come right
Topic #2: like kind going just really little think know thing things stuff didn don dreamt doing laugh woke somebody sound sort
Topic #3: mom ezra dad went shop darren home nana stuff got called movie thing going computer playing weird game phone bed
Topic #4: went got saw came people room started door ran looked told left water didn tried place couldn walked outside took
Topic #5: school eugene calvin class evelyn teacher dmitri darius samantha elijah high ms mr sat year bus people thing photos got
Topic #6: car driving road drive drove got seat cars street parked going stopped stop driver fast truck police hill home parki

In [46]:
# Topic weights for every document: one row per document, one column per topic.
doc_topic_weights = nmf.transform(tfidf_vectorizer.transform(data_samples[:]))

# Dominant topic of a document = last index of the ascending argsort of its
# row, computed for all rows at once instead of in a Python loop.
list_of_topics = list(np.argsort(doc_topic_weights, axis=1)[:, -1])

topics_df = pd.DataFrame(list_of_topics, columns=['topic_number'])
print("--------------------------------------------")
# Number of documents assigned to each topic.
topics_df['topic_number'].value_counts()

--------------------------------------------


1    9073
4    5995
2    4145
8    2817
0    2779
3    2669
5    2460
7    2237
9    2135
6    1892
Name: topic_number, dtype: int64

In [21]:
# Display the document-topic weight matrix for the whole corpus.
nmf.transform(
    tfidf_vectorizer.transform(list(data_samples))
)

array([[2.36610092e-06, 4.45563647e-02, 5.27550149e-03, ...,
        4.46264501e-03, 2.48929040e-09, 4.39284553e-02],
       [3.12010509e-03, 3.49051451e-02, 1.24793038e-05, ...,
        1.80433427e-02, 2.84126956e-03, 5.52729359e-03],
       [1.18032780e-02, 3.56934915e-02, 2.81379710e-03, ...,
        4.40052562e-03, 3.08644873e-02, 2.15654012e-02],
       ...,
       [5.74195291e-03, 3.02004034e-16, 3.65921032e-02, ...,
        2.82812817e-04, 4.14360945e-02, 1.54172270e-16],
       [3.35858508e-10, 2.33888078e-02, 2.02882996e-06, ...,
        9.15451765e-03, 4.29955540e-22, 1.81316670e-07],
       [1.55329426e-02, 3.46480778e-04, 3.83974386e-04, ...,
        5.92851305e-03, 4.95420948e-02, 2.19683570e-06]])

In [42]:
# Topic weights of the first document only.
transformed = nmf.transform(tfidf_vectorizer.transform(data_samples[:1]))
print(transformed)
# Same weights rescaled so that each row sums to 1.
row_totals = transformed.sum(axis=1, keepdims=True)
print(transformed / row_totals)
# Top-topic lookup, currently disabled:
#predicted_topics = [np.argsort(each)[::-1][0] for each in transformed]

[[0.         0.0148101  0.00983771 0.         0.00353218 0.
  0.00136396 0.         0.         0.0282674 ]]
[[0.         0.25617979 0.17016915 0.         0.06109834 0.
  0.02359331 0.         0.         0.48895941]]
0.02826739870685852


9

In [16]:
# Fit an NMF topic model with the generalized Kullback-Leibler objective
# (multiplicative-update solver) on the tf-idf matrix and print the top terms.
# This rebinds `nmf`, replacing any model fitted under that name earlier.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_features=%d..."
      % (n_features))
t0 = time()
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and later removed in
# favour of alpha_W / alpha_H, which are scaled differently. It is left as-is
# so the fitted topics do not change; it needs re-tuning before upgrading.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5)

nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when the installed version provides it and fall back
# to the old accessor otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_features=1000...
done in 12.939s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: said went came didn asked got told left looked wanted room thought saw later like did going took started couldn
Topic #1: man say woman says feel want look comes tell like don walk try come ask large ll small men make
Topic #2: like kind remember dream going just don really know think little dreamt thing woke things didn doing reason wasn time
Topic #3: mom ezra dad shop home stuff car nana got thing darren going lots phone place food playing game house weird
Topic #4: saw dream came dreamed went felt started walking interpretation mother street boy answers looked ran questions away suddenly building running
Topic #5: school eugene calvin class teacher people playing dmitri evelyn high samantha year ms mr game elijah darius classmate went toilet
Topic #6: car driving bus road drive street goin

In [17]:
# Topic weights of the first document under the current `nmf` model.
transformed = nmf.transform(tfidf_vectorizer.transform(data_samples[:1]))
# Rescale each row to sum to 1, then print the row and its total as a check.
normalized = transformed / transformed.sum(axis=1, keepdims=True)
print(normalized)
print(sum(normalized[0]))
# Index of the heaviest topic for each transformed document.
predicted_topics = [np.argsort(row)[-1] for row in transformed]
predicted_topics

[[2.10115258e-05 3.95670536e-01 4.68476377e-02 2.48040615e-04
  1.00993391e-05 6.42771144e-03 1.21051086e-01 3.96292977e-02
  2.21054266e-08 3.90094558e-01]]
1.0


[1]

In [18]:
# Fit an LDA topic model (online learning method, 5 passes) on the raw
# term-count matrix and print the top terms of each topic.
print("Fitting LDA models with tf features, "
      "n_features=%d..."
      % (n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when the installed version provides it and fall back
# to the old accessor otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_features=1000...
done in 42.017s.

Topics in LDA model:
Topic #0: shop bus train wearing calvin clothes shirt pants shoes fat blue bag zombies black gay dress white dmitri lots pair
Topic #1: dog plane samantha cat kids classmate flying fly school evelyn machine animals wally frank ship dogs island dawson cards bench
Topic #2: house mother father friend brother sister old baby home room family truck boy girl uncle living bed wife parents dream
Topic #3: man woman say room says like don want tell look walk people know feel think ask large comes talk men
Topic #4: car street road driving going way building walking drive door hill people police seat got left cars right outside drove
Topic #5: said went got came mom didn like guy saw looked people started told asked room going ezra dad couldn thought
Topic #6: people guy game like gun playing man hit head away shot kill dead killed run black running trying ball ground
Topic #7: dream class school drea

In [19]:
# Topic distribution of the first document under the LDA model.
first_doc_counts = tf_vectorizer.transform(data_samples[:1])
transformed = lda.transform(first_doc_counts)
print(transformed)
# Total of the row, printed as a check.
print(sum(transformed[0]))
# Index of the heaviest topic for each transformed document.
predicted_topics = [np.argsort(row)[-1] for row in transformed]
predicted_topics


[[0.00200053 0.00200014 0.13155248 0.52419233 0.12359263 0.00200035
  0.07003715 0.00200019 0.00200008 0.14062412]]
1.0000000000000002


[3]