# LDA Modeling: Finding from which books where a bunch of random pages taken from

In [1]:
import numpy as np

In [2]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [3]:
read_dictionary = np.load('list_of_documents.npy',allow_pickle='TRUE').item()

## I. Step 1: Preprocessing

The following code builds a function which tasks are to lower case the text, remove punctuation, whitespaces and stop words. After that, we will be tokenizing the text for it to be easier to analyse later.

In [4]:
import string
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Add additional words to the stopwords list
stop_words.update(['like', 'saw', 'mr', 'miss', 'mrs', 
                   'sir', 'say', 'come', 'man', 'know', 'says', 'said', 'would', 'look', 'could'])

def preprocess_text(text):
    # lowercase the text
    text = text.lower()
    
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # remove whitespaces
    text = text.strip()

    # tokenize the text
    tokens = text.split()

    # remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # join the tokens back into a string
    text = ' '.join(tokens)

    return text

#### Apply the preprocess_text function to the 'Value' column

Create a list to store the preprocessed values

In [5]:
preprocessed_values = []

Loop over the dictionary items and apply the preprocess_text function to each value

In [6]:
for key, value in read_dictionary.items():
    preprocessed_values.append(preprocess_text(value))

Store the preprocessed values back into the dictionary

In [7]:
read_dictionary['Value'] = preprocessed_values

#### Tokenize each value in the preprocessed_values list

In [8]:
import nltk
nltk.download('punkt')

preprocessed_values = [nltk.word_tokenize(value) for value in preprocessed_values]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kenzasqalli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
punct=["“", "—“", "—", "’", "” “","”", "——"]

for i in preprocessed_values:
    for j in i:
        if j in punct:
            i.remove(j)

In [10]:
preprocessed_values

[['xxxii',
  'one',
  'day',
  'busy',
  'books',
  'pocket',
  'received',
  'note',
  'post',
  'mere',
  'outside',
  'threw',
  'great',
  'flutter',
  'though',
  'never',
  'seen',
  'handwriting',
  'addressed',
  'divined',
  'whose',
  'hand',
  'set',
  'beginning',
  'dear',
  'pip',
  'dear',
  'pip',
  'dear',
  'dear',
  'anything',
  'ran',
  'thus',
  'london',
  'day',
  'morrow',
  'midday',
  'coach',
  'believe',
  'settled',
  'meet',
  'events',
  'havisham',
  'impression',
  'write',
  'obedience',
  'sends',
  'regard',
  'estella',
  'time',
  'probably',
  'ordered',
  'several',
  'suits',
  'clothes',
  'occasion',
  'fain',
  'content',
  'appetite',
  'vanished',
  'instantly',
  'knew',
  'peace',
  'rest',
  'day',
  'arrived',
  'arrival',
  'brought',
  'either',
  'worse',
  'ever',
  'began',
  'haunting',
  'coach',
  'office',
  'wood',
  'street',
  'cheapside',
  'coach',
  'left',
  'blue',
  'boar',
  'town',
  'knew',
  'perfectly',
  'well',

#### Stemming and Lemmatization

* Stemming is the process of reducing words to their base form, typically by removing the suffixes (-ing, -ly, -es, etc.) For example, the stemming algorithm would reduce the words "running," "runner," and "ran" to the root form "run."

* Lemmatization, on the other hand, takes into account the context of the word in the sentence and reduces the word to its root form based on its intended meaning. Lemmatization uses morphological analysis and understands the meaning of the words, so the root form of a word is its lemma or dictionary form. For example, the lemma for the word "running" is "run," and the lemma for the word "better" is "good."

Stemming and Lemmatization are two common methods for reducing words to their base or root form in Natural Language Processing (NLP). These techniques are useful in pre-processing because they can help reduce the dimensionality of the data, eliminate word variability, and improve the interpretability of the text data.

In [11]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Perform stemming on the tokenized values
preprocessed_values = [[stemmer.stem(word) for word in value] for value in preprocessed_values]

# Perform lemmatization on the tokenized values
preprocessed_values = [[lemmatizer.lemmatize(word) for word in value] for value in preprocessed_values]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kenzasqalli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kenzasqalli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### Vectorization

Vectorization is important because it transforms text data into numerical data, which is necessary for many machine learning algorithms to function.

By converting it into numerical data, we can apply mathematical and statistical techniques to analyze and understand the relationships within the data. Vectorization helps to extract features and representations of text data, which can then be used as input to various machine learning models. It allows us to convert words into numerical values in a way that maintains semantic meaning, making it possible to perform tasks like classification, clustering, and regression on text data.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [" ".join(doc) for doc in preprocessed_values]

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the documents
vectorizer.fit(documents)

# Transform the documents into a vectorized representation
vectorized_values = vectorizer.transform(documents)

In [13]:
print(vectorized_values)

  (0, 187)	1
  (0, 189)	2
  (0, 206)	1
  (0, 227)	2
  (0, 247)	2
  (0, 254)	1
  (0, 263)	1
  (0, 267)	2
  (0, 279)	2
  (0, 308)	2
  (0, 326)	1
  (0, 346)	1
  (0, 350)	2
  (0, 370)	1
  (0, 378)	1
  (0, 433)	1
  (0, 442)	1
  (0, 450)	3
  (0, 475)	3
  (0, 528)	3
  (0, 529)	1
  (0, 549)	1
  (0, 554)	1
  (0, 586)	1
  (0, 596)	1
  :	:
  (141, 10721)	1
  (141, 10724)	1
  (141, 10734)	1
  (141, 10739)	2
  (141, 10746)	1
  (141, 10772)	6
  (141, 10773)	5
  (141, 10781)	8
  (141, 10785)	2
  (141, 10787)	3
  (141, 10791)	1
  (141, 10800)	7
  (141, 10802)	2
  (141, 10803)	3
  (141, 10806)	1
  (141, 10817)	1
  (141, 10822)	4
  (141, 10836)	1
  (141, 10842)	6
  (141, 10848)	1
  (141, 10901)	4
  (141, 10902)	4
  (141, 10915)	6
  (141, 10916)	1
  (141, 10924)	2


# II. Step 2: LDA Model 

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

perplexity_scores = []
for n_topics in range(1, 20):
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(vectorized_values)
    perplexity = lda.perplexity(vectorized_values)
    perplexity_scores.append(perplexity)


In [15]:
print(perplexity_scores)

[2912.0015172159133, 2578.1674918856033, 2540.890791004859, 2559.754133494555, 2616.4621902765984, 2614.668382856025, 2578.906064512704, 2587.5425427316936, 2619.911300663646, 2686.676630960147, 2720.4588297281352, 2712.081988926181, 2730.9739219869493, 2764.445621964924, 2843.573891437399, 2858.941851043854, 2826.620819704846, 2824.3334968348495, 2842.101358825661]


In [22]:
from sklearn.decomposition import LatentDirichletAllocation

# Create an instance of LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=3, learning_method='online')

# Fit the LDA model on the vectorized representation of the documents
lda.fit(vectorized_values)

In [23]:
# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Get the topic components from the LDA model
components = lda.components_

# Loop over each topic
for topic_idx, topic in enumerate(components):
    # Print the topic index and the top 10 words for each topic
    print("Topic %d:" % topic_idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))

Topic 0:
captain nautilu sea water nemo ned land one conseil ocean
Topic 1:
one go linton time hand catherin look heathcliff see littl
Topic 2:
joe wemmick wopsl one skiffin time pip jagger look sister
