## Imports


In [None]:
import pandas as pd
import numpy as np
!pip install pdfplumber pandas

Collecting pdfplumber
  Downloading pdfplumber-0.10.2-py3-none-any.whl (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.5/47.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20221105 (from pdfplumber)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.19.0-py3-none-manylinux_2_17_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20221105 pdfplumber-0.10.2 pypdfium2-4.19.0


## Data — Book: *Choose Your Enemies Wisely: Business Planning for the Audacious Few*

Book by Patrick Bet-David

In [None]:
import pdfplumber
import pandas as pd

pdf_path = '/content/nmy.pdf'

# Read the book page by page; every page becomes one row (one document) of `df`.
with pdfplumber.open(pdf_path) as pdf:
    # Iterate the pages directly instead of indexing by page number.
    # NOTE(review): some pdfplumber releases are reported to return None (not '')
    # for a page without extractable text — coerce to '' so every row is a string.
    data = [{'Text': page.extract_text() or ''} for page in pdf.pages]

# Naming the column explicitly keeps a 'Text' column even if the PDF has no pages.
df = pd.DataFrame(data, columns=['Text'])
print(df)


                                                  Text
0                                                     
1    Thank you for downloading\nthis Simon & Schust...
2                                                     
3    To my father, Gabreal Bet-David, the Aristotle...
4    Author’s Note\nI tell stories that go back mor...
..                                                 ...
275  About the Author\nPatrick Bet-David went from ...
276                SimonandSchuster.com\n@GalleryBooks
277  We hope you enjoyed\nreading this Simon &\nSch...
278  Gallery Books\nAn Imprint of Simon & Schuster,...
279  Classi cation: LCC HD30.28 .B4585 2020 (print)...

[280 rows x 1 columns]


## Data Processing

In [None]:
# Drop pages whose text is missing (NaN/None). axis=0 targets rows (pages);
# axis=1 would only remove columns that are entirely NaN and could never drop
# an empty page. Rebinding `df` instead of using inplace=True keeps the cell
# safe to re-run.
df = df.dropna(axis=0, how='all')


In [None]:
# Keep only pages that contain something other than whitespace.
# - The vectorised .str accessor replaces a Python-level row-by-row apply.
# - fillna('') makes missing text count as blank; otherwise NaN != '' is True
#   and such rows would survive the filter.
# - .copy() detaches the result from the original frame so later column
#   assignments do not operate on a slice (SettingWithCopyWarning).
df = df[df['Text'].fillna('').str.strip().ne('')].copy()


In [None]:
# Display the frame after the blank-page filter (pandas elides the middle rows).
df

Unnamed: 0,Text
1,Thank you for downloading\nthis Simon & Schust...
3,"To my father, Gabreal Bet-David, the Aristotle..."
4,Author’s Note\nI tell stories that go back mor...
5,Introduction\nBefore Your First Move\nWhen I �...
6,prepared to launch another series of moves bas...
...,...
275,About the Author\nPatrick Bet-David went from ...
276,SimonandSchuster.com\n@GalleryBooks
277,We hope you enjoyed\nreading this Simon &\nSch...
278,"Gallery Books\nAn Imprint of Simon & Schuster,..."


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string


# Tokenizer model and stop-word list needed by word_tokenize / stopwords below.
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def _preprocess_text(text):
    """Normalise one page of text for bag-of-words modelling.

    Steps, in order: lowercase, tokenize with NLTK, drop tokens that contain
    no letter or digit, drop English stop words, Porter-stem the remaining
    tokens, and join them back into a single space-separated string.

    Args:
        text: The raw page text (a ``str``).

    Returns:
        The cleaned, stemmed text as one space-separated string.
    """
    tokens = word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        # `token not in string.punctuation` is a substring test against one long
        # string, so tokens such as '’', '``' or "''" were never removed.
        # Requiring at least one alphanumeric character removes every
        # punctuation-only token, including non-ASCII quotes.
        if any(ch.isalnum() for ch in token) and token not in stop_words
    ]
    return ' '.join(stemmer.stem(token) for token in kept)


# Work on an explicit copy so the column assignment never targets a slice of
# an earlier frame.
df = df.copy()
# NOTE: this overwrites the raw text in place; re-running the cell would stem
# already-stemmed text, so re-run the loading cells first if needed.
df['Text'] = df['Text'].apply(_preprocess_text)

print(df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                                  Text
1    thank download simon schuster ebook get free e...
3               father gabreal bet-david aristotl life
4    author ’ note tell stori go back thirti year b...
5    introduct first move  rst watch magnu document...
6    prepar launch anoth seri move base market comp...
..                                                 ...
275  author patrick bet-david went escap war-torn i...
276                   simonandschuster.com gallerybook
277  hope enjoy read simon schuster ebook get free ...
278  galleri book imprint simon schuster inc. 1230 ...
279  classi c lcc hd30.28 .b4585 2020 print lcc hd3...

[274 rows x 1 columns]


In [None]:
# Display the frame again after the preprocessing cell.
# NOTE(review): the saved output under this cell still shows the original,
# un-stemmed text, so it was presumably captured before the preprocessing
# cell ran — re-run the notebook top to bottom to confirm.
df

Unnamed: 0,Text
1,Thank you for downloading\nthis Simon & Schust...
3,"To my father, Gabreal Bet-David, the Aristotle..."
4,Author’s Note\nI tell stories that go back mor...
5,Introduction\nBefore Your First Move\nWhen I �...
6,prepared to launch another series of moves bas...
...,...
275,About the Author\nPatrick Bet-David went from ...
276,SimonandSchuster.com\n@GalleryBooks
277,We hope you enjoyed\nreading this Simon &\nSch...
278,"Gallery Books\nAn Imprint of Simon & Schuster,..."


## Topic Modelling with Our Data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Build the bag-of-words document-term matrix, one row per page.
# max_df / min_df prune terms by document frequency (see the CountVectorizer
# docs); stop_words='english' applies scikit-learn's built-in stop list.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
dtm = vectorizer.fit_transform(df['Text'])


In [None]:
# Inspect the document-term matrix; its repr reports shape, dtype and sparse format.
dtm

<274x3583 sparse matrix of type '<class 'numpy.int64'>'
	with 27088 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract from the document-term matrix.
num_topics = 50

# A fixed random_state makes the fitted topics repeatable across runs.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=num_topics,
)

lda.fit(dtm)


In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``.argsort()``).
        feature_names: Indexable collection mapping a column index to its word.
        n_top_words: How many of the heaviest words to list per topic.

    Prints one line per topic, formatted as ``Topic #<k>: w1 w2 ...`` with
    topics numbered from 1 and words ordered from heaviest to lightest.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = " ".join(feature_names[idx] for idx in ranked)
        print(f"Topic #{number}: {words}")

# Vocabulary learned by the vectorizer; position i names column i of the
# document-term matrix.
feature_names = vectorizer.get_feature_names_out()

# Number of words to list for each topic.
n_top_words = 10

print_top_words(lda, feature_names, n_top_words)


Topic #1: insurance people narrative don nancial things competitors time result sell
Topic #2: company make business going time people need identify making want
Topic #3: want people ve moves business know need time life don
Topic #4: recognition need praise pressure don plaque ayrton senna knowing erent
Topic #5: time speed make business day week ve want going don
Topic #6: make time right processing people pat result check big going
Topic #7: said like people want rogan erent di truth life future
Topic #8: 000 people cost savings year policies business time help project
Topic #9: father michael ebook think meeting schuster click sign simon life
Topic #10: trouble ectively systems speed manage business dangerous chaos build world
Topic #11: investment ask company phase start need want ce ma started
Topic #12: people make business better don life person burry trust want
Topic #13: lived deal power options customer leverage like iran don dad
Topic #14: business people systems time make 