In [121]:
import nltk
import spacy
import requests
from bs4 import BeautifulSoup
import urllib.parse

In [122]:
import re

# Recommended Steps:

# 1-) Web Scraping
# 2-) Lowercasing
# 3-) Remove Punctuation
# 4-) Remove Stopwords
# 5-) Tokenize the text
# 6-) Lemmatize
# 7-) Customize the expressions
# 8-) Convert string to list (Optional) 
#
#


# Web Scraping 

In [123]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

def webden_texti_cek(base_url):
    response = requests.get(base_url)
    html_icerigi = response.content
    soup = BeautifulSoup(html_icerigi, "html.parser")
    
    main_page_texts = [p.text for p in soup.find_all("p")] #ana sayfadaki veriler
    
    urls = [i.get("href") for i in soup.find_all("a") if i.get("href")]
    normalized_urls = [urllib.parse.urljoin(base_url, link).rstrip('/') for link in urls if urllib.parse.urljoin(base_url, link).startswith(base_url)]
    filtered_urls = list(set(normalized_urls))  # Set ile unique eleman
    
    all_texts = []
    
    for link in filtered_urls:
        try:
            response = requests.get(link)
            html_icerigi = response.content
            soup = BeautifulSoup(html_icerigi, "html.parser")
            page_texts = [p.text for p in soup.find_all("p")]

            all_texts.extend(page_texts)  # <p> etiketlerindeki metinleri düz listeye ekle
        except requests.RequestException as e:
            print(f"URL'e erişilemedi: {link}")
            print(e)
        
    all_texts.extend(main_page_texts)
    all_texts_combined = ' '.join(all_texts)
    
    return all_texts_combined




In [49]:
digieye_textim = webden_texti_cek("https://digieye.ai/")

# Lowercasing

In [124]:
def lowercasing(text):
    lower_text = text.lower()
    return lower_text

print(lowercasing(digieye_textim))

our valuable customer & partners our valuable customer & partners contact us to seek answers for your product-related questions or schedule demos. explore job possibilities and join our team to help us achieve it. great people deserve great benefits continuous learning, mentorship, career growth flexibility, remote work for well-being you know what you need to do and when it needs to be it’s a philosophy of freely sharing information in the workplace in a way that benefits the organization and its people. hybrid | istanbul, turkey hybrid | istanbul, turkey the powerful cpu has 1 us cycle time. also it has ethernet/ip communication for data collecting. io-box exp-an module has 24 isolated analog inputs. io-box exp-io module has 16 isolated digital inputs and 16 isolated digital output. after training the model on digilab easily, just needed to click the “upload” button to run the real-time inferencing on dl-box that is edge ai device. dl-box is fully integrated with the no-code ai platf

# Removing Punctuation

In [125]:
import string
exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))


print(remove_punc(digieye_textim))

Our Valuable Customer  Partners Our Valuable Customer  Partners Contact us to seek answers for your productrelated questions or schedule demos Explore job possibilities and join our team to help us achieve it Great people deserve great benefits Continuous learning mentorship career growth Flexibility remote work for wellbeing You know what you need to do and when it needs to be It’s a philosophy of freely sharing information in the workplace in a way that benefits the organization and its people Hybrid  Istanbul Turkey Hybrid  Istanbul Turkey The powerful cpu has 1 us cycle time Also it has EthernetIP communication for data collecting IOBox ExpAN module has 24 isolated analog inputs IOBox ExpIO module has 16 isolated digital inputs and 16 isolated digital output After training the model on DigiLab easily just needed to click the “Upload” button to run the realtime inferencing on DLBox that is edge AI device DLBox is fully integrated with the nocode AI platform DigiLab DLaaS Deep Learni

# Removing Stop Words

In [126]:
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords_english:
            continue
        else:
            new_text.append(word)

    return ' '.join(new_text)


remove_stopwords(digieye_textim) #Contact us to seek answers -> Contact us seek answers

'Our Valuable Customer & Partners Our Valuable Customer & Partners Contact us seek answers product-related questions schedule demos. Explore job possibilities join team help us achieve it. Great people deserve great benefits Continuous learning, mentorship, career growth Flexibility, remote work well-being You know need needs It’s philosophy freely sharing information workplace way benefits organization people. Hybrid | Istanbul, Turkey Hybrid | Istanbul, Turkey The powerful cpu 1 us cycle time. Also Ethernet/IP communication data collecting. IO-Box Exp-AN module 24 isolated analog inputs. IO-Box Exp-IO module 16 isolated digital inputs 16 isolated digital output. After training model DigiLab easily, needed click “Upload” button run real-time inferencing DL-Box edge AI device. DL-Box fully integrated no-code AI platform DigiLab. DLaaS (Deep Learning Service) new service methods AI tasks. DL-Box used API service server devices. DL-Box “Edge AI Device” equipped powerful Nvidia GPUs CPUs 

# Tokenization

In [127]:
import spacy 
nlp = spacy.load("en_core_web_sm")


def Tokenize_Text(text):
    tokenized_words = []
    doc = nlp(text)
    for token in doc:
        tokenized_words.append(token.text)
    return ' '.join(tokenized_words)

Tokenize_Text(digieye_textim)

'Our Valuable Customer & Partners Our Valuable Customer & Partners Contact us to seek answers for your product - related questions or schedule demos . Explore job possibilities and join our team to help us achieve it . Great people deserve great benefits Continuous learning , mentorship , career growth Flexibility , remote work for \u2028 well - being You know what you need to do and when it needs to be It ’s a philosophy of freely sharing information in the workplace in a way that benefits the organization and its people . Hybrid | \xa0 Istanbul , Turkey Hybrid | \xa0 Istanbul , Turkey The powerful cpu has 1 us cycle time . Also it has Ethernet / IP communication for data collecting . IO - Box Exp - AN module has 24 isolated analog inputs . IO - Box Exp - IO module has 16 isolated digital inputs and 16 isolated digital output . After training the model on DigiLab easily , just needed to click the “ Upload ” button to run the real - time inferencing on DL - Box that is edge AI device .

# Lemmatizing

In [128]:
import spacy 
nlp = spacy.load("en_core_web_sm")


def Lemmatize(text):
    lemmatized_words = []
    doc = nlp(text)
    for token in doc:
        lemmatized_words.append(token.lemma_)
    return ' '.join(lemmatized_words)
        
Lemmatize(digieye_textim)

'our Valuable Customer & Partners our Valuable Customer & Partners contact we to seek answer for your product - relate question or schedule demo . explore job possibility and join our team to help we achieve it . great people deserve great benefit Continuous learning , mentorship , career growth flexibility , remote work for \u2028 well - being you know what you need to do and when it need to be it ’ a philosophy of freely share information in the workplace in a way that benefit the organization and its people . hybrid | \xa0 Istanbul , Turkey Hybrid | \xa0 Istanbul , Turkey the powerful cpu have 1 us cycle time . also it have Ethernet / IP communication for datum collect . IO - Box Exp - AN module have 24 isolated analog input . IO - Box Exp - IO module have 16 isolate digital input and 16 isolate digital output . after train the model on DigiLab easily , just need to click the " Upload " button to run the real - time inference on DL - Box that be edge AI device . DL - Box be fully in

# Stemming

In [129]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def apply_stemming(text):
    new_text = [ps.stem(word) for word in text.split()]
    return ' '.join(new_text)


apply_stemming(digieye_textim)  # Lemmatizing gives better output than Stemming

'our valuabl custom & partner our valuabl custom & partner contact us to seek answer for your product-rel question or schedul demos. explor job possibl and join our team to help us achiev it. great peopl deserv great benefit continu learning, mentorship, career growth flexibility, remot work for well-b you know what you need to do and when it need to be it’ a philosophi of freeli share inform in the workplac in a way that benefit the organ and it people. hybrid | istanbul, turkey hybrid | istanbul, turkey the power cpu ha 1 us cycl time. also it ha ethernet/ip commun for data collecting. io-box exp-an modul ha 24 isol analog inputs. io-box exp-io modul ha 16 isol digit input and 16 isol digit output. after train the model on digilab easily, just need to click the “upload” button to run the real-tim inferenc on dl-box that is edg ai device. dl-box is fulli integr with the no-cod ai platform digilab. dlaa (deep learn as a service) is a new servic method for ai tasks. dl-box can be use as

# Customization

In [130]:
import spacy

nlp = spacy.load("en_core_web_sm") #Spacy language model

def add_custom_lemmas(nlp, word_mappings, target_lemma):
    
    # nlp-> Spacy language model object
    # word_mappings (list) -> word list for lemmatizing
    # target lemma (str) -> last word after lemmatizing
    
    ar = nlp.get_pipe('attribute_ruler')
    
    for word in word_mappings:
        ar.add([[{"TEXT": word}]], {"LEMMA": target_lemma})


word_mappings = ["Bro", "Brah", "Broski"]
target_lemma = "Brother"


add_custom_lemmas(nlp, word_mappings, target_lemma)

# Test edelim
doc = nlp("Yo, what's up Bro? How's it going Brah and Broski?")
for token in doc:
    print(f"{token.text} -> {token.lemma_}")

Yo -> Yo
, -> ,
what -> what
's -> be
up -> up
Bro -> Brother
? -> ?
How -> how
's -> be
it -> it
going -> go
Brah -> Brother
and -> and
Broski -> Brother
? -> ?


In [13]:
print(webden_texti_cek("https://ritimus.com/")) #Zaim teknoparktaki başka bir şirket

Aileler olarak çoğu zaman çocuklarımız tarafından teknolojik aletler başta olmak üzere birçok alışveriş baskısı altında kalabiliyoruz. Bu baskıyla almak durumunda kaldığımız ürünlerin başında akıllı saatler geliyor.   Peki ne işe yarar bu akıllı saatler, bize bir faydası var mı? Teknolojik imkanları doğru kullandığımız takdirde bize hizmet eden ürünlere dönüşür.  RitimUS.com ve sosyal medya üzerinden bizi takip edebilir, eğitim içeriklerimize ulaşabilirsiniz.   Daha sonraki yorumlarımda kullanılması için adım, e-posta adresim ve site adresim bu tarayıcıya kaydedilsin. Eğitici zeka oyunları RitimUS’u Google Play ve App Store’dan ücretsiz bir şekilde indirip oynamaya başlayın.  Matematiksel Zeka Oyunları  ile rakamların dünyasına yolculuk yapıyor, matematiğin sevilmeyen bir ders olma tabusunu oyunlaştırma stratejisi ile değiştiriyoruz.  Soyut işlemler içerisinde yer alan matematiğin RitimUS oyunlarıyla somut hale getirilmesi hiç şüphesiz bu öğrenmeyi kolaylaştırmaktadır. Görsel Zeka Oyun

In [131]:
digieye_textim = webden_texti_cek("https://digieye.ai/")
ritimus_textim = webden_texti_cek("https://ritimus.com/")

In [132]:
from nltk.tokenize import sent_tokenize

sent_tokenize(digieye_textim) 

['Our Valuable Customer & Partners Our Valuable Customer & Partners Contact us to seek answers for your product-related questions or schedule demos.',
 'Explore job possibilities and join our team to help us achieve it.',
 'Great people deserve great benefits Continuous learning, mentorship, career growth Flexibility, remote work for\u2028well-being You know what you need to do and when it needs to be It’s a philosophy of freely sharing information in the workplace in a way that benefits the organization and its people.',
 'Hybrid |\xa0Istanbul, Turkey Hybrid |\xa0Istanbul, Turkey The powerful cpu has 1 us cycle time.',
 'Also it has Ethernet/IP communication for data collecting.',
 'IO-Box Exp-AN module has 24 isolated analog inputs.',
 'IO-Box Exp-IO module has 16 isolated digital inputs and 16 isolated digital output.',
 'After training the model on DigiLab easily, just needed to click the “Upload” button to run the real-time inferencing on DL-Box that is edge AI device.',
 'DL-Box 

# Appliying All Steps To Our Data

In [133]:
# Web Scraping

digieyeTextim = webden_texti_cek("https://digieye.ai/")
digieyeTextimForTesting = webden_texti_cek("https://digieye.ai/")

In [134]:
# Lowercasing 

digieyeTextimForTesting = lowercasing(digieyeTextimForTesting) 
digieyeTextimForTesting

'our valuable customer & partners our valuable customer & partners contact us to seek answers for your product-related questions or schedule demos. explore job possibilities and join our team to help us achieve it. great people deserve great benefits continuous learning, mentorship, career growth flexibility, remote work for\u2028well-being you know what you need to do and when it needs to be it’s a philosophy of freely sharing information in the workplace in a way that benefits the organization and its people. hybrid |\xa0istanbul, turkey hybrid |\xa0istanbul, turkey the powerful cpu has 1 us cycle time. also it has ethernet/ip communication for data collecting. io-box exp-an module has 24 isolated analog inputs. io-box exp-io module has 16 isolated digital inputs and 16 isolated digital output. after training the model on digilab easily, just needed to click the “upload” button to run the real-time inferencing on dl-box that is edge ai device. dl-box is fully integrated with the no-c

In [135]:
# Removing Punctuation

digieyeTextimForTesting = remove_punc(digieyeTextimForTesting)
digieyeTextimForTesting

'our valuable customer  partners our valuable customer  partners contact us to seek answers for your productrelated questions or schedule demos explore job possibilities and join our team to help us achieve it great people deserve great benefits continuous learning mentorship career growth flexibility remote work for\u2028wellbeing you know what you need to do and when it needs to be it’s a philosophy of freely sharing information in the workplace in a way that benefits the organization and its people hybrid \xa0istanbul turkey hybrid \xa0istanbul turkey the powerful cpu has 1 us cycle time also it has ethernetip communication for data collecting iobox expan module has 24 isolated analog inputs iobox expio module has 16 isolated digital inputs and 16 isolated digital output after training the model on digilab easily just needed to click the “upload” button to run the realtime inferencing on dlbox that is edge ai device dlbox is fully integrated with the nocode ai platform digilab dlaas

In [136]:
# Removing Stopwords

digieyeTextimForTesting = remove_stopwords(digieyeTextimForTesting)
digieyeTextimForTesting

'valuable customer partners valuable customer partners contact us seek answers productrelated questions schedule demos explore job possibilities join team help us achieve great people deserve great benefits continuous learning mentorship career growth flexibility remote work wellbeing know need needs it’s philosophy freely sharing information workplace way benefits organization people hybrid istanbul turkey hybrid istanbul turkey powerful cpu 1 us cycle time also ethernetip communication data collecting iobox expan module 24 isolated analog inputs iobox expio module 16 isolated digital inputs 16 isolated digital output training model digilab easily needed click “upload” button run realtime inferencing dlbox edge ai device dlbox fully integrated nocode ai platform digilab dlaas deep learning service new service methods ai tasks dlbox used api service server devices dlbox “edge ai device” equipped powerful nvidia gpus cpus allow run image processing artificial intelligence applications r

In [137]:
# Tokenization

digieyeTextimForTesting = Tokenize_Text(digieyeTextimForTesting)
digieyeTextimForTesting

'valuable customer partners valuable customer partners contact us seek answers productrelated questions schedule demos explore job possibilities join team help us achieve great people deserve great benefits continuous learning mentorship career growth flexibility remote work wellbeing know need needs it ’s philosophy freely sharing information workplace way benefits organization people hybrid istanbul turkey hybrid istanbul turkey powerful cpu 1 us cycle time also ethernetip communication data collecting iobox expan module 24 isolated analog inputs iobox expio module 16 isolated digital inputs 16 isolated digital output training model digilab easily needed click “ upload ” button run realtime inferencing dlbox edge ai device dlbox fully integrated nocode ai platform digilab dlaas deep learning service new service methods ai tasks dlbox used api service server devices dlbox “ edge ai device ” equipped powerful nvidia gpus cpus allow run image processing artificial intelligence applicati

In [138]:
# Lemmatization

digieyeTextimForTesting = Lemmatize(digieyeTextimForTesting)
digieyeTextimForTesting

'valuable customer partner valuable customer partner contact we seek answer productrelate question schedule demos explore job possibility join team help we achieve great people deserve great benefit continuous learn mentorship career growth flexibility remote work wellbeing know need need it ’ philosophy freely share information workplace way benefit organization people hybrid istanbul turkey hybrid istanbul turkey powerful cpu 1 us cycle time also ethernetip communication datum collect iobox expan module 24 isolated analog input iobox expio module 16 isolate digital input 16 isolate digital output training model digilab easily need click " upload " button run realtime inference dlbox edge ai device dlbox fully integrate nocode ai platform digilab dlaas deep learn service new service method ai task dlbox use api service server device dlbox " edge ai device " equip powerful nvidia gpus cpus allow run image process artificial intelligence application realtime digieye found powered knowle

In [139]:
#Stringi Listeye Çevirmek

words_list = digieyeTextimForTesting.split()
words_list

['valuable',
 'customer',
 'partner',
 'valuable',
 'customer',
 'partner',
 'contact',
 'we',
 'seek',
 'answer',
 'productrelate',
 'question',
 'schedule',
 'demos',
 'explore',
 'job',
 'possibility',
 'join',
 'team',
 'help',
 'we',
 'achieve',
 'great',
 'people',
 'deserve',
 'great',
 'benefit',
 'continuous',
 'learn',
 'mentorship',
 'career',
 'growth',
 'flexibility',
 'remote',
 'work',
 'wellbeing',
 'know',
 'need',
 'need',
 'it',
 '’',
 'philosophy',
 'freely',
 'share',
 'information',
 'workplace',
 'way',
 'benefit',
 'organization',
 'people',
 'hybrid',
 'istanbul',
 'turkey',
 'hybrid',
 'istanbul',
 'turkey',
 'powerful',
 'cpu',
 '1',
 'us',
 'cycle',
 'time',
 'also',
 'ethernetip',
 'communication',
 'datum',
 'collect',
 'iobox',
 'expan',
 'module',
 '24',
 'isolated',
 'analog',
 'input',
 'iobox',
 'expio',
 'module',
 '16',
 'isolate',
 'digital',
 'input',
 '16',
 'isolate',
 'digital',
 'output',
 'training',
 'model',
 'digilab',
 'easily',
 'need',
