In [2]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB


In [3]:
import re
text = "Hello!!! How are you? Call me at 123456."
# Punctuation removal: delete every character that is neither a word
# character (letter, digit, underscore) nor whitespace.
non_word_or_space = re.compile(r'[^\w\s]')
cleaned_text = non_word_or_space.sub('', text)
# Output: 'Hello How are you Call me at 123456'


In [4]:
# Case normalisation: convert the whole string to lower case.
# Punctuation (the trailing '!') is left untouched.
text = "Hello World!"
lowercase_text = str.lower(text)
# Output: 'hello world!'


In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Stop-word removal: tokenize the sentence and drop English stop words.
text = "This is a simple example of text processing."
stop_words = set(stopwords.words('english'))  # set for fast membership tests
words = word_tokenize(text)
# NLTK's English stop-word list is all lower case, so compare the lower-cased
# token; otherwise a capitalised stop word such as 'This' slips through.
filtered_text = [w for w in words if w.lower() not in stop_words]
# Output: ['simple', 'example', 'text', 'processing', '.']
# (word_tokenize emits the final '.' as its own token; it is not a stop word,
#  so it is kept by this filter.)


In [6]:
from nltk.tokenize import word_tokenize

# Tokenization: split the sentence into word tokens; punctuation such as the
# final '.' comes back as a separate token.
text = "Natural Language Processing is fascinating."
tokens = word_tokenize(text)
# Output: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '.']


In [7]:
from nltk.stem import PorterStemmer

# Stemming: map every word to its Porter stem.
ps = PorterStemmer()
words = ["running", "runs", "runner"]
stemmed_words = list(map(ps.stem, words))
# Output: ['run', 'run', 'runner']


In [8]:
from nltk.stem import WordNetLemmatizer

# Lemmatization: reduce each word to its dictionary form (lemma).
lemmatizer = WordNetLemmatizer()
words = ["running", "better", "ran"]
# The lemma depends on the part of speech passed to lemmatize():
# "running" and "ran" are verbs ('v'), while "better" must be looked up as an
# adjective ('a') to resolve to 'good' -- with pos='v' it stays 'better'.
pos_tags = ["v", "a", "v"]
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=tag) for word, tag in zip(words, pos_tags)
]
# Output: ['run', 'good', 'run']


In [9]:
from nltk.stem import WordNetLemmatizer

# NOTE(review): this cell repeats the previous lemmatization cell; consider
# removing one of the two.
# Lemmatization: reduce each word to its dictionary form (lemma).
lemmatizer = WordNetLemmatizer()
words = ["running", "better", "ran"]
# The lemma depends on the part of speech passed to lemmatize():
# "running" and "ran" are verbs ('v'), while "better" must be looked up as an
# adjective ('a') to resolve to 'good' -- with pos='v' it stays 'better'.
pos_tags = ["v", "a", "v"]
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=tag) for word, tag in zip(words, pos_tags)
]
# Output: ['run', 'good', 'run']


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: one row per document, one column per vocabulary term,
# each cell holding the number of times the term occurs in the document.
text = ["Natural language processing is exciting.",
        "Language is a powerful tool."]
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(text)
print(bow_matrix.toarray())
# Columns follow the sorted vocabulary:
#   exciting, is, language, natural, powerful, processing, tool
# The default token pattern only keeps tokens of two or more characters, so
# "a" is dropped and the matrix has 7 columns.
# Output: [[1 1 1 1 0 1 0]
#          [0 1 1 0 1 0 1]]


[[1 1 1 1 0 1 0]
 [0 1 1 0 1 0 1]]


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: like Bag of Words, but each count is re-weighted so that terms
# occurring in many documents contribute less than terms specific to one.
text = ["Natural language processing is exciting.",
        "Language is a powerful tool."]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text)
print(tfidf_matrix.toarray())
# Columns follow the sorted vocabulary:
#   exciting, is, language, natural, powerful, processing, tool
# ("a" is dropped by the default token pattern, hence 7 columns). The terms
# shared by both documents ("is", "language") get the lower weights.
# Output: [[0.49922133 0.35520009 0.35520009 0.49922133 0.         0.49922133 0.        ]
#          [0.         0.40993715 0.40993715 0.         0.57615236 0.         0.57615236]]


[[0.49922133 0.35520009 0.35520009 0.49922133 0.         0.49922133
  0.        ]
 [0.         0.40993715 0.40993715 0.         0.57615236 0.
  0.57615236]]


In [12]:
from gensim.models import Word2Vec

# Word embeddings: train a small Word2Vec model on a pre-tokenized toy corpus
# and look up the learned vector of one word.
sentences = [["natural", "language", "processing"],
             ["language", "is", "a", "powerful", "tool"]]
# Training settings gathered in one place so they are easy to tweak.
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences, **w2v_params)
vector = model.wv['language']  # Output: a vector representing the word 'language'


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Text to integer sequences: fit_on_texts() is called first so the tokenizer
# learns its vocabulary from the corpus; texts_to_sequences() then converts
# each text into a list of integer word indices.
text = ["I love natural language processing"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
# Output: [[1, 2, 3, 4, 5]]


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# N-grams: count pairs of adjacent tokens instead of single words.
text = ["I love natural language processing"]
bigram_range = (2, 2)  # min and max n-gram size both 2 -> bigrams only
vectorizer = CountVectorizer(ngram_range=bigram_range)
ngram_matrix = vectorizer.fit_transform(text)
# Output: count matrix representing the text by its bigrams


In [15]:
import re
text = "hellooooo"
# Collapse elongated characters: a run of three or more identical characters
# is reduced to a single one. Requiring 3+ repeats (rather than 2+) keeps
# legitimate double letters such as the "ll" in "hello"; the pattern
# r'(.)\1+' would squash those too and return 'helo'.
cleaned_text = re.sub(r'(.)\1{2,}', r'\1', text)
# Output: 'hello'


In [16]:
text = "مرحبا"
encoded_text = text.encode('utf-8')
# Output: b'\xd9\x85\xd8\xb1\xd8\xad\xd8\xa8\xd8\xa7'
