#Fundamentals of NLP I Tokenization & Lemmatization

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords

In [4]:
df = pd.read_csv(r"D:\Datasets\Datasets\Tweets.csv")
df = df.drop(["textID", "selected_text"], axis = 1)

In [5]:
tt = df["text"][0]

# Tokenization

In [6]:
wt = word_tokenize(tt)
print(wt)

['I', '`', 'd', 'have', 'responded', ',', 'if', 'I', 'were', 'going']


In [7]:
st = sent_tokenize(tt)
print(st)

[' I`d have responded, if I were going']


In [8]:
tbwt = TreebankWordTokenizer()
print(tbwt.tokenize(tt))

['I`d', 'have', 'responded', ',', 'if', 'I', 'were', 'going']


In [9]:
pst = PunktSentenceTokenizer()
print(pst.tokenize(tt))

[' I`d have responded, if I were going']


In [10]:
mwe = MWETokenizer()
print(mwe.tokenize(word_tokenize(tt)))

['I', '`', 'd', 'have', 'responded', ',', 'if', 'I', 'were', 'going']


In [11]:
text_1 = "Tokenization is a fundamental process in natural language processing (NLP) that involves breaking down a text into individual units called tokens. These tokens can be words, subwords, or even characters, depending on the level of granularity required for a particular application. The goal of tokenization is to create a structured and manageable representation of textual data, enabling machines to better understand and process language. This process is crucial in various NLP tasks, such as machine translation, text classification, and sentiment analysis, where the input data needs to be converted into a format suitable for analysis and modeling. Tokenization facilitates the extraction of meaningful information from text, providing a foundation for subsequent linguistic analysis and computational understanding of language patterns."

# Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer
wnt = WordNetLemmatizer()

print(wnt.lemmatize(str(word_tokenize(text_1))))

['Tokenization', 'is', 'a', 'fundamental', 'process', 'in', 'natural', 'language', 'processing', '(', 'NLP', ')', 'that', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'individual', 'units', 'called', 'tokens', '.', 'These', 'tokens', 'can', 'be', 'words', ',', 'subwords', ',', 'or', 'even', 'characters', ',', 'depending', 'on', 'the', 'level', 'of', 'granularity', 'required', 'for', 'a', 'particular', 'application', '.', 'The', 'goal', 'of', 'tokenization', 'is', 'to', 'create', 'a', 'structured', 'and', 'manageable', 'representation', 'of', 'textual', 'data', ',', 'enabling', 'machines', 'to', 'better', 'understand', 'and', 'process', 'language', '.', 'This', 'process', 'is', 'crucial', 'in', 'various', 'NLP', 'tasks', ',', 'such', 'as', 'machine', 'translation', ',', 'text', 'classification', ',', 'and', 'sentiment', 'analysis', ',', 'where', 'the', 'input', 'data', 'needs', 'to', 'be', 'converted', 'into', 'a', 'format', 'suitable', 'for', 'analysis', 'and', 'modeling', '.',

# Fundamentals of NLP II Stemming & Sentence Segmentation

In [13]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print(ps.stem(str(word_tokenize(text_1))))

['tokenization', 'is', 'a', 'fundamental', 'process', 'in', 'natural', 'language', 'processing', '(', 'nlp', ')', 'that', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'individual', 'units', 'called', 'tokens', '.', 'these', 'tokens', 'can', 'be', 'words', ',', 'subwords', ',', 'or', 'even', 'characters', ',', 'depending', 'on', 'the', 'level', 'of', 'granularity', 'required', 'for', 'a', 'particular', 'application', '.', 'the', 'goal', 'of', 'tokenization', 'is', 'to', 'create', 'a', 'structured', 'and', 'manageable', 'representation', 'of', 'textual', 'data', ',', 'enabling', 'machines', 'to', 'better', 'understand', 'and', 'process', 'language', '.', 'this', 'process', 'is', 'crucial', 'in', 'various', 'nlp', 'tasks', ',', 'such', 'as', 'machine', 'translation', ',', 'text', 'classification', ',', 'and', 'sentiment', 'analysis', ',', 'where', 'the', 'input', 'data', 'needs', 'to', 'be', 'converted', 'into', 'a', 'format', 'suitable', 'for', 'analysis', 'and', 'modeling', '.',

In [14]:
from nltk.stem import SnowballStemmer
sbs = SnowballStemmer("english")

print(sbs.stem(str(word_tokenize(text_1))))

['tokenization', 'is', 'a', 'fundamental', 'process', 'in', 'natural', 'language', 'processing', '(', 'nlp', ')', 'that', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'individual', 'units', 'called', 'tokens', '.', 'these', 'tokens', 'can', 'be', 'words', ',', 'subwords', ',', 'or', 'even', 'characters', ',', 'depending', 'on', 'the', 'level', 'of', 'granularity', 'required', 'for', 'a', 'particular', 'application', '.', 'the', 'goal', 'of', 'tokenization', 'is', 'to', 'create', 'a', 'structured', 'and', 'manageable', 'representation', 'of', 'textual', 'data', ',', 'enabling', 'machines', 'to', 'better', 'understand', 'and', 'process', 'language', '.', 'this', 'process', 'is', 'crucial', 'in', 'various', 'nlp', 'tasks', ',', 'such', 'as', 'machine', 'translation', ',', 'text', 'classification', ',', 'and', 'sentiment', 'analysis', ',', 'where', 'the', 'input', 'data', 'needs', 'to', 'be', 'converted', 'into', 'a', 'format', 'suitable', 'for', 'analysis', 'and', 'modeling', '.',

In [15]:
from nltk.stem import RegexpStemmer
res = RegexpStemmer('ing$|s$|ed$')

print(res.stem(str(word_tokenize(text_1))))

['Tokenization', 'is', 'a', 'fundamental', 'process', 'in', 'natural', 'language', 'processing', '(', 'NLP', ')', 'that', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'individual', 'units', 'called', 'tokens', '.', 'These', 'tokens', 'can', 'be', 'words', ',', 'subwords', ',', 'or', 'even', 'characters', ',', 'depending', 'on', 'the', 'level', 'of', 'granularity', 'required', 'for', 'a', 'particular', 'application', '.', 'The', 'goal', 'of', 'tokenization', 'is', 'to', 'create', 'a', 'structured', 'and', 'manageable', 'representation', 'of', 'textual', 'data', ',', 'enabling', 'machines', 'to', 'better', 'understand', 'and', 'process', 'language', '.', 'This', 'process', 'is', 'crucial', 'in', 'various', 'NLP', 'tasks', ',', 'such', 'as', 'machine', 'translation', ',', 'text', 'classification', ',', 'and', 'sentiment', 'analysis', ',', 'where', 'the', 'input', 'data', 'needs', 'to', 'be', 'converted', 'into', 'a', 'format', 'suitable', 'for', 'analysis', 'and', 'modeling', '.',

In [16]:
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print(ls.stem(str(word_tokenize(text_1))))

['tokenization', 'is', 'a', 'fundamental', 'process', 'in', 'natural', 'language', 'processing', '(', 'nlp', ')', 'that', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'individual', 'units', 'called', 'tokens', '.', 'these', 'tokens', 'can', 'be', 'words', ',', 'subwords', ',', 'or', 'even', 'characters', ',', 'depending', 'on', 'the', 'level', 'of', 'granularity', 'required', 'for', 'a', 'particular', 'application', '.', 'the', 'goal', 'of', 'tokenization', 'is', 'to', 'create', 'a', 'structured', 'and', 'manageable', 'representation', 'of', 'textual', 'data', ',', 'enabling', 'machines', 'to', 'better', 'understand', 'and', 'process', 'language', '.', 'this', 'process', 'is', 'crucial', 'in', 'various', 'nlp', 'tasks', ',', 'such', 'as', 'machine', 'translation', ',', 'text', 'classification', ',', 'and', 'sentiment', 'analysis', ',', 'where', 'the', 'input', 'data', 'needs', 'to', 'be', 'converted', 'into', 'a', 'format', 'suitable', 'for', 'analysis', 'and', 'modeling', '.',

# Sentence Sengementation

In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

text_2 = nlp(text_1)

for sentence in text_2.sents:
    print(sentence)

Tokenization is a fundamental process in natural language processing (NLP) that involves breaking down a text into individual units called tokens.
These tokens can be words, subwords, or even characters, depending on the level of granularity required for a particular application.
The goal of tokenization is to create a structured and manageable representation of textual data, enabling machines to better understand and process language.
This process is crucial in various NLP tasks, such as machine translation, text classification, and sentiment analysis, where the input data needs to be converted into a format suitable for analysis and modeling.
Tokenization facilitates the extraction of meaningful information from text, providing a foundation for subsequent linguistic analysis and computational understanding of language patterns.


# NLP using Scikit Library

# Random Forest Classifier & SVM

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df["sentiment"] = le.fit_transform(df["sentiment"])

In [19]:
df = df.dropna()

In [20]:
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0


In [21]:
import re

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', '', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

In [22]:
df["clean_text"] = df["text"].apply(preprocess)

In [23]:
df.shape

(27480, 3)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cv = CountVectorizer()

x = cv.fit_transform(df["clean_text"][:5000]).toarray()
y = df["sentiment"][:5000]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [28]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

In [29]:
rfc_prediction = rfc.predict(x_test)

accuracy = accuracy_score(y_test, rfc_prediction)
print(accuracy)

0.408


In [30]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(x_train, y_train)

In [31]:
svm_prediction = svm.predict(x_test)
accuracy = accuracy_score(y_test, svm_prediction)
print(accuracy)

0.408


In [32]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)

In [33]:
mnb_prediction = mnb.predict(x_test)

accuracy = accuracy_score(y_test, mnb_prediction)
print(accuracy)

0.408


In [34]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [35]:
gnb_prediction = gnb.predict(x_test)

accuracy = accuracy_score(y_test, gnb_prediction)
print(accuracy)

0.267
