# import libraries 

In [1]:
import nltk
#to open csv file
import csv
import pandas as pd
#sentences & words tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
#regular expression 
import re
#for stopwords
from nltk.corpus import stopwords
import string
#n-grams
from nltk.util import ngrams
#Lemmatization & Stemming
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
#for preprocessing
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# csv path

In [9]:
dataset_path = r'C:\Users\DELL\OneDrive\code\py\nlp\NERdataset\NER-dataset.csv'

In [10]:
dataset = pd.read_csv(dataset_path, encoding="ISO-8859-1")
#UnicodeDecodeError: 'utf-8' codec can't decode byte 0x85 in position 25560: invalid start byte

# number of sentences and words

In [11]:
#number of sentences 
sentences = dataset['Sentence'].count()

#number of words
words = dataset['Word'].count()

print(f"Number of sentences: {sentences}")
print(f"Number of words: {words}")

Number of sentences: 457
Number of words: 10014


# number of words that ending with ing/ed

In [12]:
# find all words that end with ing
ingwords = dataset[dataset['Word'].str.contains(r'\b\w+ing\b')]['Word'].count()
edwords = dataset[dataset['Word'].str.contains(r'\b\w+ed\b')]['Word'].count()

print(f'ing words are {ingwords}')
print(f'ed words are {edwords}')

ing words are 252
ed words are 439


# number of stopwords and punctuation

In [13]:
stop_words = set(stopwords.words('english'))
#punctuation = set(string.punctuation)
punctuation = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

In [14]:
num_stopwords = 0
num_punctuation = 0

for i in dataset['Word']:
    if i in stop_words:
        num_stopwords += 1
    elif i in string.punctuation:
        num_punctuation += 1

In [15]:
print("Number of stopwords:", num_stopwords)
print(stop_words)
print("Number of punctuation:", num_punctuation)
print(punctuation)

Number of stopwords: 3184
{'been', 'myself', 'didn', 'because', 'are', 'were', 'they', 'but', 'weren', 'd', 'most', 've', 'just', 'isn', "doesn't", 'now', 'same', 'her', 'having', 'mustn', 'needn', 'm', 'below', 'am', 'how', 'that', "don't", 'not', 'a', 'he', "shan't", 'once', 'ourselves', 'll', 'some', "mustn't", 'is', 'any', 'me', "didn't", 'yours', 'mightn', 'was', 'when', 'we', 'own', 'she', "wouldn't", 'whom', "shouldn't", "hadn't", 'why', 're', "wasn't", "it's", 'further', 'few', "haven't", 'these', 'i', 'about', 'doesn', 'with', 'so', "you'd", 'be', 'there', 'out', 'yourselves', 'his', 'here', 'had', 'through', 'has', 'by', 'who', 'the', 'herself', 'them', 'o', 'him', 'nor', 'won', "isn't", 'only', 'and', "you'll", 'after', 'aren', 'hadn', "won't", 'doing', 't', 'such', 'our', 'off', 'don', 'or', 'do', "couldn't", 'couldn', 'itself', 'have', 'shan', 'between', 'as', 'will', 'themselves', 'from', 'ours', "should've", "aren't", 'against', 'your', 'hasn', 'other', "hasn't", 'ma', '

# n-grams

In [16]:
n = 3 
ngrams = []

In [17]:
for i in range(len(dataset['Word']) - n + 1):
    ngram = ' '.join(dataset['Word'][i:i+n])
    ngrams.append(ngram)

for ngram in ngrams:
    print([ngram])

['Thousands of demonstrators']
['of demonstrators have']
['demonstrators have marched']
['have marched through']
['marched through London']
['through London to']
['London to protest']
['to protest the']
['protest the war']
['the war in']
['war in Iraq']
['in Iraq and']
['Iraq and demand']
['and demand the']
['demand the withdrawal']
['the withdrawal of']
['withdrawal of British']
['of British troops']
['British troops from']
['troops from that']
['from that country']
['that country .']
['country . Families']
['. Families of']
['Families of soldiers']
['of soldiers killed']
['soldiers killed in']
['killed in the']
['in the conflict']
['the conflict joined']
['conflict joined the']
['joined the protesters']
['the protesters who']
['protesters who carried']
['who carried banners']
['carried banners with']
['banners with such']
['with such slogans']
['such slogans as']
['slogans as "']
['as " Bush']
['" Bush Number']
['Bush Number One']
['Number One Terrorist']
['One Terrorist "']
['Terror

# preprocessing dataset

## Lowercasing:
#### Convert all text to lowercase. This ensures that the model doesn't treat "Apple" and "apple" as different entities.

In [18]:
dataset['Word'] = dataset['Word'].str.lower()
print(dataset['Word'])

0            thousands
1                   of
2        demonstrators
3                 have
4              marched
             ...      
10009            roles
10010               in
10011              the
10012         killings
10013                .
Name: Word, Length: 10014, dtype: object


## Tokenization:
#### Tokenize your text into words or subword units. This breaks down the text into individual units, making it easier for the model to understand.

In [19]:
def tokenize_text(word):
    return word_tokenize(word)

dataset['Tokenized'] = dataset['Word'].apply(tokenize_text)
print(dataset[['Word', 'Tokenized']])

                Word        Tokenized
0          thousands      [thousands]
1                 of             [of]
2      demonstrators  [demonstrators]
3               have           [have]
4            marched        [marched]
...              ...              ...
10009          roles          [roles]
10010             in             [in]
10011            the            [the]
10012       killings       [killings]
10013              .              [.]

[10014 rows x 2 columns]


In [20]:
dataset = dataset[dataset['Word'] != '']

## Removing Stopwords:
#### Remove common stopwords as they usually do not carry much information for NER tasks.

In [21]:
#remove stopwords and replace with an empty string
def remove_stopwords(word):

    return word if word not in stop_words else ''

dataset['Word'] = dataset['Word'].apply(remove_stopwords)
print(dataset['Word'][dataset['Word'] != ''])

0            thousands
2        demonstrators
4              marched
6               london
8              protest
             ...      
10005            serbs
10008              key
10009            roles
10012         killings
10013                .
Name: Word, Length: 6597, dtype: object


## Removing Punctuation:
#### Depending on your task, you might want to remove or replace punctuation. In NER, punctuation may not contribute much to identifying entities.

In [22]:
def remove_punctuation(word):

    return word if word not in punctuation else ''

dataset['Word'] = dataset['Word'].apply(remove_punctuation)

print(dataset['Word'][dataset['Word'] != ''])

0            thousands
2        demonstrators
4              marched
6               london
8              protest
             ...      
10004               14
10005            serbs
10008              key
10009            roles
10012         killings
Name: Word, Length: 5801, dtype: object


## Lemmatization or Stemming:
#### Reduce words to their base or root form. This helps in reducing dimensionality and treating different forms of a word as the same.

## Stemming

In [23]:
def stem_word(word):
    if word is not None:
        stemmer = PorterStemmer()
        return stemmer.stem(word)

dataset['Stemmed'] = dataset['Word'].apply(stem_word)

print(dataset[['Word','Stemmed']])

                Word   Stemmed
0          thousands  thousand
1                             
2      demonstrators  demonstr
3                             
4            marched     march
...              ...       ...
10009          roles      role
10010                         
10011                         
10012       killings      kill
10013                         

[10014 rows x 2 columns]


## Lemmatization

In [24]:
def lemmatize_word(word):
    if word is not None:
        lemmatizer = WordNetLemmatizer()
        return lemmatizer.lemmatize(word)
    
dataset['Lemmatized'] = dataset['Word'].apply(lemmatize_word)
print(dataset[['Word','Lemmatized']])

                Word    Lemmatized
0          thousands      thousand
1                                 
2      demonstrators  demonstrator
3                                 
4            marched       marched
...              ...           ...
10009          roles          role
10010                             
10011                             
10012       killings       killing
10013                             

[10014 rows x 2 columns]


# n-gram

In [25]:
dataset = dataset[dataset['Word'] != '']
n = 3
ngrams = []

In [26]:
for i in range(len(dataset['Word']) - n + 1):
    ngram = ' '.join(dataset['Word'][i:i+n])
    ngrams.append(ngram)

for ngram in ngrams:
    print([ngram])

['thousands demonstrators marched']
['demonstrators marched london']
['marched london protest']
['london protest war']
['protest war iraq']
['war iraq demand']
['iraq demand withdrawal']
['demand withdrawal british']
['withdrawal british troops']
['british troops country']
['troops country families']
['country families soldiers']
['families soldiers killed']
['soldiers killed conflict']
['killed conflict joined']
['conflict joined protesters']
['joined protesters carried']
['protesters carried banners']
['carried banners slogans']
['banners slogans bush']
['slogans bush number']
['bush number one']
['number one terrorist']
['one terrorist stop']
['terrorist stop bombings']
['stop bombings marched']
['bombings marched houses']
['marched houses parliament']
['houses parliament rally']
['parliament rally hyde']
['rally hyde park']
['hyde park police']
['park police put']
['police put number']
['put number marchers']
['number marchers 10,000']
['marchers 10,000 organizers']
['10,000 organi

# POS

In [27]:
print(nltk.pos_tag(dataset['Word'].dropna()))#to remove none values

[('thousands', 'NNS'), ('demonstrators', 'NNS'), ('marched', 'VBD'), ('london', 'JJ'), ('protest', 'JJ'), ('war', 'NN'), ('iraq', 'JJ'), ('demand', 'NN'), ('withdrawal', 'NN'), ('british', 'JJ'), ('troops', 'NNS'), ('country', 'NN'), ('families', 'NNS'), ('soldiers', 'NNS'), ('killed', 'VBD'), ('conflict', 'NN'), ('joined', 'VBD'), ('protesters', 'NNS'), ('carried', 'VBD'), ('banners', 'NNS'), ('slogans', 'NNS'), ('bush', 'VBP'), ('number', 'NN'), ('one', 'CD'), ('terrorist', 'NN'), ('stop', 'NN'), ('bombings', 'NNS'), ('marched', 'VBD'), ('houses', 'NNS'), ('parliament', 'JJ'), ('rally', 'RB'), ('hyde', 'VBP'), ('park', 'JJ'), ('police', 'NN'), ('put', 'VBD'), ('number', 'NN'), ('marchers', 'NNS'), ('10,000', 'CD'), ('organizers', 'NNS'), ('claimed', 'VBD'), ('1,00,000', 'CD'), ('protest', 'NN'), ('comes', 'VBZ'), ('eve', 'VBP'), ('annual', 'JJ'), ('conference', 'NN'), ('britain', 'NN'), ("'s", 'POS'), ('ruling', 'NN'), ('labor', 'NN'), ('party', 'NN'), ('southern', 'JJ'), ('english',

### Handling Numerical Data:
#### Decide how to handle numerical data, whether to replace numbers with a placeholder or convert them to text.

# NNP words

# NNS words

In [23]:
print(dataset['Word'])

0            thousands
2        demonstrators
4              marched
6               london
8              protest
             ...      
10004               14
10005            serbs
10008              key
10009            roles
10012         killings
Name: Word, Length: 5801, dtype: object


In [24]:
stemmer = PorterStemmer()
print(stemmer.stem('probablistic'))

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('probablistic'))

probablist
probablistic
