Tokenize the text after removing punctuation and digits and converting it to lowercase

In [32]:
from nltk.tokenize import word_tokenize
import re
def tokenization(text):
    """Clean *text* and split it into lowercase word tokens.

    Cleaning happens in two passes: every character that is neither a word
    character nor whitespace is dropped (underscores count as word
    characters, so they survive), then every run of digits is dropped.
    The remainder is lowercased and handed to ``word_tokenize``.

    Args:
        text: The raw input string.

    Returns:
        Whatever ``word_tokenize`` produces for the cleaned, lowercased text.
    """
    # Punctuation first, digits second -- same order as the regex passes.
    stripped = re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', text))
    return word_tokenize(stripped.lower())

Removing stopwords

In [5]:
from nltk.corpus import stopwords

def removeStopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    Matching is an exact string comparison against the entries returned by
    ``stopwords.words('english')``; token order and duplicates are preserved.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list containing the tokens that are not stopwords.
    """
    # Build the lookup once as a set so each membership test is O(1)
    # instead of a linear scan of the stopword list for every token.
    stop_words = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

Lemmatizing the text

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

def wordlemmatize(tokens):
    """Lemmatize each token using the WordNet POS implied by its POS tag.

    The whole token sequence is tagged in a single ``pos_tag`` call so the
    tagger sees every token together with its neighbours, rather than being
    invoked once per isolated word. Noun, verb, adjective and adverb tags
    are mapped to the matching ``pos`` argument of ``lemmatize``; any other
    tag falls back to ``lemmatize(word)`` with no explicit ``pos``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list of lemmas, one per input token, in the original order.
    """
    lemmatizer = WordNetLemmatizer()

    # POS tag -> WordNet POS letter accepted by ``lemmatize``.
    tag_to_wordnet_pos = {
        'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
        'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
        'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
        'RB': 'r', 'RBR': 'r', 'RBS': 'r',
    }

    lemmatized_words = []
    # list(...) lets callers pass any iterable while giving pos_tag a list.
    for word, tag in pos_tag(list(tokens)):
        wordnet_pos = tag_to_wordnet_pos.get(tag)
        if wordnet_pos is None:
            # Tag outside the four mapped classes: use lemmatize's default.
            lemmatized_words.append(lemmatizer.lemmatize(word))
        else:
            lemmatized_words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))

    return lemmatized_words

Check spelling of the text

In [7]:
from textblob import TextBlob

def correct_spelling(text):
    """Return *text* after TextBlob's spelling correction, as a plain ``str``.

    Args:
        text: The string to correct.

    Returns:
        ``str()`` of the result of ``TextBlob(text).correct()``.
    """
    return str(TextBlob(text).correct())

Detect Language using langID

In [44]:
import langid
from langcodes import Language

def detect_language_langID(text):
    """Classify each item of *text* with langid and report its language name.

    Every item is passed to ``langid.classify``; the first element of the
    result is resolved to a display name via ``Language.get(...)``. Each
    item and its language name are printed as they are processed.

    Args:
        text: Iterable of items to classify (the pipeline passes a list of
            tokens). Note that iterating a plain string yields characters.

    Returns:
        A dict mapping each item to its language display name. Because the
        items are the dict keys, a repeated item occupies a single entry.
    """
    names_by_token = {}
    for token in text:
        code = langid.classify(token)[0]
        display_name = Language.get(code).display_name()
        names_by_token[token] = display_name
        print(token, ':', display_name)

    return names_by_token

Compute the percentage of English words

In [39]:
def counEnglishwords(text):
    """Return the percentage of entries in *text* labelled ``'English'``.

    Args:
        text: Mapping of word -> language name. Only the values are
            inspected; a value counts when it equals ``'English'`` exactly.

    Returns:
        The share of English entries as a float between 0 and 100.
        An empty mapping yields ``0.0`` instead of dividing by zero.
    """
    if not text:
        # Nothing to measure; avoid ZeroDivisionError on empty input.
        return 0.0

    english_count = sum(1 for lang in text.values() if lang == 'English')
    return (english_count / len(text)) * 100

Pipeline to perform text preprocessing

In [14]:
# text = "In this text, the first sentence is in English, the second sentence is in Japanese, the third sentence includes a Chinese greeting"

# def main():
#     pipeline_functions = [correct_spelling, tokenization, removeStopwords, wordlemmatize, detect_language_langID, counEnglishwords]

#     processed_data = text
#     for func in pipeline_functions:
#         processed_data = func(processed_data)

#     final_result = processed_data
#     print(final_result)

text : English
first : English
sentence : English
english : Italian
second : English
sentence : English
japanese : English
third : English
sentence : English
include : English
chinese : English
greet : English
9
10
90.0
90.0


In [15]:
import pandas as pd

# Load the reviews CSV into a DataFrame.
# NOTE(review): the path is absolute and specific to one Windows user
# profile -- it must be changed before running on another machine.
df = pd.read_csv(r"C:\Users\prabh\OneDrive\Documents\Lambton\Semester 2\Natural Language Processing\reviews.csv")

In [16]:
# Preview the first two rows of the loaded data.
df.head(2)

Unnamed: 0.1,Unnamed: 0,position,midb_id,movie,spoilers,rating,title,user,date,content
0,0,0,tt0111161,The Shawshank Redemption,False,1/10,Pale imitation of better films,sbaradell,10 July 2003,"Three words: ""Cool Hand Luke."" Same film, don..."
1,1,0,tt0111161,The Shawshank Redemption,False,1/10,Didactic and overlong,arthur_pewtey,19 November 2000,Another one of those overlong morally right-on...


In [17]:
# Work on a small sample: the 0:4 slice of the 'content' column.
newDf = df['content'][0:4]

In [30]:
# Materialize the sampled column as a plain Python list, then display
# its first element as the cell output.
processed = list(newDf)
processed[0]

'Three words: "Cool Hand Luke."  Same film, done better, done earlier.  For that matter, is this film any better than other Steven King "novelettes" such as "Stand By Me"? All in all, it probably ranks a 6 or a 7, but since people on this site have lost their minds as regards this film, I give it a 1 in one man\'s attempt at sanity.'

In [45]:
# Stages run in list order; each stage receives the previous stage's
# return value, so the data changes type along the way
# (string -> token list -> word/language dict -> percentage).
pipeline_functions = [correct_spelling, tokenization, removeStopwords, wordlemmatize, detect_language_langID, counEnglishwords]

#processed_data = processed[0]
# Multilingual sample string used as the pipeline input in place of the
# review text from the line commented out above.
processed_data = "犬 árbol 世界 你好 monde chien árbol كلب 树 cão bonjour mundo мир hund привет hola cão perro bonjour albero ciao árvore albero árbol chien árvore 你好 baum cão mundo قطة cane 你好 hund hola شجرة arbre мир welt кошка"
for func in pipeline_functions:
    processed_data = func(processed_data)

# After the last stage (counEnglishwords) the value is a percentage.
final_result = processed_data
print('Final Processed Text has {} % of english words'.format(final_result))

犬 : Chinese
árbol : Hungarian
mode : English
chief : English
árbol : Hungarian
كلب : Arabic
树 : Chinese
co : English
bonjour : English
undo : Spanish
мир : Russian
hand : English
привет : Bulgarian
hold : English
co : English
perry : English
bonjour : English
albert : English
árvore : Hungarian
albert : English
árbol : Hungarian
chief : English
árvore : Hungarian
raum : German
co : English
undo : Spanish
قطة : Arabic
cane : English
hand : English
hold : English
شجرة : Arabic
мир : Russian
welt : German
кошка : Serbian
Final Processed Text has 40.909090909090914 % of english words
