# DATA CLEANING STEPS FOR NLP

In [1]:
# Required library
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
import re

In [2]:
# Read the text data
elon = pd.read_csv('E:/DATA SCI ASSI/text mining/Elon_musk.csv', encoding='unicode_escape')
elon

Unnamed: 0.1,Unnamed: 0,Text
0,1,@kunalb11 Im an alien
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,3,@joerogan @Spotify Great interview!
3,4,@gtera27 Doge is underestimated
4,5,@teslacn Congratulations Tesla China for amazi...
...,...,...
1994,1995,"@flcnhvy True, it sounds so surreal, but the n..."
1995,1996,@PPathole Make sure to read ur terms &amp; con...
1996,1997,@TeslaGong @PPathole Samwise Gamgee
1997,1998,@PPathole Altho Dumb and Dumber is <U+0001F525...


In [3]:
# clean data ,Removing the punctuations
import string

def pun_removal(messy_str):
  clean_list=[char for char in messy_str if char not in string.punctuation]
  clean_str=''.join(clean_list)
  return clean_str

elon['Text']=elon['Text'].apply(pun_removal)
#check Top 5_Reviews
elon['Text'].head(5)

0                                kunalb11 Im an alien
1    IDAACarmack Ray tracing on Cyberpunk with HDR ...
2                     joerogan Spotify Great interview
3                       gtera27 Doge is underestimated
4    teslacn Congratulations Tesla China for amazin...
Name: Text, dtype: object

In [4]:
import re
# leats reamoving numbers

def drop_num(list_text):
    list_text_new=[]
    for i in list_text:
        if not re.search('\d',i):
            list_text_new.append(i)
    return ''.join(list_text_new)

elon['Text']=elon['Text'].apply(drop_num)
#check Top 5_Reviews
elon['Text'].head(5)

0                                  kunalb Im an alien
1    IDAACarmack Ray tracing on Cyberpunk with HDR ...
2                     joerogan Spotify Great interview
3                         gtera Doge is underestimated
4    teslacn Congratulations Tesla China for amazin...
Name: Text, dtype: object

In [5]:
# Removing the special characters
def remove_special_char(tx):
  pat=r'[^a-zA-z0-9]'
  return re.sub(pat,'',tx)
elon['Text']=elon['Text'].apply(lambda x:remove_special_char(x))

#check Top 5_Reviews
elon['Text'].head(5)

0                                      kunalbImanalien
1    IDAACarmackRaytracingonCyberpunkwithHDRisnextl...
2                        joeroganSpotifyGreatinterview
3                            gteraDogeisunderestimated
4    teslacnCongratulationsTeslaChinaforamazingexec...
Name: Text, dtype: object

In [6]:
# Removing the Accented characters
import unicodedata

def remove_accented_char(tx):
  new_text=unicodedata.normalize('NFKD',tx).encode('ascii','ignore').decode('utf-8','ignore')
  return new_text
elon['Text']=elon['Text'].apply(lambda x:remove_accented_char(x))

#check Top 5_Reviews
elon['Text'].head(5)

0                                      kunalbImanalien
1    IDAACarmackRaytracingonCyberpunkwithHDRisnextl...
2                        joeroganSpotifyGreatinterview
3                            gteraDogeisunderestimated
4    teslacnCongratulationsTeslaChinaforamazingexec...
Name: Text, dtype: object

In [8]:
#removing stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
elon['Text'] = elon['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [9]:
# Calculate count of wards , characters count in review clm
# used 'len' fuction for char_count= length
elon["length"]=elon['Text'].apply(len)
# Words count
elon['Words_count']=elon['Text'].apply(lambda x:len(x.split()))
# Word Density
elon['Word_Density']=elon['length']/(elon['Words_count']+1)

In [10]:
elon[['length','Words_count','Word_Density']].describe()

Unnamed: 0,length,Words_count,Word_Density
count,1999.0,1999.0,1999.0
mean,60.904952,0.9995,30.452476
std,34.770994,0.022366,17.385497
min,0.0,0.0,0.0
25%,30.0,1.0,15.0
50%,53.0,1.0,26.5
75%,101.0,1.0,50.5
max,119.0,1.0,59.5


In [11]:
# LETS CREAT A PART OF SPEECH DICTIONARY
POS_DIC={
    'noun':['NN','NNS','NNP','NNPS'],             # NN stands for singular noun
    'pron':['PRP','PRP$','WP','WP$'],             # NNS stands for plural noun
    'verb':['VB','VBD','VBG','VBN','VBP','VBZ'],  # NNP stands for singular proper noun
    'adj':['JJ','JJR','JJS'],                     # NNps stands for plural proper noun
    'adv':['RB','RBR','RBS','WRB']
}
#Create function to check and get part of speech tag count of a wards in given sentence
def pos_check(x,flag):
    cnt=0
    try:
        wiki=TextBlob(x)
        for tup in wiki.tags:
            ppo= list(tup)[1]
            if ppo in POS_DIC[flag]:
                cnt += 1
                
    except:
        pass
    return cnt

In [12]:
import nltk
nltk.download('averaged_perceptron_tagger')
  
#calculate the count of nouns in the text
elon['nouns_cnt']=elon['Text'].apply(lambda x:pos_check(x,'noun'))


[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


In [13]:
elon

Unnamed: 0.1,Unnamed: 0,Text,length,Words_count,Word_Density,nouns_cnt
0,1,kunalbImanalien,15,1,7.5,0
1,2,IDAACarmackRaytracingonCyberpunkwithHDRisnextl...,64,1,32.0,0
2,3,joeroganSpotifyGreatinterview,29,1,14.5,0
3,4,gteraDogeisunderestimated,25,1,12.5,0
4,5,teslacnCongratulationsTeslaChinaforamazingexec...,84,1,42.0,0
...,...,...,...,...,...,...
1994,1995,flcnhvyTrueitsoundssosurrealbutthenegativeprop...,110,1,55.0,0
1995,1996,PPatholeMakesuretoreadurtermsampconditionsbefo...,62,1,31.0,0
1996,1997,TeslaGongPPatholeSamwiseGamgee,30,1,15.0,0
1997,1998,PPatholeAlthoDumbandDumberisUFUF,32,1,16.0,0


In [14]:
# leats visualize the top 10 revivews after reamoving numbers and punctuations
elon['Text'].head(10)

0                                      kunalbImanalien
1    IDAACarmackRaytracingonCyberpunkwithHDRisnextl...
2                        joeroganSpotifyGreatinterview
3                            gteraDogeisunderestimated
4    teslacnCongratulationsTeslaChinaforamazingexec...
5                  HappyNewYearoftheOxhttpstcoWFKMYuoj
6    FrodowastheunderdogeAllthoughthewouldfailHimse...
7                    OwenSparksflcnhvyanonyxHahathanks
8    flcnhvyanonyxIndeedTweetsdefinitelydonotrepres...
9            Themostentertainingoutcomeisthemostlikely
Name: Text, dtype: object