# Text Preprocessing in NLP
Tokenize Text Columns Into Sentences

In [1]:
# Required libraries

# [pip install spacy](https://pypi.org/project/spacy/)

# Input the following into gitbash: "python -m spacy download en_core_web_sm"
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !conda install -c anaconda nltk

In [2]:
# Import Dependencies and setup
import pandas as pd
import spacy
import nltk
import os

In [3]:
# read csv output from Instagrapy_split_text.ipynb
df=pd.read_csv("../../resources/ig_datascrape_jc_2021-08-25.csv", encoding="ISO 8859-1")
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2
0,0,0,shmee150,CSzoxcyrzj2,1629485286,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],..."
1,1,1,shmee150,CSr2jQPjy59,1629224075,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri..."


In [4]:
## Data Cleaning Steps

In [5]:
# convert epoch time to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'],unit='s')

# force change, of specified column type, to string.
df.text = df.text.astype('string')
df.caption = df.caption.astype('string')
df.Hash_tag2 = df.Hash_tag2.astype('string')

df.dtypes  # verify string change

Unnamed: 0               int64
Unnamed: 0.1             int64
author                  object
shortcode               object
timestamp       datetime64[ns]
likes                    int64
comments                 int64
caption                 string
text                    string
Hash_tag2               string
dtype: object

### Punctuation Removal
** Currently causing issues with sentence segmentation

In [6]:
# library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

The following script removes "@". Do we need to modify the script to keep it? If so, we will have to use Regex to more finely tune the punctuation removal.

In [7]:
# defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

# storing the puntuation free text in a new column
df['clean_txt']= df['text'].apply(lambda x: [remove_punctuation(str(x))])
df.clean_txt = df.clean_txt.astype('string')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt
0,0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...
1,1,1,shmee150,CSr2jQPjy59,2021-08-17 18:14:35,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri...",['Its a P1 kinda day Out for a drive in superc...
2,2,2,shmee150,CSpWzdxJIIV,2021-08-16 18:58:41,21606,130,"Photo shared by Tim - Shmee on August 16, 2021...",The beautiful 300SL Roadster is without a shad...,"[['Mercedes'], ['300SL'], ['PebbleBeach'], ['C...",['The beautiful 300SL Roadster is without a sh...
3,3,3,shmee150,CSkdxyJAk2n,2021-08-14 21:23:26,30069,113,"Photo shared by Tim - Shmee on August 14, 2021...",The breathtaking @bugatti Bolide at @thequaile...,"[['Bugatti'], ['Bolide'], ['Quail'], ['CarWeek...",['The breathtaking bugatti Bolide at thequaile...
4,4,4,shmee150,CSfHSEzi3BD,2021-08-12 19:30:39,34073,140,"Photo shared by Tim - Shmee on August 12, 2021...",The new @astonmartinlagonda Valkyrie Spider ha...,"[['AstonMartin'], ['Valkyrie'], ['ValkyrieSpid...",['The new astonmartinlagonda Valkyrie Spider h...


In [8]:
# drop Unnamed columns
df =df.drop(['Unnamed: 0.1'], axis=1)
df = df.reset_index(drop=True)

# verify above scripts work. assign first_text to first row's "txt_lower" column
# all punctuations now removed, and words in lower case
ig_text = df.loc[0, "clean_txt"]   ##** Punc_changes. changed from text to "clean_text"
print(ig_text)

['Back at the wheel of an SF90 With bannedauto and philwilson Im checking out this stunning Assetto Fiorano car and thinking about the final spec Ill opt for mine which actually needs to be locked next month Im also delighted to say that again the car has impressed me I think its one of the very best supercars currently on the market mixing insane performance with new technology in such a seamless way Needless to say Im quite excited about it Ferrari SF90 futureshmeemobile AssettoFiorano BannedAuto LAcars Shmee150']


In [9]:
# verify "Unammed 0.1" was dropped
df.head(2)

Unnamed: 0.1,Unnamed: 0,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt
0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...
1,1,shmee150,CSr2jQPjy59,2021-08-17 18:14:35,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri...",['Its a P1 kinda day Out for a drive in superc...


### Tokenization

Resources to better understand text preprocessing
<br>
[Tokenize Text Columns Into Sentences in Pandas](https://towardsdatascience.com/tokenize-text-columns-into-sentences-in-pandas-2c08bc1ca790)
<br>
Note that v3 of spacy replaces "nlp.create_pipe", with "nlp.add_pipe('sentencizer')"

Sentencizer is a pipeline component for rules-based sentence segmentation
[spacy boundry detection](https://spacy.io/api/sentencizer). Customization option includes creation of custom list of punctuation characters that mark sentence ends.

In [10]:
# Tokenize using spaCy
nlp = spacy.load("en_core_web_sm")
[sent.text for sent in nlp(ig_text).sents]

["['Back at the wheel of an SF90 With bannedauto and philwilson Im checking out this stunning Assetto Fiorano car and thinking about the final spec Ill opt for mine which actually needs to be locked next month Im also delighted to say that again the car has impressed me I think its one of the very best supercars currently on the market mixing insane performance with new technology in such a seamless way Needless to say Im quite excited about it Ferrari SF90 futureshmeemobile AssettoFiorano BannedAuto LAcars Shmee150']"]

In [11]:
from spacy.lang.en import English

nlp = English()  # just the language with no model
sentencizer = nlp.add_pipe('sentencizer')

In [12]:
[sent.text for sent in nlp(ig_text).sents]

["['Back at the wheel of an SF90 With bannedauto and philwilson Im checking out this stunning Assetto Fiorano car and thinking about the final spec Ill opt for mine which actually needs to be locked next month Im also delighted to say that again the car has impressed me I think its one of the very best supercars currently on the market mixing insane performance with new technology in such a seamless way Needless to say Im quite excited about it Ferrari SF90 futureshmeemobile AssettoFiorano BannedAuto LAcars Shmee150']"]

In [13]:
# tokenize all data, in column "text", using lambda function
# this was a pain. some elements were ints or floats, causing mixed returns of a dtype 
# object type. This stopped the script from filtering it out, returning a "nlp object 
# of type 'float' has no len()". the workaround is to turn everything into a string

nlp = spacy.load("en_core_web_sm")
df["token_txt"] = df["clean_txt"].apply(lambda x: [sent.text for sent in (nlp(str(x)).sents)])

##** Punc_changes. changed from text to "clean_text"

In [14]:
# convert list of sentences to one sentence for each row

df = df.explode("token_txt")
df.reset_index(drop=True)
df.head(2)


Unnamed: 0.1,Unnamed: 0,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt,token_txt
0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...,['Back at the wheel of an SF90 With bannedauto...
1,1,shmee150,CSr2jQPjy59,2021-08-17 18:14:35,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri...",['Its a P1 kinda day Out for a drive in superc...,['Its a P1 kinda day Out for a drive in superc...


In [15]:
df["tokenized"] = df["clean_txt"].apply(lambda x: x.split())
df.head()

Unnamed: 0.1,Unnamed: 0,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt,token_txt,tokenized
0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...,['Back at the wheel of an SF90 With bannedauto...,"[['Back, at, the, wheel, of, an, SF90, With, b..."
1,1,shmee150,CSr2jQPjy59,2021-08-17 18:14:35,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri...",['Its a P1 kinda day Out for a drive in superc...,['Its a P1 kinda day Out for a drive in superc...,"[['Its, a, P1, kinda, day, Out, for, a, drive,..."
2,2,shmee150,CSpWzdxJIIV,2021-08-16 18:58:41,21606,130,"Photo shared by Tim - Shmee on August 16, 2021...",The beautiful 300SL Roadster is without a shad...,"[['Mercedes'], ['300SL'], ['PebbleBeach'], ['C...",['The beautiful 300SL Roadster is without a sh...,['The beautiful 300SL Roadster is without a sh...,"[['The, beautiful, 300SL, Roadster, is, withou..."
3,3,shmee150,CSkdxyJAk2n,2021-08-14 21:23:26,30069,113,"Photo shared by Tim - Shmee on August 14, 2021...",The breathtaking @bugatti Bolide at @thequaile...,"[['Bugatti'], ['Bolide'], ['Quail'], ['CarWeek...",['The breathtaking bugatti Bolide at thequaile...,['The breathtaking bugatti Bolide at thequaile...,"[['The, breathtaking, bugatti, Bolide, at, the..."
4,4,shmee150,CSfHSEzi3BD,2021-08-12 19:30:39,34073,140,"Photo shared by Tim - Shmee on August 12, 2021...",The new @astonmartinlagonda Valkyrie Spider ha...,"[['AstonMartin'], ['Valkyrie'], ['ValkyrieSpid...",['The new astonmartinlagonda Valkyrie Spider h...,['The new astonmartinlagonda Valkyrie Spider h...,"[['The, new, astonmartinlagonda, Valkyrie, Spi..."


### Stemming

In [16]:
# from nltk.stem.porter import *
# stemmer = PorterStemmer()

# df["tokenized"] = df["tokenized"].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
df["tokenized"] = df["tokenized"].apply(lambda x: [wordnet_lemmatizer.lemmatize(i) for i in x]) # stemming


df["tokenized"].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jchan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [['Back, at, the, wheel, of, an, SF90, With, b...
1    [['Its, a, P1, kinda, day, Out, for, a, drive,...
2    [['The, beautiful, 300SL, Roadster, is, withou...
3    [['The, breathtaking, bugatti, Bolide, at, the...
4    [['The, new, astonmartinlagonda, Valkyrie, Spi...
Name: tokenized, dtype: object

In [17]:
# df= df.explode("tokenized")
# df.reset_index(drop=True)
# df.head()

In [18]:
# #remove the square bracket
# df['tokenized']= df['tokenized'].str.strip('[]').astype(str)
# df.head()
# df2=df

In [19]:
# gbdf=df2.groupby('author').agg({'tokenized': lambda x: ' '.join(x)})
# gbdf['tokenized'][0]

In [20]:
# gbdf["tokenized"] = gbdf["tokenized"].apply(lambda x: x.split())
# gbdf.head()

# # from nltk.corpus import stopwords

# # nltk.download("stopwords")

# # stopwords_ = set(stopwords.words("english"))

# # clean_tokens = [t for t in gbdf["tokenized"] if not t in stopwords_]
# # # clean_text = " ".join(clean_tokens)
# # # print_text(clean_text)
# # clean_tokens

### Lowercase Text Manipulation

In [21]:
# storing all lower case text in a new column, "txt_lower". Note this leads to loss of
# information that a capital letter may convey, e.g. frustration or excitement.
# df['txt_lower']= df['clean_txt'].apply(lambda x: x.lower())

### Column Name Clean Up

In [22]:
df.rename(columns={"Unnamed: 0": "Dialogue ID"}, inplace=True)
df.index.name = "Sentence ID"

df.head(2)

Unnamed: 0_level_0,Dialogue ID,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt,token_txt,tokenized
Sentence ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...,['Back at the wheel of an SF90 With bannedauto...,"[['Back, at, the, wheel, of, an, SF90, With, b..."
1,1,shmee150,CSr2jQPjy59,2021-08-17 18:14:35,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri...",['Its a P1 kinda day Out for a drive in superc...,['Its a P1 kinda day Out for a drive in superc...,"[['Its, a, P1, kinda, day, Out, for, a, drive,..."


In [23]:
df.to_csv("../../resources/processed_ig_text_jc_2021-08-26.csv")

Need to remove "," , "-", "@", "#",  convert conjugations into full words, e.g. isn't.

### Sentiment Analyzer

In [24]:
df.reset_index(inplace=True)

In [25]:
ig_posts = df['text']

In [26]:
df

Unnamed: 0,Sentence ID,Dialogue ID,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt,token_txt,tokenized
0,0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...,['Back at the wheel of an SF90 With bannedauto...,"[['Back, at, the, wheel, of, an, SF90, With, b..."
1,1,1,shmee150,CSr2jQPjy59,2021-08-17 18:14:35,22143,100,"Photo shared by Tim - Shmee on August 17, 2021...",It's a P1 kinda day! Out for a drive in @super...,"[['McLaren'], ['P1'], ['McLarenP1'], ['testdri...",['Its a P1 kinda day Out for a drive in superc...,['Its a P1 kinda day Out for a drive in superc...,"[['Its, a, P1, kinda, day, Out, for, a, drive,..."
2,2,2,shmee150,CSpWzdxJIIV,2021-08-16 18:58:41,21606,130,"Photo shared by Tim - Shmee on August 16, 2021...",The beautiful 300SL Roadster is without a shad...,"[['Mercedes'], ['300SL'], ['PebbleBeach'], ['C...",['The beautiful 300SL Roadster is without a sh...,['The beautiful 300SL Roadster is without a sh...,"[['The, beautiful, 300SL, Roadster, is, withou..."
3,3,3,shmee150,CSkdxyJAk2n,2021-08-14 21:23:26,30069,113,"Photo shared by Tim - Shmee on August 14, 2021...",The breathtaking @bugatti Bolide at @thequaile...,"[['Bugatti'], ['Bolide'], ['Quail'], ['CarWeek...",['The breathtaking bugatti Bolide at thequaile...,['The breathtaking bugatti Bolide at thequaile...,"[['The, breathtaking, bugatti, Bolide, at, the..."
4,4,4,shmee150,CSfHSEzi3BD,2021-08-12 19:30:39,34073,140,"Photo shared by Tim - Shmee on August 12, 2021...",The new @astonmartinlagonda Valkyrie Spider ha...,"[['AstonMartin'], ['Valkyrie'], ['ValkyrieSpid...",['The new astonmartinlagonda Valkyrie Spider h...,['The new astonmartinlagonda Valkyrie Spider h...,"[['The, new, astonmartinlagonda, Valkyrie, Spi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,108,108,garyvee,CS2ivLCMy5V,2021-08-21 21:53:05,24615,850,"Photo by Gary Vay-Ner-Chuk on August 21, 2021....",DonÂt get it twisted Â this is gonna be Âth...,[],['DonÂ\x92t get it twisted Â\x85 this is gonna...,['DonÂ\x92t get it twisted Â\x85,"[['DonÂ\x92t, get, it, twisted, Â\x85, this, i..."
148,108,108,garyvee,CS2ivLCMy5V,2021-08-21 21:53:05,24615,850,"Photo by Gary Vay-Ner-Chuk on August 21, 2021....",DonÂt get it twisted Â this is gonna be Âth...,[],['DonÂ\x92t get it twisted Â\x85 this is gonna...,this is gonna be Â\x93the bookÂ\x94 Â\x85 big ...,"[['DonÂ\x92t, get, it, twisted, Â\x85, this, i..."
149,109,109,garyvee,CS17uAQALn9,2021-08-21 16:14:07,29599,573,,GRATITUDE FOR WHAT YOU HAVE Versus COMPLAINING...,[],['GRATITUDE FOR WHAT YOU HAVE Versus COMPLAINI...,['GRATITUDE FOR WHAT YOU HAVE Versus COMPLAINI...,"[['GRATITUDE, FOR, WHAT, YOU, HAVE, Versus, CO..."
150,110,110,garyvee,CS0KgsvgmDM,2021-08-20 23:43:48,42001,964,,"?? YouÂve asked for it, you got it Â another...",[['OverRatedUnderRated']],[' YouÂ\x92ve asked for it you got it Â\x85 an...,[' YouÂ\x92ve asked for it you got it Â\x85 an...,"[[', YouÂ\x92ve, asked, for, it, you, got, it,..."


In [27]:
ig_posts.values[2]

"The beautiful 300SL Roadster is without a shadow of a doubt my dream classic car. Driving this particular example that's for sale with @mbclassiccenter, thanks to @mercedesbenzmuseum, while car spotting countless hypercars around Pebble Beach could not be better! Maybe one day for @theshmeemobiles, maybe one day... #Mercedes #300SL #PebbleBeach #CarWeek #testdrive #Shmee150"

In [28]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jchan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [29]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [30]:
sia.polarity_scores('Dartanion is the greatest in the world!')

{'neg': 0.0, 'neu': 0.572, 'pos': 0.428, 'compound': 0.6696}

In [31]:
df['neg'] = None
df['neu'] = None
df['pos'] = None
df['compound'] = None

for k, v in df.iterrows():
    post = v['text']
    
    try:
        pol_scores = sia.polarity_scores(post)
    except:
        print(f'Error with record {k}. Moving on...')
    
    df.loc[k,'neg'] = pol_scores['neg']
    df.loc[k,'neu'] = pol_scores['neu']
    df.loc[k,'pos'] = pol_scores['pos']
    df.loc[k,'compound'] = pol_scores['compound']

Error with record 19. Moving on...


In [32]:
df.head(1)

Unnamed: 0,Sentence ID,Dialogue ID,author,shortcode,timestamp,likes,comments,caption,text,Hash_tag2,clean_txt,token_txt,tokenized,neg,neu,pos,compound
0,0,0,shmee150,CSzoxcyrzj2,2021-08-20 18:48:06,19080,49,"Photo shared by Tim - Shmee on August 20, 2021...",Back at the wheel of an SF90! With @bannedauto...,"[['Ferrari'], ['SF90'], ['futureshmeemobile'],...",['Back at the wheel of an SF90 With bannedauto...,['Back at the wheel of an SF90 With bannedauto...,"[['Back, at, the, wheel, of, an, SF90, With, b...",0.027,0.801,0.172,0.9379


In [40]:

df.Hash_tag2 = df.Hash_tag2.astype('object')
df.dtypes


Sentence ID             int64
Dialogue ID             int64
author                 object
shortcode              object
timestamp      datetime64[ns]
likes                   int64
comments                int64
caption                string
text                   string
Hash_tag2              object
clean_txt              string
token_txt              object
tokenized              object
neg                    object
neu                    object
pos                    object
compound               object
dtype: object

In [45]:
# ig_perf_summary = new_df.groupby('Author')['Likes', 'Comments', 'Neg', 'neu', 'Pos', 'Compound'].sum().reset_index()
ig_hashtag_summary = df.groupby('author')['Hash_tag2'].sum()

ig_hashtag_summary.head(25)

ig_hashtag_summary.to_csv("../../resources/ig_hashtag_summary_jc_2021-08-26.csv")


In [46]:
df.to_csv("../../resources/sentiment_jc_2021-08-26.csv")