# Pandas text processing

In [21]:
import pandas as pd
import numpy as np
import re # Standard regex library

# Read the training tweets; the file is decoded as ISO-8859-1.
data = pd.read_csv('train.csv', encoding="ISO-8859-1")

# Allow up to 1000 characters per cell so long tweets are not truncated when displayed.
pd.set_option('display.max_colwidth', 1000)

# Preview the first rows, then summarise the numeric columns.
display(data.head())
data.describe()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL friend.............
1,2,0,I missed the New Moon trailer...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! T_T


Unnamed: 0,ItemID,Sentiment
count,99989.0,99989.0
mean,50005.110042,0.564632
std,28865.894393,0.495808
min,1.0,0.0
25%,25009.0,0.0
50%,50006.0,1.0
75%,75003.0,1.0
max,100000.0,1.0


# String methods
### pandas.Series.str methods.
#### With these methods you can manipulate the text of each row in a column.

### Strip leading and trailing whitespace and newlines:

In [22]:
# Remove leading/trailing whitespace from every tweet, then write the result back.
stripped_text = data['SentimentText'].str.strip()
data['SentimentText'] = stripped_text
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL friend.............
1,2,0,I missed the New Moon trailer...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! T_T


### Make all text upper-case or lower-case:

In [23]:
# Upper-case every tweet and preview the result.
upper_text = data['SentimentText'].str.upper()
data['SentimentText'] = upper_text
display(data.head(2))

# Lower-case every tweet; the column stays lower-cased after this cell.
lower_text = data['SentimentText'].str.lower()
data['SentimentText'] = lower_text
data.head(2)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,IS SO SAD FOR MY APL FRIEND.............
1,2,0,I MISSED THE NEW MOON TRAILER...


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............
1,2,0,i missed the new moon trailer...


### Find the length of the string of each row:

In [24]:
# Number of characters in each tweet.
lengths = data['SentimentText'].str.len()
lengths.head(3)

0    40
1    32
2    23
Name: SentimentText, dtype: int64

### Split text by a character, in this case by spaces. Returns a list of individual words for each row:

In [25]:
# Split each tweet on single spaces; every row becomes a list of tokens.
tokens_per_row = data['SentimentText'].str.split(' ')

# assign() returns a new frame, so `data` keeps its original string column.
data_split = data.assign(SentimentText=tokens_per_row)
data_split.head(2)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,"[is, so, sad, for, my, apl, friend.............]"
1,2,0,"[i, missed, the, new, moon, trailer...]"


### Replace all occurrences of a substring with a new one. ('') can be used to remove the substring:

In [30]:
# Work on a copy so that `data` keeps the original text.
data_replaced = data.copy()

# 'is ' is meant as a literal substring, so regex=False is passed explicitly:
# the default value of `regex` differs between pandas versions.
data_replaced['SentimentText'] = data['SentimentText'].str.replace('is ', '', regex=False)

# Only the last expression of a cell is rendered automatically, so the
# replaced frame has to be shown with display().
display(data_replaced.head(2))

# `data` itself is unchanged by the replacement above.
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............
1,2,0,i missed the new moon trailer...
2,3,1,omg its already 7:30 :o
3,4,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! t_t


#### Return a boolean array indicating whether each string contains a pattern/regex:

In [7]:
# Boolean mask: True for every tweet that contains the pattern 'is '.
booleanArray = data['SentimentText'].str.contains('is ')
print("Boolean Array:\n", booleanArray[:4])

# The mask can be used directly as a row filter on the data frame.

# Keep only the rows where the mask is True:
data_copy1 = data.loc[booleanArray]
display(data_copy1.head())

# (~) inverts the mask, so here only the rows where the mask is False are kept:
data_copy2 = data.loc[~booleanArray]
data_copy2.head()

Boolean Array:
 0     True
1    False
2    False
3     True
Name: SentimentText, dtype: bool


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............
3,4,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! t_t
12,13,0,this weekend has sucked so far
15,16,0,&lt;-------- this is the way i feel right now...


Unnamed: 0,ItemID,Sentiment,SentimentText
1,2,0,i missed the new moon trailer...
2,3,1,omg its already 7:30 :o
5,6,0,or i just worry too much?
6,7,1,juuuuuuuuuuuuuuuuussssst chillin!!
7,8,0,sunny again work tomorrow :-| tv tonight


### These methods can also be chained together:

In [8]:
# Each .str call returns a new Series, so the steps can be chained:
# strip -> lower-case -> test for the pattern 'me'.
boolean_array = (
    data['SentimentText']
    .str.strip()
    .str.lower()
    .str.contains('me')
)
boolean_array.head(5)

0    False
1    False
2    False
3    False
4     True
Name: SentimentText, dtype: bool

# Data frame methods

### Methods similar to the ones shown above can also be used directly on the dataframe:

In [8]:
# Replace every match of the regex 'omg' with a phrase.
# The result is assigned back to the column instead of using inplace=True:
# an in-place call on a selected column is chained assignment and is not
# guaranteed to modify `data` (it has no effect under pandas copy-on-write).
# Note: the pattern has no word boundaries, so it also matches inside longer
# words such as 'omgaga'.
data['SentimentText'] = data['SentimentText'].replace(
    to_replace=r'omg',
    value='I cant believe it',
    regex=True,
)
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............
1,2,0,i missed the new moon trailer...
2,3,1,I cant believe it its already 7:30 :o
3,4,0,.. I cant believe itaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! t_t


#### You can convert a dataframe to a numpy array with `.values`:

In [10]:
# Whole data frame as a numpy array (one row per tweet).
numpy_data = data.values
print(numpy_data)

# Or just one column
print("\nSentimentText column:")
sentiment_column = data['SentimentText']
numpy_data = sentiment_column.values
numpy_data

[[1 0 'is so sad for my apl friend.............']
 [2 0 'i missed the new moon trailer...']
 [3 1 'I cant believe it its already 7:30 :o']
 ...
 [99998 0 '@cupcake_2120 ya i thought so']
 [99999 1 "@cupcake_dollie yes. yes. i'm glad you had more fun with me."]
 [100000 1 '@cupcake_kayla haha yes you do']]

SentimentText column:


array(['is so sad for my apl friend.............',
       'i missed the new moon trailer...',
       'I cant believe it its already 7:30 :o', ...,
       '@cupcake_2120 ya i thought so',
       "@cupcake_dollie yes. yes. i'm glad you had more fun with me.",
       '@cupcake_kayla haha yes you do'], dtype=object)

# How to use regex with Pandas

### You can use regex on methods such as replace() and contains() by setting regex = True:

In [31]:
# Series.str.replace with regex=True: every single digit is replaced by the placeholder.
digit_pattern = r'[0-9]'
data_replaced = data.assign(
    SentimentText=data['SentimentText'].str.replace(digit_pattern, 'NUMBER_OCCURRENCE', regex=True)
)
display(data_replaced.head(5))

# Series.str.contains with regex=True: True for rows that contain at least one digit.
boolean_array = data['SentimentText'].str.contains(digit_pattern, regex=True)
boolean_array[:5]

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............
1,2,0,i missed the new moon trailer...
2,3,1,omg its already NUMBER_OCCURRENCE:NUMBER_OCCURRENCENUMBER_OCCURRENCE :o
3,4,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since NUMBER_OCCURRENCENUMBER_OCCURRENCE.. i was suposed NUMBER_OCCURRENCE just get a crown put on (NUMBER_OCCURRENCENUMBER_OCCURRENCEmins)...
4,5,0,i think mi bf is cheating on me!!! t_t


0    False
1    False
2     True
3     True
4    False
Name: SentimentText, dtype: bool

# Basic regex

https://www.machinelearningplus.com/python/python-regex-tutorial-examples/

https://www.datacamp.com/community/tutorials/python-regular-expression-tutorial

# Functions and lambda functions in Pandas

### You can use functions or lambda functions on every row in a data frame with the apply() method:

In [33]:
# Two independent copies of `data`, one to hold the result of each approach.
data_from_function, data_from_lambda = data.copy(), data.copy()

# With function
def func(text):
    """Return ``text`` with the marker 'replaced text function' appended.

    The marker is concatenated directly, without a separating space.
    """
    suffix = 'replaced text function'
    return text + suffix
    
# apply() calls `func` once per tweet and collects the results in a new Series.
text_via_function = data['SentimentText'].apply(func)
data_from_function['SentimentText'] = text_via_function
display(data_from_function.head())

# With lambda function
text_via_lambda = data['SentimentText'].apply(lambda text: text + 'replaced text lambda')
data_from_lambda['SentimentText'] = text_via_lambda
data_from_lambda.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............replaced text function
1,2,0,i missed the new moon trailer...replaced text function
2,3,1,omg its already 7:30 :oreplaced text function
3,4,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...replaced text function
4,5,0,i think mi bf is cheating on me!!! t_treplaced text function


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,replaced text lambda
1,2,0,replaced text lambda
2,3,1,replaced text lambda
3,4,0,replaced text lambda
4,5,0,replaced text lambda


### Three different ways to find all rows where the first word is a hashtag:

In [13]:
# With str.contains()
# '^#' is anchored at the start of the string, so a row matches only when its
# first character is '#'. Nothing after the '#' needs to be matched; a trailing
# '.*$' would reject tweets that contain a newline, because '.' does not match
# newlines.
boolean_array = data['SentimentText'].str.contains(r'^#', regex=True)

# Filter first and copy afterwards: the copy makes the result an independent
# frame, so assigning to its columns later does not raise SettingWithCopyWarning.
data_with_hashtags = data[boolean_array].copy()
display(data_with_hashtags.head(2))

def find_all_rows_with_hastag(x):
    """Return ``x`` unchanged if it starts with '#', otherwise ``None``."""
    return x if x.startswith('#') else None
    
# With a named function
# The function returns None for tweets that do not start with '#', so notna()
# turns its output into a row mask. Filtering `data` with that mask makes this
# approach select the rows on its own instead of relying on an earlier filter.
hashtag_text = data['SentimentText'].apply(find_all_rows_with_hastag)
data_with_hashtags = data[hashtag_text.notna()].copy()
display(data_with_hashtags.head(2))

# With lambda
# The lambda is a plain True/False predicate, so its output is the mask itself.
starts_with_hashtag = data['SentimentText'].apply(lambda x: x.startswith('#'))
data_with_hashtags = data[starts_with_hashtag].copy()
data_with_hashtags.head(2)

Unnamed: 0,ItemID,Sentiment,SentimentText
131,132,0,#3turnoffwords this shit sucks
132,133,0,#asylm j2 panel is over. guess it's back to normal life.


Unnamed: 0,ItemID,Sentiment,SentimentText
131,132,0,#3turnoffwords this shit sucks
132,133,0,#asylm j2 panel is over. guess it's back to normal life.


Unnamed: 0,ItemID,Sentiment,SentimentText
131,132,0,#3turnoffwords this shit sucks
132,133,0,#asylm j2 panel is over. guess it's back to normal life.


### Check with list of words:

In [14]:
# Preview the current state of `data` (first five rows).
data.head(n=5)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend.............
1,2,0,i missed the new moon trailer...
2,3,1,I cant believe it its already 7:30 :o
3,4,0,.. I cant believe itaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...
4,5,0,i think mi bf is cheating on me!!! t_t


In [35]:
# Independent (deep) copy that will receive the filtered text.
data_with_words_removed = data.copy(deep=True)

# Words to drop from every tweet.
words = ['is', 'for', 'the', 'it', 'a']

# With function
def func_yo(text):
    """Return ``text`` without the whitespace-separated tokens listed in ``words``.

    The remaining tokens are re-joined with single spaces. Only exact token
    matches are dropped, so 'its' or 'is.' are kept.
    """
    kept_tokens = [token for token in text.split() if token not in words]
    return ' '.join(kept_tokens)

# Apply the named function to every tweet.
text_via_function = data['SentimentText'].apply(func_yo)
data_with_words_removed['SentimentText'] = text_via_function
display(data_with_words_removed.head())

# With lambda
text_via_lambda = data['SentimentText'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in words)
)
data_with_words_removed['SentimentText'] = text_via_lambda
data_with_words_removed.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,so sad my apl friend.............
1,2,0,i missed new moon trailer...
2,3,1,omg its already 7:30 :o
3,4,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get crown put on (30mins)...
4,5,0,i think mi bf cheating on me!!! t_t


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,so sad my apl friend.............
1,2,0,i missed new moon trailer...
2,3,1,omg its already 7:30 :o
3,4,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get crown put on (30mins)...
4,5,0,i think mi bf cheating on me!!! t_t


In [16]:

# A set gives constant-time membership tests on average, whereas a list has to
# be scanned element by element.
words = {'is', 'for', 'the', 'it', 'a'}

# With function
def func_yo(text):
    """Keep only the tokens of ``text`` that are in ``words``.

    Each kept token is followed by the marker 'heio', and all pieces are
    joined with 'abc'. Returns an empty string when no token matches.
    """
    pieces = [
        piece
        for token in text.split()
        if token in words  # one set lookup per token
        for piece in (token, 'heio')
    ]
    return 'abc'.join(pieces)

# Apply the named function to every tweet.
text_via_function = data['SentimentText'].apply(func_yo)
data_with_words_removed['SentimentText'] = text_via_function
display(data_with_words_removed.head())

# With lambda
text_via_lambda = data['SentimentText'].apply(
    lambda text: 'abc'.join(token for token in text.split() if token in words)
)
data_with_words_removed['SentimentText'] = text_via_lambda
data_with_words_removed.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,isabcheioabcforabcheio
1,2,0,theabcheio
2,3,1,itabcheio
3,4,0,aabcheio
4,5,0,isabcheio


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,isabcfor
1,2,0,the
2,3,1,it
3,4,0,a
4,5,0,is
