## Text Processing for NLP

In [1]:
import pandas as pd 
import numpy as np 
import spacy

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [3]:
df = pd.read_csv(r"D:\Deep Learning\NLP\twitter4000.csv")

In [4]:
df.head()

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


Positive = 1 
Negative  = 0

In [5]:
df.shape

(4000, 2)

In [6]:
df["sentiment"].value_counts()

1    2000
0    2000
Name: sentiment, dtype: int64

## Word and Character Count

In [7]:
df["word_count"] = df["twitts"].apply(lambda x: len(str(x).split()))

In [8]:
df.head()

Unnamed: 0,twitts,sentiment,word_count
0,is bored and wants to watch a movie any sugge...,0,10
1,back in miami. waiting to unboard ship,0,7
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12
3,ughhh i am so tired blahhhhhhhhh,0,6
4,@mandagoforth me bad! It's funny though. Zacha...,0,26


In [9]:
df["word_count"].max(),df["word_count"].min()

(32, 1)

 Only 1 Word 

In [10]:
df[df["word_count"]==1]

Unnamed: 0,twitts,sentiment,word_count
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


In [11]:
df["char_counts"] = df["twitts"].apply(lambda x:len(x))

In [12]:
df.head()

Unnamed: 0,twitts,sentiment,word_count,char_counts
0,is bored and wants to watch a movie any sugge...,0,10,53
1,back in miami. waiting to unboard ship,0,7,40
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,66
3,ughhh i am so tired blahhhhhhhhh,0,6,33
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,142


## Average Word Length Per Tweet

In [13]:
df['Avg_Word_Len'] = df["char_counts"]/df["word_count"]

In [14]:
df.head()

Unnamed: 0,twitts,sentiment,word_count,char_counts,Avg_Word_Len
0,is bored and wants to watch a movie any sugge...,0,10,53,5.3
1,back in miami. waiting to unboard ship,0,7,40,5.714286
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,66,5.5
3,ughhh i am so tired blahhhhhhhhh,0,6,33,5.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,142,5.461538


## Stop Words Count

In [15]:
print(stopwords)
len(stopwords)

{'everyone', 'did', 'due', 'first', 'although', 'among', 'only', 'serious', 'beside', 'seem', 'sixty', 'from', 'became', 'through', 'latter', 'without', 'put', 'amongst', 'is', 'say', 'besides', 'must', 'against', 'keep', 'should', 'any', 'they', 'least', 'top', 'twelve', 'whom', 'mostly', 'do', 'within', 'anyhow', 'why', 'his', 'had', 'in', 'make', 'fifteen', 'herein', 'it', 'wherever', 'been', 'per', 'as', 'into', 'whither', 'which', 'many', "'ve", 'someone', 'bottom', 'see', 'well', 'but', 'because', 'doing', 'twenty', 'while', '‘d', 'out', 'here', 'eleven', 'there', 'becoming', 'already', 'those', 'via', 'hers', 'when', 'around', 'about', 're', 'beforehand', 'often', 'formerly', 'side', 'on', 'during', 'cannot', 'up', 'you', 'us', 'itself', 'at', 'much', 'your', 'were', 'seems', 'quite', '’d', 'perhaps', 'regarding', 'meanwhile', '‘ll', 'him', 'except', 'whatever', 'two', 'empty', 'none', 'whereafter', 'move', 'either', 'very', 'towards', 'fifty', 'upon', 'always', 'now', "'s", '‘v

326

In [16]:
def stopWord(s):
    count = 0
    for i in s.split():
        if i in stopwords:
            count+=1
    
    return count


df["stopword_count"] = df["twitts"].apply(lambda x:stopWord(x))

In [17]:
df.head()

Unnamed: 0,twitts,sentiment,word_count,char_counts,Avg_Word_Len,stopword_count
0,is bored and wants to watch a movie any sugge...,0,10,53,5.3,5
1,back in miami. waiting to unboard ship,0,7,40,5.714286,3
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,66,5.5,0
3,ughhh i am so tired blahhhhhhhhh,0,6,33,5.5,3
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,142,5.461538,13


## Count #Hashtags and @Mentions

In [18]:
df["hashtag_count"] = df["twitts"].apply(lambda x:len([t for t in x.split() if t.startswith('#')]))
df["mention_count"] = df["twitts"].apply(lambda x:len([t for t in x.split() if t.startswith('@')]))

In [20]:
df.head()

Unnamed: 0,twitts,sentiment,word_count,char_counts,Avg_Word_Len,stopword_count,hashtag_count,mention_count
0,is bored and wants to watch a movie any sugge...,0,10,53,5.3,5,0,0
1,back in miami. waiting to unboard ship,0,7,40,5.714286,3,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,66,5.5,0,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,33,5.5,3,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,142,5.461538,13,0,2


# Numeric Digits Present or not in Tweet

In [28]:
df["num_counts"]=df['twitts'].apply(lambda x: len([t for t in x.split() if t.isdigit()]))

In [29]:
df.head()

Unnamed: 0,twitts,sentiment,word_count,char_counts,Avg_Word_Len,stopword_count,hashtag_count,mention_count,num_counts
0,is bored and wants to watch a movie any sugge...,0,10,53,5.3,5,0,0,0
1,back in miami. waiting to unboard ship,0,7,40,5.714286,3,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,66,5.5,0,0,1,0
3,ughhh i am so tired blahhhhhhhhh,0,6,33,5.5,3,0,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,142,5.461538,13,0,2,0


# Upper Case Words Count

Useful when expressing emotions.

In [32]:
df["upper_counts"]=df['twitts'].apply(lambda x: len([t for t in x.split() if t.isupper()]))

In [33]:
df.head(2)

Unnamed: 0,twitts,sentiment,word_count,char_counts,Avg_Word_Len,stopword_count,hashtag_count,mention_count,num_counts,upper_counts
0,is bored and wants to watch a movie any sugge...,0,10,53,5.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,7,40,5.714286,3,0,0,0,0


# Lower Case Conversion

In [34]:
df['twitts'] = df['twitts'].apply(lambda x:str(x).lower())

In [35]:
df.head(1)

Unnamed: 0,twitts,sentiment,word_count,char_counts,Avg_Word_Len,stopword_count,hashtag_count,mention_count,num_counts,upper_counts
0,is bored and wants to watch a movie any sugge...,0,10,53,5.3,5,0,0,0,0
