In [65]:
import pandas as pd
import numpy as np
import spacy

In [66]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [67]:
df = pd.read_csv("twitter4000.csv",encoding="latin1")

In [68]:
df.head()

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [69]:
df.sentiment.value_counts()

1    2000
0    2000
Name: sentiment, dtype: int64

### Word Counts
- len of words in a sentence

In [70]:
df['word_counts'] = df.twitts.apply(lambda x:len(x.split()))
df.head()

Unnamed: 0,twitts,sentiment,word_counts
0,is bored and wants to watch a movie any sugge...,0,10
1,back in miami. waiting to unboard ship,0,7
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12
3,ughhh i am so tired blahhhhhhhhh,0,6
4,@mandagoforth me bad! It's funny though. Zacha...,0,26


In [71]:
df['word_counts'].min()

1

In [72]:
df['word_counts'].max()

32

In [73]:
df[df.word_counts==1]

Unnamed: 0,twitts,sentiment,word_counts
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


### Character Counts

In [74]:
def char_counts(x):
    s = x.split()
    x = "".join(s)
    return len(x)

In [75]:
df["char_counts"] = df['twitts'].apply(lambda x:char_counts(x))

In [76]:
df.head() 

Unnamed: 0,twitts,sentiment,word_counts,char_counts
0,is bored and wants to watch a movie any sugge...,0,10,43
1,back in miami. waiting to unboard ship,0,7,32
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54
3,ughhh i am so tired blahhhhhhhhh,0,6,27
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116


### Average Word Length

In [77]:
x = "hello world" # 10/2 = 5; 10 total chars; 2 total words

In [78]:
df['average_word_length'] =  df['char_counts']/ df['word_counts']
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3
1,back in miami. waiting to unboard ship,0,7,32,4.571429
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538


### Stop words count
- Stop words occur frequently in corpus
- These words do not represent the context for the sentence
- we can remove stopwords.
- in sentiment analysis not is a important word.

In [79]:
print(stopwords) # imported from spacy

{'somehow', 'anyway', 'who', 'therefore', '‘re', "'ll", 'whereupon', 'were', 'sometimes', 'eight', 'mostly', 'few', 'hers', 'whereafter', 'which', 'be', 'top', "n't", 'not', 'myself', 'without', '‘ll', 'become', 'however', 'a', 'together', 'toward', 'always', 'get', 'due', 'throughout', 'nothing', 'around', 'hundred', "'d", 'towards', 'noone', 'her', 'eleven', 'three', 'used', 'except', 'below', 'every', 'as', '‘ve', 'made', 'part', 'whether', 'unless', 'already', 'nevertheless', 'becomes', 'nobody', 'off', 'have', 'two', 'had', 'namely', 'mine', 'where', 'each', 'move', 'go', 'seem', 'within', 'whoever', 'after', 'or', 'itself', 'she', 'my', 'whenever', 'yet', 'under', 'thereby', 'beyond', 'keep', 'cannot', 'but', 'ever', 'once', 'again', 'doing', 'besides', 'no', 'still', 'either', 'behind', 'amount', 'us', 'among', 'regarding', 'in', 'did', 'wherever', 're', 'through', 'someone', 'to', 'yourselves', 'another', 'may', 'it', 'others', 'upon', 'alone', 'everywhere', 'least', 'using', '

In [80]:
len(stopwords)

326

In [81]:
x = "this is the text data"
x.split()

['this', 'is', 'the', 'text', 'data']

In [82]:
len([i for i in x.split() if i in stopwords])

3

In [83]:
df['stop_words_len'] = df.twitts.apply(lambda x:len([i for i in x.split() if i in stopwords]))
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13


### Count # and @

In [84]:
x = 'this is a #hastag and this is mention'
x.split()

['this', 'is', 'a', '#hastag', 'and', 'this', 'is', 'mention']

In [85]:
df["hashtagcount"] = df.twitts.apply(lambda x:len([i for i in x.split() if i.startswith("#")]))
df["@count"] = df.twitts.apply(lambda x:len([i for i in x.split() if i.startswith("@")]))
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13,0,2


### Digit is present or not

In [86]:
df["Numeric_counts"] = df.twitts.apply(lambda x: len([i for i in x.split() if i.isdigit()]))

In [87]:
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13,0,2,0


### Upper Case Word Count

In [88]:
df["upper_words_count"] = df['twitts'].apply(lambda x:len([i for i in x.split() if i.isupper()]))
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13,0,2,0,0


### Preprocessing And Cleaning

### Lower case conversion

In [89]:
df['twitts'] = df['twitts'].apply(lambda x: str(x).lower())
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0,0
4,@mandagoforth me bad! it's funny though. zacha...,0,26,116,4.461538,13,0,2,0,0


### Contraction to expansion

In [90]:
contractions = { "ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have",
# "'cause": "because",
"could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not",
"don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not","haven't": "have not",
"he'd": "he had","he'd've": "he would have","he'll": "he will","he'll've": "he will have","he's": "he is",
"how'd": "how did","how'd'y": "how do you","how'll": "how will","I'd": "I had","I'd've": "I would have",
"I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not","it'd": "it had",
"it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it has","let's": "let us",
"ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
"must've": "must have","mustn't": "must not","mustn't've": "must not have","needn't": "need not",
"needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have",
"shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have","she'd": "she would",
"she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
"should've": "should have","shouldn't": "should not","shouldn't've": "should not have","so've": "so have",
"so's": "so is","that'd": "that would","that'd've": "that would have","there'd've": "there would have",
"they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are",
"they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would","we'd've": "we would have",
"we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have","weren't": "were not",
"what'll": "what will","what'll've": "what will have","what're": "what are","what've": "what have",
"when've": "when have","where'd": "where did","where've": "where have","who'll": "who will",
"who'll've": "who will have","who's": "who is","who've": "who have","why've": "why have","will've": "will have",
"won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not",
"wouldn't've": "would not have","y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have",
"y'all're": "you all are","y'all've": "you all have","you'd": "you would","you'd've": "you would have",
"you'll": "you will","you'll've": "you will have","you're": "you are","you've": "you have"
}

In [91]:
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key,value)
        return x
    else:
        return x

In [92]:
cont_to_exp("I can't study")

'I cannot study'

In [93]:
%%timeit
df['twitts'] = df['twitts'].apply(lambda x : cont_to_exp(x))

128 ms ± 6.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [94]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count
1666,@winecountrydog yes,0,2,18,9.0,0,0,1,0,0
138,@shaaqt did to whoever was left! think @netra ...,0,16,75,4.6875,10,0,3,0,0
2475,@heritagesoftail kkkkkk i will go ask her,1,7,35,5.0,3,0,1,0,1
254,is sick damnit! y does my body hate me??? t-t,0,10,36,3.6,3,0,0,0,1
500,"@brandonleblanc yeah i know, my frustration is...",0,26,112,4.307692,14,0,1,0,3


### Remove Email

In [95]:
# df[df['twitts'].str.contains(".com")]

In [105]:
import re

In [106]:
x = "abc@gmail.com is abc"
re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", x)

['abc@gmail.com']

In [107]:
x = "abc@gmail.com is abc"
re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b)", x)

[]

In [111]:
df["emails"] = df['twitts'].apply(lambda x: re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", x))

In [112]:
df['email_count'] = df["emails"].apply(lambda x: len(x))
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count,emails,email_count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0,0,[],0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0,0,[],0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0,1,[],0
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0,0,[],0
4,@mandagoforth me bad! it has funny though. zac...,0,26,116,4.461538,13,0,2,0,0,[],0


In [113]:
df[df.email_count>0]

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count,emails,email_count
3713,@securerecs arghh me please markbradbury_16@h...,1,5,51,10.2,0,0,1,0,0,[markbradbury_16@hotmail.com],1


In [116]:
x = "abc@gmail.com is abc"
re.sub("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)","" ,x)

' is abc'

In [117]:
df['twitts'] = df['twitts'].apply(lambda x: re.sub("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)","" ,x) )

In [118]:
df.iloc[3713]['twitts']

'@securerecs arghh me please  '