In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [3]:
# Load the tweet dataset from CSV, decoding the file as latin1.
df = pd.read_csv("twitter4000.csv",encoding="latin1")

In [4]:
df.head()

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [5]:
# Number of rows per sentiment label (class balance check).
df.sentiment.value_counts()

1    2000
0    2000
Name: sentiment, dtype: int64

### Word Counts
- Number of words (whitespace-separated tokens) in each tweet

In [6]:
# Number of whitespace-separated tokens in each tweet.
df['word_counts'] = df['twitts'].map(lambda tweet: len(tweet.split()))
df.head()

Unnamed: 0,twitts,sentiment,word_counts
0,is bored and wants to watch a movie any sugge...,0,10
1,back in miami. waiting to unboard ship,0,7
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12
3,ughhh i am so tired blahhhhhhhhh,0,6
4,@mandagoforth me bad! It's funny though. Zacha...,0,26


In [7]:
df['word_counts'].min()

1

In [8]:
df['word_counts'].max()

32

In [9]:
# Show the rows whose tweet consists of a single whitespace-separated token.
df[df.word_counts==1]

Unnamed: 0,twitts,sentiment,word_counts
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


### Character Counts

In [10]:
def char_counts(x):
    """Return the number of non-whitespace characters in ``x``.

    The text is split on runs of whitespace and the lengths of the
    resulting tokens are added together, so spaces, tabs and newlines
    do not contribute to the count.
    """
    return sum(len(token) for token in x.split())

In [11]:
# Non-whitespace character count of each tweet; the function is passed
# directly to apply, no lambda wrapper is needed.
df["char_counts"] = df['twitts'].apply(char_counts)

In [12]:
df.head() 

Unnamed: 0,twitts,sentiment,word_counts,char_counts
0,is bored and wants to watch a movie any sugge...,0,10,43
1,back in miami. waiting to unboard ship,0,7,32
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54
3,ughhh i am so tired blahhhhhhhhh,0,6,27
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116


### Average Word Length

In [13]:
# Worked example: 10 non-space characters / 2 words gives an average word length of 5.
x = "hello world" # 10/2 = 5; 10 total chars; 2 total words

In [14]:
# Average word length = non-whitespace characters divided by word count, per tweet.
df['average_word_length'] = df['char_counts'].div(df['word_counts'])
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3
1,back in miami. waiting to unboard ship,0,7,32,4.571429
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538


### Stop words count
- Stop words occur frequently in a corpus.
- These words usually carry little of the sentence's meaning.
- We can therefore often remove stop words.
- In sentiment analysis, however, "not" is an important word, so stop words should be removed with care.

In [15]:
# Display the stop-word collection used for the counts in this section.
print(stopwords) # imported from spacy

{'but', 'sixty', 'anyway', 'along', 'than', 'please', 'first', 'become', 'will', 'show', 'two', 'hereby', 'am', 'had', 'been', 'others', 'around', 'and', 'beyond', 'herself', 'via', 'none', 'even', 'just', 'see', 'meanwhile', '‘re', 'only', 'five', 'less', 'one', 'moreover', 'whither', 'many', 'over', 'six', 'regarding', 'otherwise', 'whose', '’s', 'ever', 'when', 'yours', 'really', 'in', 'above', 'another', 'fifteen', 'hers', 'forty', 'these', 'amongst', 'whereupon', 'other', 'thereby', 'empty', 'hereafter', 'except', 'about', 'most', 'various', 'much', "'d", 'whenever', 'from', 'hence', 'say', 'then', 'everyone', 'put', 'nothing', 'too', 'least', 'latter', 'is', 'where', 'yourself', 'n’t', 'nine', 'your', 'into', 'a', 'were', 'someone', 'nevertheless', 'on', 'through', 'has', 'if', 'upon', 'same', 'at', 'also', 'may', 'themselves', 'they', 'eleven', 'front', 'behind', 'doing', 'we', 'almost', 'always', 'move', 'might', 'twenty', 'per', 'across', 'between', 'twelve', 'whatever', 'beca

In [16]:
len(stopwords)

326

In [17]:
# Sample sentence used to demonstrate stop-word counting on whitespace-split tokens.
x = "this is the text data"
x.split()

['this', 'is', 'the', 'text', 'data']

In [18]:
# How many tokens of the sample sentence are stop words.
sum(1 for token in x.split() if token in stopwords)

3

In [19]:
# Count the stop words in each tweet.
# spaCy's STOP_WORDS entries are lowercase, and the tweets have not been
# lowercased yet at this point in the notebook, so each token is lowercased
# before the membership test; otherwise capitalised stop words such as "I"
# or "The" would be silently missed.
df['stop_words_len'] = df['twitts'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.lower() in stopwords)
)
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13


### Count # and @

In [20]:
# Sample sentence for demonstrating prefix-based token counting.
# NOTE(review): it contains a '#'-prefixed token but no '@'-prefixed one, even
# though this section also counts mentions — confirm whether "@mention" was intended.
x = 'this is a #hastag and this is mention'
x.split()

['this', 'is', 'a', '#hastag', 'and', 'this', 'is', 'mention']

In [21]:
# Per-tweet count of tokens starting with '#' (hashtags) ...
df["hashtagcount"] = df['twitts'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.startswith("#"))
)
# ... and of tokens starting with '@' (mentions).
df["@count"] = df['twitts'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.startswith("@"))
)
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13,0,2


### Numeric token count (whether digit-only tokens are present)

In [22]:
# Per-tweet count of tokens made up entirely of digits (e.g. "2009", but not "2nd").
df["Numeric_counts"] = df['twitts'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.isdigit())
)

In [23]:
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13,0,2,0


### Upper Case Word Count

In [25]:
# Per-tweet count of tokens for which str.isupper() is true, i.e. every cased
# character in the token is uppercase.
df["upper_words_count"] = df['twitts'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.isupper())
)
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13,0,2,0,0


### Preprocessing And Cleaning

### Lower case conversion

In [26]:
# Normalise every tweet to lowercase (values are passed through str() first).
# This overwrites the 'twitts' column, so the original casing is no longer
# available in df after this cell runs.
df['twitts'] = df['twitts'].map(lambda tweet: str(tweet).lower())
df.head()

Unnamed: 0,twitts,sentiment,word_counts,char_counts,average_word_length,stop_words_len,hashtagcount,@count,Numeric_counts,upper_words_count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3,0,0,0,0
4,@mandagoforth me bad! it's funny though. zacha...,0,26,116,4.461538,13,0,2,0,0
