In [1]:
import pandas as pd  
import numpy as np  
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
df = pd.read_csv('../data/twitter16m.csv', encoding='latin1', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
df = df[[5, 0]]

In [4]:
df.columns = ['tweets', 'sentiments']
df.head()

Unnamed: 0,tweets,sentiments
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [5]:
df['sentiments'].value_counts()

0    800000
4    800000
Name: sentiments, dtype: int64

In [6]:
sent_map = {0: 'negative', 4: 'positive'}

In [7]:
# word counts
df['word_counts'] = df['tweets'].apply(lambda x: len(str(x).split()))

In [8]:
df.head()

Unnamed: 0,tweets,sentiments,word_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19
1,is upset that he can't update his Facebook by ...,0,21
2,@Kenichan I dived many times for the ball. Man...,0,18
3,my whole body feels itchy and like its on fire,0,10
4,"@nationwideclass no, it's not behaving at all....",0,21


In [9]:
# character counts
df['char_counts'] = df['tweets'].apply(lambda x: len(x))

In [10]:
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115
1,is upset that he can't update his Facebook by ...,0,21,111
2,@Kenichan I dived many times for the ball. Man...,0,18,89
3,my whole body feels itchy and like its on fire,0,10,47
4,"@nationwideclass no, it's not behaving at all....",0,21,111


In [11]:
# Average word length
def get_avg_word_len(x):
    words = x.split()
    word_len = 0
    for word in words:
        word_len = word_len + len(word)
    return word_len/len(words)
    
df['avg_word_len'] = df['tweets'].apply(lambda x: get_avg_word_len(x))

In [12]:
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts,avg_word_len
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444
3,my whole body feels itchy and like its on fire,0,10,47,3.7
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714


In [13]:
# Stop words count
print(STOP_WORDS)

{'become', 'latterly', 'out', 'get', 'than', 'below', 'hers', 'what', 'other', 'being', 'my', 'two', 'nine', 'ten', 'would', 'n‘t', 'three', 'wherever', 'except', 'moreover', 'less', 'herein', 'such', 'from', 'he', 'thereupon', 'everything', 'side', 'up', 'anywhere', 'yourself', 'over', 'put', 'elsewhere', 'six', 'meanwhile', 'almost', 'done', 'she', 'show', 'top', 'are', 'either', 'himself', '’d', 'much', 'thence', 'eight', 'twenty', 'cannot', 'else', 'anyhow', 'empty', 'nevertheless', 'thereafter', 'before', 'since', 'so', 'beforehand', 'hereupon', 'our', 'bottom', 'namely', 'where', 'whenever', 'last', 'because', 'throughout', 'how', 'once', 'too', 'nowhere', 'using', 'very', 'least', "'re", 'unless', 'ca', 'others', 'became', 'front', 'eleven', 'it', 'indeed', 'first', "'ve", 'enough', 'off', 'why', 'even', 'well', 'sometimes', 'here', 'otherwise', 'mine', 'to', 'one', 'another', 'third', 'per', 'regarding', 'there', 'neither', 'no', 'about', 'someone', 'nothing', 'after', 'own', '

In [14]:
df['stop_words_len'] = df['tweets'].apply(lambda x: len([t for t in x.split() if t in STOP_WORDS]))

In [15]:
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts,avg_word_len,stop_words_len
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10


In [16]:
# count #hash tags and @mentions
x = 'this is #hashtag and this is @mention'
[t for t in x.split() if t.startswith('#')]

['#hashtag']

In [17]:
df['hashtags_count'] = df['tweets'].apply(lambda x: len([t for t in x.split() if t.startswith('#')]))
df['mentions_count'] = df['tweets'].apply(lambda x: len([t for t in x.split() if t.startswith('@')]))

In [18]:
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4,0,1
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9,0,0
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1


In [19]:
# if numeric digits are present in tweets 
df['numeric_count'] = df['tweets'].apply(lambda x: len([t for t in x.split() if t .isdigit()]))

In [20]:
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4,0,1,0
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9,0,0,0
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7,0,1,0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1,0


In [21]:
# upper case words count
df['upper_counts'] = df['tweets'].apply(lambda x: len([t for t in x.split() if t.isupper() and len(x)>3]))
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4,0,1,0,1
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9,0,0,0,0
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7,0,1,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1,0,1


In [22]:
# Preprocessing and cleaning
# Lower case convertion
df['tweets'] = df['tweets'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,tweets,sentiments,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_counts
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0,19,115,5.052632,4,0,1,0,1
1,is upset that he can't update his facebook by ...,0,21,111,4.285714,9,0,0,0,0
2,@kenichan i dived many times for the ball. man...,0,18,89,3.944444,7,0,1,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1,0,1


In [23]:
# contraction to expansion 
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [24]:
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x

In [25]:
x = "i don't know what you want, can't, he'll, i'd"
cont_to_exp(x)

'i do not know what you want, cannot, he will, i would'

In [26]:
%%time 
df['tweets'] = df['tweets'].apply(lambda x: cont_to_exp(x))

Wall time: 18.5 s
