In [81]:
import pandas as pd

train = pd.read_csv("data/train_E6oV3lV.csv")

## Basic Feature Extraction

### Number of Words

In [82]:
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train.head()

Unnamed: 0,id,label,tweet,word_count
0,1,0,@user when a father is dysfunctional and is s...,21
1,2,0,@user @user thanks for #lyft credit i can't us...,22
2,3,0,bihday your majesty,5
3,4,0,#model i love u take with u all the time in ...,17
4,5,0,factsguide: society now #motivation,8


### Number of characters

In [83]:
train['char_count'] = train['tweet'].str.len()
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count
0,1,0,@user when a father is dysfunctional and is s...,21,102
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122
2,3,0,bihday your majesty,5,21
3,4,0,#model i love u take with u all the time in ...,17,86
4,5,0,factsguide: society now #motivation,8,39


### Average Word Length

In [84]:
def mean_words(sentence):
    words = sentence.split()
    return sum(len(word) for word in words) / len(words)

train['avg_words'] = train['tweet'].apply(lambda t: mean_words(t))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words
0,1,0,@user when a father is dysfunctional and is s...,21,102,4.555556
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789
2,3,0,bihday your majesty,5,21,5.666667
3,4,0,#model i love u take with u all the time in ...,17,86,4.928571
4,5,0,factsguide: society now #motivation,8,39,8.0


### Number of stopwords

In [85]:
from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')

train['stopwords'] = train['tweet'].apply(lambda t: len([t for t in t.split() if t in en_stopwords]))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords
0,1,0,@user when a father is dysfunctional and is s...,21,102,4.555556,10
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789,5
2,3,0,bihday your majesty,5,21,5.666667,1
3,4,0,#model i love u take with u all the time in ...,17,86,4.928571,5
4,5,0,factsguide: society now #motivation,8,39,8.0,1


### Number of special characters

In [86]:
train['hashtags'] = train['tweet'].apply(lambda t: len([t for t in t.split() if t.startswith("#")]))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords,hashtags
0,1,0,@user when a father is dysfunctional and is s...,21,102,4.555556,10,1
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789,5,3
2,3,0,bihday your majesty,5,21,5.666667,1,0
3,4,0,#model i love u take with u all the time in ...,17,86,4.928571,5,1
4,5,0,factsguide: society now #motivation,8,39,8.0,1,1


### Number of numerics

In [87]:
train['numerics'] = train['tweet'].apply(lambda t: len([t for t in t.split() if t.isdigit()]))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords,hashtags,numerics
0,1,0,@user when a father is dysfunctional and is s...,21,102,4.555556,10,1,0
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789,5,3,0
2,3,0,bihday your majesty,5,21,5.666667,1,0,0
3,4,0,#model i love u take with u all the time in ...,17,86,4.928571,5,1,0
4,5,0,factsguide: society now #motivation,8,39,8.0,1,1,0


### Number of Uppercase words

In [88]:
train['upper'] = train['tweet'].apply(lambda t: len([t for t in t.split() if t.isupper()]))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords,hashtags,numerics,upper
0,1,0,@user when a father is dysfunctional and is s...,21,102,4.555556,10,1,0,0
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789,5,3,0,0
2,3,0,bihday your majesty,5,21,5.666667,1,0,0,0
3,4,0,#model i love u take with u all the time in ...,17,86,4.928571,5,1,0,0
4,5,0,factsguide: society now #motivation,8,39,8.0,1,1,0,0


## Basic Pre-processing

### Lower case

In [89]:
train['tweet'] = train['tweet'].apply(lambda t: " ".join(t.lower() for t in t.split()))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords,hashtags,numerics,upper
0,1,0,@user when a father is dysfunctional and is so...,21,102,4.555556,10,1,0,0
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789,5,3,0,0
2,3,0,bihday your majesty,5,21,5.666667,1,0,0,0
3,4,0,#model i love u take with u all the time in ur...,17,86,4.928571,5,1,0,0
4,5,0,factsguide: society now #motivation,8,39,8.0,1,1,0,0


### Removing Punctuation

In [92]:
train['tweet'] = train['tweet'].str.replace('[^\w\s]', '', regex=True)
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords,hashtags,numerics,upper
0,1,0,user when a father is dysfunctional and is so ...,21,102,4.555556,10,1,0,0
1,2,0,user user thanks for lyft credit i cant use ca...,22,122,5.315789,5,3,0,0
2,3,0,bihday your majesty,5,21,5.666667,1,0,0,0
3,4,0,model i love u take with u all the time in urð...,17,86,4.928571,5,1,0,0
4,5,0,factsguide society now motivation,8,39,8.0,1,1,0,0


### Removal of Stop Words

In [93]:
train['tweet'] = train['tweet'].apply(lambda t: " ".join(t for t in t.split() if t not in en_stopwords))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_words,stopwords,hashtags,numerics,upper
0,1,0,user father dysfunctional selfish drags kids d...,21,102,4.555556,10,1,0,0
1,2,0,user user thanks lyft credit cant use cause do...,22,122,5.315789,5,3,0,0
2,3,0,bihday majesty,5,21,5.666667,1,0,0,0
3,4,0,model love u take u time urð ðððð ððð,17,86,4.928571,5,1,0,0
4,5,0,factsguide society motivation,8,39,8.0,1,1,0,0


### Common word removal

In [94]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda t: " ".join(t for t in t.split() if t not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Rare words removal

In [95]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda t: " ".join(t for t in t.split() if t not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Spelling correction

In [99]:
from textblob import TextBlob

train['tweet'] = train['tweet'].apply(lambda t: str(TextBlob(t)))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Stemming

In [100]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
train['tweet'] = train['tweet'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))
train['tweet'].head()

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

### Lemmatization