In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, os, sys
# pd.set_option('display.max_colwidth', 200)

In [2]:
df = pd.read_csv("Datasets/train_tweets.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df.shape

(31962, 3)

In [4]:
df.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [5]:
df['word_count'] = df['tweet'].apply(lambda x: len(str(x).split(" ")))
df[['tweet', 'word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


In [6]:
df.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [7]:
df[['tweet', 'word_count']].head(10)

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8
5,[2/2] huge fan fare and big talking before the...,21
6,@user camping tomorrow @user @user @user @use...,12
7,the next school year is the year for exams.ð...,23
8,we won!!! love the land!!! #allin #cavs #champ...,13
9,@user @user welcome here ! i'm it's so #gr...,15


In [8]:
df['char_count'] = df['tweet'].str.len()
df[['tweet', 'char_count']].head(10)

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39
5,[2/2] huge fan fare and big talking before the...,116
6,@user camping tomorrow @user @user @user @use...,74
7,the next school year is the year for exams.ð...,143
8,we won!!! love the land!!! #allin #cavs #champ...,87
9,@user @user welcome here ! i'm it's so #gr...,50


In [9]:
def avg_word_count(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

df["avg_word"] = df['tweet'].apply(lambda x: avg_word_count(x))
df[['tweet', 'avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


In [10]:
from nltk.corpus import stopwords
stop=stopwords.words("english")
df['stopwords'] = df['tweet'].apply(lambda x: len(
    [x for x in x.split() if x in stop]
))
df[['tweet', 'stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [11]:
df['special_chars'] = df['tweet'].apply(lambda x: len(
    [x for x in x.split() if x.startswith('#') or x.isdigit() or x.startswith('@')]
))
df[['tweet', 'special_chars']].head()

Unnamed: 0,tweet,special_chars
0,@user when a father is dysfunctional and is s...,2
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [12]:
df['numerics'] = df['tweet'].apply(lambda x: len(
    [x for x in x.split() if x.isdigit()]
))
df['upper'] = df['tweet'].apply(lambda x: len(
    [x for x in x.split() if x.isupper()]
))

df[['tweet', 'numerics', 'upper']].sample(10)

Unnamed: 0,tweet,numerics,upper
19193,dang i just barely missed out on the @user qna...,0,0
10909,"10007 #love, #happy, , #fail ""i must say my...",1,0
28493,when someone with 9.3 followers steals your id...,1,0
7270,@user @user truly truly pathetic in so many wa...,0,0
221,i want to teach you love like you've never fel...,0,0
14684,woken up in a foul mood #moody #thingsgowrong,0,0
11877,"#thursday everyone, #workflow #selfie cause ...",0,0
20853,there is no better feeling than putting out go...,0,0
27531,best friend bihday brunch! ðððð #...,0,0
6570,blending with nature&gt;&gt;link in bio#shoot2...,0,0


### Basic Pre-Processing

First preprocessing is to convert all our tweets into lowercase:

In [13]:
df['tweet'] = df['tweet'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
df["tweet"].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

Next, we have to __remove punctuations:__

In [14]:
df['tweet'] = df['tweet'].str.replace('[^\w\s]', '')
df['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

__removing stop words__

In [15]:
from nltk.corpus import stopwords
stop = stopwords.words("english")
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

#### common words removal

In [16]:
freq = pd.Series(' '.join(df['tweet']).split()).value_counts()[:10]
freq

user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64

In [17]:
freq = list(freq.index)
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

__rare words Removal:__

In [18]:
freq = pd.Series(' '.join(df['tweet']).split()).value_counts()[-10:]
freq

kimi7         1
hedni         1
worldðð       1
q2            1
louaners      1
wpap          1
bcci          1
8056108633    1
ððððððððâº    1
guidean       1
dtype: int64

In [19]:
freq = list(freq.index)
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

#### Speeling Correction

In [20]:
from textblob import TextBlob
df['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Tokenization

In [21]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
df['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [22]:
from textblob import Word
df['tweet'][:5].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

## Advance Text Preprocessing

In [23]:
df['tweet'][0]

'father dysfunctional selfish drags kids dysfunction run'

In [24]:
TextBlob(df['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drags']),
 WordList(['drags', 'kids']),
 WordList(['kids', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [25]:
TextBlob(df['tweet'][0]).ngrams(3)

[WordList(['father', 'dysfunctional', 'selfish']),
 WordList(['dysfunctional', 'selfish', 'drags']),
 WordList(['selfish', 'drags', 'kids']),
 WordList(['drags', 'kids', 'dysfunction']),
 WordList(['kids', 'dysfunction', 'run'])]

## Understanding TF-IDF

### 1. Term Frequency (TF)

In [26]:
tf1 = (df['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis=0).reset_index()
tf1.columns = ['words', 'term frequency']
tf1

Unnamed: 0,words,term frequency
0,thanks,1
1,vans,1
2,credit,1
3,cause,1
4,disapointed,1
5,pdx,1
6,lyft,1
7,cant,1
8,wheelchair,1
9,dont,1


### 2. Inverse-document frequency (IDF)

In [27]:
for i, word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(df.shape[0]/(len(df[df['tweet'].str.contains(word)])))
tf1

Unnamed: 0,words,term frequency,idf
0,thanks,1,4.597751
1,vans,1,8.426393
2,credit,1,7.327781
3,cause,1,5.690172
4,disapointed,1,10.372303
5,pdx,1,8.762865
6,lyft,1,8.762865
7,cant,1,3.538194
8,wheelchair,1,9.273691
9,dont,1,3.745585


### TF-IDF

TF-IDF is the multiplication of TF and IDF.

In [28]:
tf1['tf-idf'] = tf1['term frequency'] * tf1['idf']
tf1

Unnamed: 0,words,term frequency,idf,tf-idf
0,thanks,1,4.597751,4.597751
1,vans,1,8.426393,8.426393
2,credit,1,7.327781,7.327781
3,cause,1,5.690172,5.690172
4,disapointed,1,10.372303,10.372303
5,pdx,1,8.762865,8.762865
6,lyft,1,8.762865,8.762865
7,cant,1,3.538194,3.538194
8,wheelchair,1,9.273691,9.273691
9,dont,1,3.745585,3.745585


## Bag Of Words

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1, 1),analyzer='word')
df_bow = bow.fit_transform(df['tweet'])
df_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 123023 stored elements in Compressed Sparse Row format>

## Sentiment Analysis

In [30]:
df['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (-0.5, 1.0)
1     (0.2, 0.2)
2     (0.0, 0.0)
3     (0.0, 0.0)
4     (0.0, 0.0)
Name: tweet, dtype: object

In [31]:
df['sentiment'] = df['tweet'].apply(lambda x: TextBlob(x).sentiment[0])
df[['tweet', 'sentiment']].head()

Unnamed: 0,tweet,sentiment
0,father dysfunctional selfish drags kids dysfun...,-0.5
1,thanks lyft credit cant use cause dont offer w...,0.2
2,bihday majesty,0.0
3,model take urð ðððð ððð,0.0
4,factsguide society motivation,0.0


## Word Embeddings

In [32]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
from gensim.models import KeyedVectors ## Loads standard Glove Model 
filename = "glove.6B.100d.word2vec"
model = KeyedVectors.load_word2vec_format(filename, binary=False)
model['go']

In [None]:
(model["go"] + model['away'])/2