# Twitter Sentiment Analysis

This document contains basic techniques in text analysis. Feature extractions and finding weather sentense contains positive or nagative reviews. This is my first learning example of words analysis.

In [1]:
# importing libraries
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob, Word

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors # load Stanford Glove Model

In [2]:
# reading training data
train = pd.read_csv('../input/train_E6oV3lV.csv')
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


### Number of words

In [3]:
# counting total words in sentense
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(' ')))
train[['tweet', 'word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


#### Number of characters

In [4]:
# counting total characters in each sentense with blank space
train['char_count'] = train['tweet'].str.len()
train[['tweet', 'char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


In [5]:
# counting average words's length
def avg_word(sentense):
    words = sentense.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet', 'avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


In [6]:
# counting stopwords that are in dictionary
stop = stopwords.words('english')
train['stopwords'] = train['tweet'].apply(lambda x:len([x for x in x.split() if x in stop]))
train[['tweet', 'stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [7]:
# counting total hashtags
train['hashtag'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet', 'hashtag']].head()

Unnamed: 0,tweet,hashtag
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [8]:
# counting total numbers in word
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet', 'numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [9]:
# counting upper case letters
train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet', 'upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


#### Basic preprocessing

In [10]:
# caps small in all texts
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

In [11]:
# removes comma
train['tweet'] = train['tweet'].str.replace('[^\w\s]', '')
train['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

In [12]:
# removing stop words like, a, is, your, i 
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [13]:
# finding most used words
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
list(freq.index)

['user', 'love', 'ð', 'day', 'â', 'happy', 'amp', 'im', 'u', 'time']

In [14]:
# removing most used words
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [15]:
# removing less used words
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq = list(freq)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Correcting spelling using TextBlob

In [16]:
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [17]:
TextBlob(train['tweet'][1]).words

WordList(['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

In [18]:
# stemming used to remove ing, es,
st = PorterStemmer()
train['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [19]:
# lemmatize get the base form of word. It is better then stemming
train['tweet'] = train['tweet'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
train['tweet'].head()

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [20]:
# getting common together used words
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [21]:
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis=0).reset_index()
tf1.columns = ['words', 'tf']
tf1

Unnamed: 0,words,tf
0,van,1
1,wheelchair,1
2,dont,1
3,lyft,1
4,disapointed,1
5,cant,1
6,thanks,1
7,credit,1
8,pdx,1
9,cause,1


In [22]:
# more value of idf, more unique word
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/len(train[train['tweet'].str.contains(word)]))

tf1

Unnamed: 0,words,tf,idf
0,van,1,5.236505
1,wheelchair,1,9.273691
2,dont,1,3.745585
3,lyft,1,8.762865
4,disapointed,1,10.372303
5,cant,1,3.538194
6,thanks,1,4.597751
7,credit,1,7.327781
8,pdx,1,8.762865
9,cause,1,5.690172


In [23]:
# we are multiplies the tf and idf. to remove most used unused words
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,van,1,5.236505,5.236505
1,wheelchair,1,9.273691,9.273691
2,dont,1,3.745585,3.745585
3,lyft,1,8.762865,8.762865
4,disapointed,1,10.372303,10.372303
5,cant,1,3.538194,3.538194
6,thanks,1,4.597751,4.597751
7,credit,1,7.327781,7.327781
8,pdx,1,8.762865,8.762865
9,cause,1,5.690172,5.690172


In [24]:
# we can compute tfidf with feature_selection.text's TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 1000, lowercase = True, analyzer = 'word', stop_words = 'english', ngram_range=(1, 1))
train_vect = tfidf.fit_transform(train['tweet'])
train_vect

<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 114042 stored elements in Compressed Sparse Row format>

In [25]:
# bag of words, finding meaning of words
bow = CountVectorizer(max_features = 1000, lowercase = True, ngram_range = (1,1), analyzer = 'word')
train_bow = bow.fit_transform(train['tweet'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 128387 stored elements in Compressed Sparse Row format>

In [26]:
train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (-0.3, 0.5354166666666667)
1                    (0.2, 0.2)
2                    (0.0, 0.0)
3                    (0.0, 0.0)
4                    (0.0, 0.0)
Name: tweet, dtype: object

In [27]:
# positive value means positive reviews, negative means nagative reviews
train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0])
train[['tweet', 'sentiment']].head()

Unnamed: 0,tweet,sentiment
0,father dysfunctional selfish drag kid dysfunct...,-0.3
1,thanks lyft credit cant use cause dont offer w...,0.2
2,bihday majesty,0.0
3,model take urð ðððð ððð,0.0
4,factsguide society motivation,0.0


In [28]:
# word embedding, I downloaded txt from
# https://www.kaggle.com/terenceliu4444/glove6b100dtxt#glove.6B.100d.txt link
glove_input_file = '../input/glove.6B.100d.txt'
word2vec_output_file = '../input/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
# now we can use this word2vec file as model

(400000, 100)

In [29]:
filename = '../input/glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename,binary = False)

In [30]:
# testing model
model['go'] # use other words here

array([-0.078894,  0.4616  ,  0.57779 , -0.71637 , -0.13121 ,  0.4186  ,
       -0.29156 ,  0.52006 ,  0.089986, -0.35062 ,  0.51755 ,  0.51998 ,
        0.15218 ,  0.41485 , -0.12377 , -0.37222 ,  0.0273  ,  0.75673 ,
       -0.8739  ,  0.58935 ,  0.46662 ,  0.62918 ,  0.092603, -0.012868,
       -0.015169,  0.25567 , -0.43025 , -0.77668 ,  0.71449 , -0.3834  ,
       -0.69638 ,  0.23522 ,  0.11396 ,  0.02778 ,  0.071357,  0.87409 ,
       -0.1281  ,  0.063576,  0.067867, -0.50181 , -0.28523 , -0.072536,
       -0.50738 , -0.6914  , -0.53579 , -0.11361 , -0.38234 , -0.12414 ,
        0.011214, -1.1622  ,  0.037057, -0.18495 ,  0.01416 ,  0.87193 ,
       -0.097309, -2.3565  , -0.14554 ,  0.28275 ,  2.0053  ,  0.23439 ,
       -0.38298 ,  0.69539 , -0.44916 , -0.094157,  0.90527 ,  0.65764 ,
        0.27628 ,  0.30688 , -0.57781 , -0.22987 , -0.083043, -0.57236 ,
       -0.299   , -0.81112 ,  0.039752, -0.05681 , -0.48879 , -0.18091 ,
       -0.28152 , -0.20559 ,  0.4932  , -0.033999, 

In [31]:
model['away']

array([-0.10379 , -0.014792,  0.59933 , -0.51316 , -0.036463,  0.6588  ,
       -0.57906 ,  0.17819 ,  0.23663 , -0.21384 ,  0.55339 ,  0.53597 ,
        0.041444,  0.16095 ,  0.017093, -0.37242 ,  0.017974,  0.39268 ,
       -0.23265 ,  0.1818  ,  0.66405 ,  0.98163 ,  0.42339 ,  0.030581,
        0.35015 ,  0.25519 , -0.71182 , -0.42184 ,  0.13068 , -0.47452 ,
       -0.08175 ,  0.1574  , -0.13262 ,  0.22679 , -0.16885 , -0.11122 ,
       -0.32272 , -0.020978, -0.43345 ,  0.172   , -0.67366 , -0.79052 ,
        0.10556 , -0.4219  , -0.12385 , -0.063486, -0.17843 ,  0.56359 ,
        0.16986 , -0.17804 ,  0.13956 , -0.20169 ,  0.078985,  1.4497  ,
        0.23556 , -2.6014  , -0.5286  , -0.11636 ,  1.7184  ,  0.33254 ,
        0.12136 ,  1.1602  , -0.2914  ,  0.47125 ,  0.41869 ,  0.35271 ,
        0.47869 , -0.042281, -0.18294 ,  0.1796  , -0.24431 , -0.34042 ,
        0.20337 , -0.93676 ,  0.013077,  0.080339, -0.36604 , -0.44005 ,
       -0.35393 ,  0.15907 ,  0.55807 ,  0.1492  , 

In [32]:
model['go'] + model['away']/2

array([-1.30788997e-01,  4.54203993e-01,  8.77454996e-01, -9.72949982e-01,
       -1.49441496e-01,  7.48000026e-01, -5.81089973e-01,  6.09154999e-01,
        2.08300993e-01, -4.57540005e-01,  7.94245005e-01,  7.87965000e-01,
        1.72902003e-01,  4.95324999e-01, -1.15223497e-01, -5.58430016e-01,
        3.62870023e-02,  9.53070045e-01, -9.90225017e-01,  6.80249989e-01,
        7.98645020e-01,  1.11999500e+00,  3.04297984e-01,  2.42249947e-03,
        1.59906000e-01,  3.83265018e-01, -7.86159992e-01, -9.87599969e-01,
        7.79829979e-01, -6.20660007e-01, -7.37255037e-01,  3.13919991e-01,
        4.76499945e-02,  1.41175002e-01, -1.30680054e-02,  8.18480015e-01,
       -2.89460003e-01,  5.30869961e-02, -1.48858011e-01, -4.15810019e-01,
       -6.22060001e-01, -4.67795998e-01, -4.54600006e-01, -9.02350008e-01,
       -5.97715020e-01, -1.45353004e-01, -4.71555024e-01,  1.57655001e-01,
        9.61440057e-02, -1.25121999e+00,  1.06837004e-01, -2.85795003e-01,
        5.36524989e-02,  

#### From:- [Ultimate guide to deal with Text Data (using Python) – for Data Scientists & Engineers](https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/)
