In [1]:
import pyprind
import pandas as pd
import numpy as np
import os
import nltk

In [2]:
# The Twitter message data is stored in the "TWT Data1" CSV file (saved as UTF)
# and loaded into the dataframe df.
# The file location can be overridden with the TWT_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
DATA_PATH = os.environ.get(
    'TWT_DATA_PATH',
    '/Users/liliyang/Desktop/Columbia Classes/Big Data/Datasets/TWT Data1.csv')
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first ten rows of the loaded data
df.head(10)

Unnamed: 0,Person ID,Message,Sentiment
0,1,I followed those and was not offered to speak ...,Neutral
1,2,"Hi, I paid my payment 2 days ago and would be ...",Negative
2,3,I just gotta let the world know. @CreditOneBan...,Negative
3,4,@NASCARSW42 @KyleLarsonRacin @CreditOneBank @c...,Positive
4,5,@NASCARSW42 @kyleLarsonRacin @CreditOneBank @c...,Negative
5,6,Checkered Flag! @JoshParker81 brings the No.42...,Positive
6,7,@NASCARSW42 @KyleLarsonRacin @CreditoneBgnk @c...,Neutral
7,8,"I called to complain about my card, i tried to...",Negative
8,9,ch@CocaColaRacing @CreditOneBank @NASCAR,Neutral
9,10,https: //twitter.com/JohnPaysor/status/1159533...,Neutral


In [4]:
# Show the data type of each column
df.dtypes

Person ID     int64
Message      object
Sentiment    object
dtype: object

In [5]:
# Number of entries in the Message column
len(df['Message'])

311

In [6]:
# Count missing values per column
df.isnull().sum()

Person ID    0
Message      0
Sentiment    0
dtype: int64

In [7]:
# Look at one raw message (row label 10) before any cleaning
df.loc[10, 'Message']

"@CreditOneBank My card still hasn't shown up in the mail, it's been 9 days or so since being approved"

In [8]:
#remove unknown characters and convert capital letters into lowercase
import re
def preprocessor(text):
    """Normalise one raw message into lowercase, space-separated words.

    Steps, in order:
      1. Remove anything shaped like an HTML/XML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` so they
         survive the punctuation stripping in the next step.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, joined by single spaces and with
         the ``-`` "nose" removed.
      5. Drop every 'â' character from the result.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message. The emoticons are concatenated directly onto
        the cleaned text, so they are only separated from the last word when
        the cleaned text already ends in a space.
    """
    # Raw string literals keep regex escapes (\W, \(, \)) from being read as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    # NOTE(review): 'â' is presumably residue of mis-decoded characters in
    # the CSV -- confirm against the source file's encoding.
    text = text.replace('â', '')

    return text

In [9]:
# Try the cleaner on the sample message inspected earlier
preprocessor(df.loc[10, 'Message'])

' creditonebank my card still hasn t shown up in the mail it s been 9 days or so since being approved'

In [10]:
# Clean every message; the Message column now holds the normalised text
df['Message'] = df['Message'].map(preprocessor)

In [11]:
# Spot-check a slice of the cleaned messages (positions 10 up to, not including, 30)
print(df['Message'].iloc[10:30])

10     creditonebank my card still hasn t shown up i...
11       try calling me again didn t not recognize the 
12    hi my card still hasn t shown up in the mail i...
13                                   creditonebank sent
14    thank you for your help the explanation made t...
15     creditonebank your website needs to get it to...
16     cards2p kylelarsonracin creditonebank cgrteam...
17         how do i speak directly to a representative 
18                 i want to make a out of u s purchase
19    i called and the automated system doesn t give...
20    id like to speak directly to a representative ...
21    i just received a mail today saying i was pre ...
22    i tried calling and there s no option to talk ...
23                              everything is automated
24    hi i received a letter requesting additional i...
25    hi i m wondering why the available amount on m...
26              creditonebank https twitter com cp24 st
27    hi i want to deactivate my credit one card

In [12]:
# Tokenize the messages by splitting on whitespace; NLTK's Porter stemmer provides a stemmed variant

In [13]:
from nltk.stem.porter import PorterStemmer

# Single stemmer instance shared by tokenizer_porter below
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` into a list of tokens on runs of whitespace.

    Leading and trailing whitespace produce no empty tokens.
    """
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token.

    Uses the module-level ``porter`` stemmer and returns one stemmed token
    per whitespace-separated word, in the original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [14]:
# Show the whitespace tokenizer on the cleaned sample message
tokenizer('creditonebank my card still hasn t shown up in the mail it s been 9 days or so since being approved')

['creditonebank',
 'my',
 'card',
 'still',
 'hasn',
 't',
 'shown',
 'up',
 'in',
 'the',
 'mail',
 'it',
 's',
 'been',
 '9',
 'days',
 'or',
 'so',
 'since',
 'being',
 'approved']

In [15]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, then the sample message with those words filtered out
stop = stopwords.words('english')
list(filter(lambda w: w not in stop,
            tokenizer('creditonebank my card still hasn t shown up in the mail it s been 9 days or so since being approved')))

['creditonebank',
 'card',
 'still',
 'shown',
 'mail',
 '9',
 'days',
 'since',
 'approved']

In [16]:
#count the tokens in every sentence
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from all messages and count each term per message
docs = np.array(df['Message'])
count = CountVectorizer()
bag = count.fit_transform(docs)

In [17]:
# Dimensions of the count matrix returned by fit_transform
bag.shape

(311, 1386)

In [18]:
# Convert the token counts into a TF-IDF matrix.
# Goal: weight each token by how often it occurs in a message, scaled down for tokens that occur in many messages.
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF weighting with smoothed idf and L2-normalised rows,
# printed as a dense array for a quick visual check
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
# Train on the first N_TRAIN rows and hold out the rest for testing.
# Positional .iloc slices are used because, unlike label-based .loc slices,
# they exclude the end point: row N_TRAIN lands only in the test set, so no
# sample is shared between the two sets.
N_TRAIN = 200
X_train = df['Message'].iloc[:N_TRAIN].values
y_train = df['Sentiment'].iloc[:N_TRAIN].values
X_test = df['Message'].iloc[N_TRAIN:].values
y_test = df['Sentiment'].iloc[N_TRAIN:].values

# Every row must be used exactly once across the two sets
assert len(X_train) + len(X_test) == len(df)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Vectorizer whose tokenizer / stop-word / weighting options are tuned by the
# grid search below. No lowercasing or extra preprocessing is done here.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two sub-grids: the first keeps the vectorizer's default idf / norm settings,
# the second switches idf weighting and normalisation off.
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# The solver is set explicitly because the grid contains penalty='l1':
# in current scikit-learn the default solver ('lbfgs') only supports L2, so
# every 'l1' candidate would fail to fit. 'saga' supports both penalties.
# max_iter is raised above the default because saga can need more iterations
# to converge.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='saga',
                                                max_iter=5000))])

# 5-fold cross-validated search over the grid, scored by accuracy,
# using all available CPU cores
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
# Run the grid search on the training messages and their sentiment labels
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Report the best parameter combination found and its cross-validation score
print('Best parameter set: %s ' % (gs_lr_tfidf.best_params_,))
print('CV Accuracy: %.3f' % (gs_lr_tfidf.best_score_,))

In [None]:
# The word cloud below needs the third-party `wordcloud` package; uncomment to install it:
#!pip install wordcloud

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import WordCloud, STOPWORDS

In [None]:
# Concatenate all messages into one space-separated string for the word cloud
combined_text = " ".join(df['Message'])

In [None]:
# Stop words for the cloud: wordcloud's built-in list plus common words
# like the, in, was.....
# set.union returns a new set; set.update returns None (so nothing would be
# passed as `stopwords`) and modifies the shared STOPWORDS set in place.
cloud_stopwords = STOPWORDS.union(
    ['the', 'in', 'was', 'my', 'creditonebank', 'account', 'hi'])

# Initialize wordcloud object
wc = WordCloud(background_color='white', max_words=50,
               stopwords=cloud_stopwords)

# Generate and plot wordcloud
plt.imshow(wc.generate(combined_text))
plt.axis('off')
plt.show()