In [65]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [3]:
cust_df = pd.read_pickle('data/custdf_with_sentiment.pkl')

### Data cleaning and Feature Engineering

In [4]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text
cust_df.text = cust_df.text.apply(lambda x: text_to_word_list(x)) 

#### Remove stop word and frequent words

In [5]:
# Removing stop words
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

cust_df.text = cust_df.text.apply(lambda text: remove_stopwords(text))

In [6]:
# Removal of Frequent words
from collections import Counter
cnt = Counter()
for text in cust_df.text.values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(20)

[("'i',", 853975),
 ("'to',", 771631),
 ("'the',", 744907),
 ("'a',", 493053),
 ("'and',", 448686),
 ("'my',", 434684),
 ("'it',", 407698),
 ("'?',", 381233),
 ("'you',", 361107),
 ("'is',", 343349),
 ("'!',", 342802),
 ("'for',", 333595),
 ("'on',", 295011),
 ("'t',", 284058),
 ("'in',", 275225),
 ("'of',", 230983),
 ("'this',", 209919),
 ("'have',", 205099),
 ("'s',", 203414),
 ("'me',", 198363)]

In [7]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(20)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

cust_df.text = cust_df.text.apply(lambda text: remove_freqwords(text))

#### Stemming

In [8]:
# # stemming
# from nltk.stem.porter import PorterStemmer
# stemmer = PorterStemmer()
# def stem_words(text):
#     return " ".join([stemmer.stem(word) for word in text.split()])
# cust_df.text = cust_df.text.apply(lambda text: stem_words(text))

#### Conversion of Emoticon to Words
In case of use cases like sentiment analysis, the emoticons give some valuable information

In [9]:
import pickle
infile = open('data/EMOTICONS.pkl','rb')
UNICODE_EMO = pickle.load(infile)
infile.close()

def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
    return text
cust_df.text = cust_df.text.apply(lambda text: convert_emojis(text))

In [10]:
#checkpoint1
cust_df.to_pickle('data/checkpoint1.pkl')

In [11]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [15]:
sent = [" ".join(row) for row in cust_df.text]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]

# Option 1: unsupervised Sentiment Analysis

The main idea behind this approach is that negative and positive words usually are surrounded by similar words. This means that if we would have movie reviews dataset, word ‘boring’ would be surrounded by the same words as word ‘tedious’, and usually such words would have somewhere close to the words such as ‘didn’t’ (like), which would also make word didn’t be similar to them. On the other hand, it would be unlikely to have happened, that word ‘tedious’ had more similar surrounding to word ‘exciting’, than to word ‘boring’. With such assumption, words could form clusters (based on similarity of their surrounding) of negative words that have similar surroundings, positive words that have similar surroundings, and some neutral words that end up between them (such as ‘movie’). 


### Word2Vec model
If two different words have very similar “contexts” (that is, what words are likely to appear around them), then our model needs to output very similar results for these two words. And one way for the network to output similar context predictions for these two words is if the word vectors are similar. So, if two words have similar contexts, then our network is motivated to learn similar word vectors for these two words

In [20]:
from time import time
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.85 mins


In [21]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

Time to train the model: 30.55 mins


In [22]:
# Saveing model 
w2v_model.save("word2vec.model")

## K-mean Clustering

In [44]:
word_vectors = Word2Vec.load("word2vec.model").wv

In [47]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=np.double(word_vectors.vectors))

In [48]:
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [110]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x,dtype=np.double)]))
words.cluster = words.cluster.apply(lambda x: x[0])

  words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])


In [111]:
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

#### Saving sentiment dictonary

In [58]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

### Applying sentiments

In [125]:
sentiment_map= pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [126]:
file_weighting = cust_df.copy()

In [136]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.text)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.text)



In [137]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.text.split()))

In [142]:
%%time
replaced_tfidf_scores = file_weighting[:250].apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

CPU times: user 56 ms, sys: 8.56 ms, total: 64.6 ms
Wall time: 88.8 ms


In [143]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [151]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores[:4], replaced_tfidf_scores[:4], file_weighting[:4].text]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
# replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]