**Project plan**

*Data preprocessing
1. DONE - Detect the english tweets
https://github.com/rfk/pyenchant

*Sentiment Analysis*
1. Find a sentiment analysis NN to rate last tweets
2. DONE - Extract the emoji's and incorporate the emoji sentiment <br>
https://stackoverflow.com/questions/43852668/using-collections-counter-to-count-emojis-with-different-colors<br>
https://stackoverflow.com/questions/48340622/extract-all-emojis-from-string-and-ignore-fitzpatrick-modifiers-skin-tones-etc

*Build a pipeline for similar tweet recommender*
1. DONE - Implement class for recommend engine
2. Build a pipeline for word embedding, vectorizing, and recommendation
3. Try out Glove, Google embedding.. etc

In [104]:
import string
import emoji
import regex
import pickle
import re
import enchant
import math
import inflect

import numpy as np
import pandas as pd
import vaderSentiment

import matplotlib.pyplot as plt

from itertools import compress
from tqdm import tqdm_notebook
from termcolor import colored


# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF, TruncatedSVD
import nltk.sentiment.vader
from sklearn.neighbors import NearestNeighbors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
    
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize


# logging for gensim (set to INFO)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

%matplotlib inline

In [4]:
twitter = pd.read_csv("./customer-support-on-twitter/twcs.csv")

# Save original message for future recommendation display
ticket_reserve = twitter[twitter['inbound'] == True]
solution_reserve = twitter[twitter['inbound'] == False]

### Exclude tweets with non-English characters

In [119]:
def fixContractions(text):
    with open('./data/helper/contraction_list.pkl', 'rb') as picklefile:
        cList = pickle.load(picklefile)
    for word in text.split():
        if word.lower() in cList:
            text = text.replace(word, cList[word.lower()])
    return text

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https:\/\/t.co\/\w{10}', '', text) # Remove URL link
    text = re.sub(r'@\w+', '', text) # Remove @account
    text = fixContractions(text) # Expand contractions
    return text

In [121]:
def get_emoji_free_text(text):
    text = text.replace(u'\u200d', '')
    allchars = [str for str in text]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    if (len(emoji_list) == 0):
        return np.NaN, clean_text
    else:
        return emoji_list, clean_text

In [122]:
twitter['text'] = twitter['text'].apply(lambda x:clean_text(x))
twitter[['emoji','text']] = twitter['text'].apply(lambda x:pd.Series(get_emoji_free_text(x)))

In [123]:
with open('./data/processed/tweets_step1_emoji_extracted.pkl', 'wb')as picklefile:
    pickle.dump(twitter, picklefile)

In [None]:
with open('./data/processed/tweets_step1_emoji_extracted.pkl', 'rb')as picklefile:
    twitter = pickle.load(picklefile)

### Retrieve sentiment score for emoji's
ref:http://kt.ijs.si/data/Emoji_sentiment_ranking/

In [125]:
def build_dict(emoji_table, emoji_sentiment_dict):
    type_set = set()
    for i in range(emoji_table.shape[0]):
        code = emoji_table.loc[i, 'Python Code']
        try:
            key = chr(int(code[1:], 16))
            emoji_sentiment_dict[key] = emoji_table.loc[i, 'Sentiment score']
        except:
            type_set.add(emoji_table.loc[i, 'Unicode block'])
    return emoji_sentiment_dict

In [127]:
def get_emoji_sentiment(emoji_list):
    score = 0
    if (type(emoji_list) != float):
        for emoji_item in emoji_list:
            if (emoji_item in emoji_sentiment_dict.keys()):
                score += emoji_sentiment_dict[emoji_item]
        if (score == 0):
            return np.NaN
        else:
            return score/len(emoji_list)
    else:
        return np.NaN

In [169]:
emoji_sentiment = pd.read_csv('./data/raw/emoji_nodingbat.csv')
emoji_other = pd.read_csv('./data/raw/emoji_other.csv')
emoji_sentiment_dict = {}

emoji_sentiment_dict = build_dict(emoji_sentiment, emoji_sentiment_dict)
emoji_sentiment_dict = build_dict(emoji_other, emoji_sentiment_dict)

twitter['emoji_score'] = twitter['emoji'].apply(lambda x:get_emoji_sentiment(x))
print ('Number of tweets with emoji score: ', twitter[~twitter['emoji_score'].isnull()].shape[0])
print ('Number of tweets with emoji but not score: ', twitter[twitter['emoji_score'].isnull() & (~twitter['emoji'].isnull())].shape[0])

Number of tweets with emoji score:  161402
Number of tweets with emoji but not score:  33223


In [129]:
with open('./data/processed/tweets_step2_emoji_sentiment.pkl', 'wb')as picklefile:
    pickle.dump(twitter, picklefile)

In [167]:
with open('./data/processed/tweets_step2_emoji_sentiment.pkl', 'rb')as picklefile:
    twitter = pickle.load(picklefile)

### Remove non-English tweets

In [170]:
# Retrieve elements that are printable in English context - This is a 
# combination of digits, letters, punctuation, and whitespace.
valid_content = set(string.printable)

# Include some common non-English punctuations
non_English_punctuation = ['‘', '’', 'é', '–','—','“','”','、','。','`','️', '£', '…', '$']

# Keep tweets with common non-English punctuations
[valid_content.add(x) for x in (non_English_punctuation)]
    
def isEnglish(tweet):
    invalid_index = [x not in valid_content for x in tweet]
    if (np.sum(invalid_index)==0):
        return True
    else:
        return False

## Removed 78k tweets with non-English characters
english_tweets = twitter.text.apply(lambda x:isEnglish(x))
tweets = twitter.loc[english_tweets, :]

In [171]:
# A quick tool to return non-English words
def non_English(words):
    print (list(compress(list(words), [x not in valid_content for x in words])))

### Format the dates

In [173]:
# Format the dates
tweets['created_at'] = pd.to_datetime(tweets['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [174]:
tweets['in_response_to_tweet_id'] = tweets['in_response_to_tweet_id'].fillna(-1)
tweets['in_response_to_tweet_id'] = tweets['in_response_to_tweet_id'].astype(int)
tweets['response_tweet_id'] = tweets['response_tweet_id'].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [175]:
with open('./data/processed/tweets_step3_english.pkl', 'wb')as picklefile:
    pickle.dump(tweets, picklefile)

In [176]:
with open('./data/processed/tweets_step3_english.pkl', 'rb')as picklefile:
    tweets = pickle.load(picklefile)

### Add custom_id and brand_id columns

In [198]:
# Add custom_id column
tweets = pd.merge(tweets,pd.DataFrame(tweets[['tweet_id', 'author_id']]),left_on='in_response_to_tweet_id', right_on='tweet_id', how='left')
tweets.loc[tweets['inbound'] == True, 'author_id_y'] = tweets.loc[tweets['inbound'] == True, 'author_id_x']
tweets['author_id_y'] = tweets['author_id_y'].fillna(-1)
tweets.drop('tweet_id_y', axis=1, inplace=True)
tweets.rename(columns={'author_id_y':'cust_id', 
                       'author_id_x':'author_id',
                      'tweet_id_x':'tweet_id'}, inplace=True)

# Drop the tweets without customer id, those are usually promotional tweets initiated by
# brands agent or customers not in the datase: Number of such tweets: 71255
tweets = tweets[~tweets['cust_id'].isnull()]

In [227]:
# Add brand_id column

# Carve out the first tweet id when there are multiples in response_tweet_id
tweets['processed_response_id'] = tweets['response_tweet_id'].apply(
    lambda x: x if (isinstance(x,int)) else x.split(',')[0])
tweets['processed_response_id'] = tweets['processed_response_id'].astype(int)

# Retrieve the author id of the responde tweet
tweets = pd.merge(tweets,pd.DataFrame(tweets[['tweet_id', 'author_id']]),
                  left_on='processed_response_id', 
                  right_on='tweet_id', 
                  how='left')

tweets.drop('tweet_id_y', axis=1, inplace=True)
tweets.rename(columns={'author_id_y':'brand_id', 
                       'author_id_x':'author_id',
                      'tweet_id_x':'tweet_id'}, inplace=True)

tweets = pd.merge(tweets,pd.DataFrame(tweets[['tweet_id', 'author_id']]),left_on='in_response_to_tweet_id', right_on='tweet_id', how='left')
tweets.loc[(tweets['inbound'] & (tweets['processed_response_id'] == -1)), 'brand_id'] = (
    tweets.loc[(tweets['inbound'] & (tweets['processed_response_id'] == -1)), 'author_id_y'])
tweets.rename(columns={'tweet_id_x':'tweet_id', 
                       'author_id_x':'author_id'}, inplace=True)

tweets.loc[tweets['inbound'] == False, 'brand_id'] = tweets.loc[tweets['inbound'] == False, 'author_id']
tweets.drop(['processed_response_id', 'tweet_id_y', 'author_id_y'], axis=1, inplace=True)

# could use some more logic to extract the account name in tweet text

In [233]:
# Tweets with no response is 3880
print (tweets[tweets['brand_id'].isnull() & (tweets['response_tweet_id'] == -1)].shape)

# Tweets with response outside the dataset is 42764
print (tweets[tweets['brand_id'].isnull() & (tweets['response_tweet_id'] != -1)].shape)

# Drop the tweets without brand_id
tweets = tweets[~tweets['brand_id'].isnull()]

In [6]:
# Identify which tweets are in the same conversation (customer case)
tweets.sort_values(['cust_id', 'created_at'], inplace=True)
tweets.reset_index(inplace=True)

# Calculate the time difference (in seconds) bwetween a tweet and its previous tweet
tweets['time_diff'] = tweets['created_at'].diff()
tweets['time_diff'] = tweets['time_diff'].apply(lambda x:x.total_seconds())
tweets['time_diff'] = tweets['time_diff'].apply(lambda x: 0 if x <0 else x)
tweets['time_diff'] = tweets['time_diff'].fillna(0)

tweets['case_id'] = 0
tweets['time_diff'] = tweets['time_diff'].fillna(0) 

In [None]:
with open('./data/processed/tweets_step4_brand_cust.pkl', 'wb')as picklefile:
    pickle.dump(tweets, picklefile)

In [None]:
with open('./data/processed/tweets_step4_brand_cust.pkl', 'rb')as picklefile:
    tweets = pickle.load(picklefile)

### Carve out the support data for a particular brand

In [7]:
def analyze_brand(tweets, brand_cc_name):
    ## Subset tweets according to brand
    brand = tweets[tweets['brand_id'] == brand_cc_name]
    brand.reset_index(inplace=True)
    
    ## Assign unique caseid for each conversation (interchange of tweets)
    ## To avoid grouping different cases raised by the same customer at different time
    ## We set the max threshold between subsequent tweets in a conversation as 2 days
    caseid = 1
    brand.loc[0, 'case_id'] = caseid
    for row in range(1, brand.shape[0]):
        if (row%1000 == 0):
            print (row)
        if ((brand.loc[row, 'cust_id'] != brand.loc[row-1, 'cust_id']) |
            (brand.loc[row, 'time_diff'] > 3600*24*2)):
            caseid = caseid + 1
            brand.loc[row, 'time_diff'] = 0 # Set time between conversations as 0
            brand.loc[row, 'case_id'] = caseid
        else:
            brand.loc[row, 'case_id'] = caseid
    return brand

In [8]:
# CAUTION! This function takes SUPER LONG TIME to run, 80k row takes 3 hours!
spotify = analyze_brand(tweets, 'SpotifyCares')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000


In [9]:
with open('./data/processed/spotify.pkl', 'wb')as picklefile:
    pickle.dump(spotify, picklefile)

In [40]:
with open('./data/processed/spotify.pkl', 'rb')as picklefile:
    spotify = pickle.load(picklefile)

### Remove ticket without solution and solution without ticket

In [41]:
conv_count = spotify.groupby('case_id')['case_id'].count()
single_conv_index = conv_count[conv_count==1].index.tolist()
spotify['single_conv'] = spotify['case_id'].apply(
    lambda x: True if x in single_conv_index else False)

# Remove the conversations with only one tweet (424 ) 
spotify = spotify[~spotify['single_conv']]
spotify.drop('single_conv', axis=1, inplace=True)

spotify.drop(['level_0', 'index'], axis=1, inplace=True)

In [42]:
ticket = spotify[spotify['inbound'] == True]
solution = spotify[spotify['inbound'] == False]

In [43]:
# Remove ticket without solution or solution without ticket 
# conversations with tweets from only customer or customer agents

ticket = ticket.apply(
    lambda x: x if (x[12] in solution['case_id'].unique()) else None, axis = 1)
solution = solution.apply(
    lambda x: x if (x[12] in ticket['case_id'].unique()) else None, axis = 1)

In [44]:
#ticket.text.fillna('', inplace=True)
#solution.text.fillna('', inplace=True)
ticket_agg = ticket.groupby('case_id')['text'].agg(lambda x:' '.join(x))
solution_agg = solution.groupby('case_id')['text'].agg(lambda x:' '.join(x))

In [None]:
spotify = spotify.apply(
    lambda x: x if (x[10] in ticket['case_id'].unique()) else None, axis=1)

In [None]:
with open('./data/processed/ticket.pkl', 'wb')as picklefile:
    pickle.dump(ticket, picklefile)
with open('./data/processed/solution.pkl', 'wb')as picklefile:
    pickle.dump(solution, picklefile)
with open('./data/processed/ticket_agg.pkl', 'wb')as picklefile:
    pickle.dump(ticket_agg, picklefile)
with open('./data/processed/solution_agg.pkl', 'wb')as picklefile:
    pickle.dump(solution_agg, picklefile)

### Perform very basic sentiment analysis on last tweet from customer

In [54]:
cust_last_tweet = ticket.groupby('case_id')[['text','emoji', 'emoji_score']].last().reset_index()

In [55]:
vdanalyzer = vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer()
cust_last_tweet['vader_sentiment'] = cust_last_tweet['text'].apply(
        lambda x:vdanalyzer.polarity_scores(x)['compound'])

In [56]:
sentiment_analyzer = nltk.sentiment.vader.SentimentIntensityAnalyzer()
cust_last_tweet['ntlk_sentiment'] = cust_last_tweet['text'].apply(
        lambda x:sentiment_analyzer.polarity_scores(x)['compound'])

In [58]:
cust_last_tweet.to_csv('./data/processed/processed_last_tweet_sentiment.csv', index=False)

In [60]:
cust_last_tweet = pd.read_csv('./data/processed/processed_last_tweet_sentiment.csv', index_col=None)

In [62]:
pd.set_option('display.max_colwidth', -1)
print (ticket[ticket['tweet_id'] == 2606351.0].text.astype(str))

6    taking features away will only make it easier for users to go. i’ve been a premium user for 4+ years... don’t let me leave.
Name: text, dtype: object


In [63]:
def get_overall_sentiment_score(row):
    sentiment = ''
    if (not math.isnan(row[3])):
        if (row[3] >= 0.2):
            sentiment = 'POS'
        elif ((row[3] < 0.2) & (row[3] > -0.05)):
            sentiment = 'NEU'
        elif (row[3] <= -0.05):
            sentiment = 'NEG'
    elif (not math.isnan(row[5])):
        if (row[5] >= 0.2):
            sentiment = 'POS'
        elif ((row[5] < 0.2) & (row[5] > -0.05)):
            sentiment = 'NEU'
        elif (row[5] <= -0.05):
            sentiment = 'NEG'
    return sentiment

In [64]:
cust_last_tweet[['emoji_score', 'ntlk_sentiment']] = cust_last_tweet[['emoji_score', 'ntlk_sentiment']].astype(float)
cust_last_tweet['sentiment'] = cust_last_tweet.apply(lambda
                        x:get_overall_sentiment_score(x), axis=1)

In [67]:
cust_last_tweet.to_csv('./data/processed/processed_last_tweet_sentiment.csv', index=False)

In [68]:
cust_last_tweet = pd.read_csv('./data/processed/processed_last_tweet_sentiment.csv', index_col=None)

In [70]:
spotify = pd.merge(spotify, cust_last_tweet[['case_id', 'sentiment']],
                   left_on='case_id', right_on='case_id',
                   how='left')

In [73]:
with open('./data/processed/tweets_step5_spotify_sentiment.pkl', 'wb')as picklefile:
    pickle.dump(spotify, picklefile)

### Tweets preprocessing

In [75]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re

In [82]:
custom_stopwords = ['http', 'https', 'spotify', 'help', 'hi', 'spotifycares']
my_stop_words = (text.ENGLISH_STOP_WORDS.union(custom_stopwords)
                 .union(string.punctuation))

In [76]:
df_ticket = pd.DataFrame(ticket_agg)
df_ticket.reset_index(inplace=True)

In [77]:
df_solution = pd.DataFrame(solution_agg)
df_solution.reset_index(inplace=True)

In [78]:
df_ticket = pd.merge(df_ticket, cust_last_tweet[['case_id', 'sentiment']], left_on='case_id', right_on='case_id',
                   how='left')

In [79]:
def text_preprocess(text):
    tokens = word_tokenize(text)
    lemmatize = WordNetLemmatizer()
    clean_text = [lemmatize.lemmatize(token.lower().strip(), pos='a') for token in tokens]
    clean_text = [x for x in clean_text if x not in my_stop_words]
    return ' '.join(clean_text)

In [83]:
df_ticket['text'] = df_ticket.text.apply(lambda x:text_preprocess(x))

In [None]:
with open('./data/processed/tweets_step6_ticket_preprocessed.pkl', 'wb')as picklefile:
    pickle.dump(df_ticket, picklefile)

### Build recommendation engines

In [None]:
## Read in files from previous steps
with open('./data/processed/tweets_step6_ticket_preprocessed.pkl', 'rb')as picklefile:
    df_ticket = pickle.load(picklefile)

cust_last_tweet = pd.read_csv('./data/processed/processed_last_tweet_sentiment.csv', index_col=None)

with open('./data/processed/ticket.pkl', 'rb')as picklefile:
    ticket = pickle.dump(picklefile)

with open('./data/processed/solution.pkl', 'rb')as picklefile:
    solution = pickle.load(picklefile)
    
with open('./data/processed/ticket_agg.pkl', 'rb')as picklefile:
    ticket_agg = pickle.dump(picklefile)

with open('./data/processed/solution_agg.pkl', 'rb')as picklefile:
    solution_agg = pickle.load(picklefile)


    

In [161]:
class RecommendationEngine:
    def __init__(self, vectorizer, n_components, reducer, ticket, solution):
        self.vectorizer = vectorizer
        self.n_dim = n_components
        self.reducer = reducer(n_components)
        self.ticket = ticket
        self.solution = solution
        
    def fit(self, df):
        self.vector_data = self.vectorizer.fit_transform(df['text'])
        self.topic_data = self.reducer.fit_transform(self.vector_data)
        self.df = df_ticket
        return self.topic_data
    
    def _print_recommendations(self, article, rec_list):
        p = inflect.engine()
        print('NEW MESSAGE:', article)
        i = 1
        for resp in self.df.loc[rec_list]['case_id']:
            print(f'\n---{p.ordinal(i)} SIMILAR CONVERSATION---\n')
            self._display_conversation(resp)
            print('\n')
            i += 1
    
    def _display_conversation (self, case_id):
        cust_conv = self.ticket[self.ticket['case_id'] == case_id][['created_at', 'inbound', 'text']]
        agent_conv = self.solution[self.solution['case_id'] == case_id][['created_at', 'inbound', 'text']]
        conv_list = pd.concat([cust_conv, agent_conv]).sort_values('created_at')
        for i in range(conv_list.shape[0]):
            if (conv_list.iloc[i]['inbound']):
                print ('CUSTOMER:   ', colored(conv_list.iloc[i]['text'], 'red'))
            else:
                print ('AGENT:      ', colored(conv_list.iloc[i]['text'], 'green'))
        
    def recommend(self, article, num_to_return):
        article_vectorized = self.vectorizer.transform([article])
        article_topic_vector = self.reducer.transform(article_vectorized)
        nn = NearestNeighbors(n_neighbors=num_to_return, metric='cosine', algorithm='brute')
        nn.fit(self.topic_data[self.resolved_mask])
        results = nn.kneighbors(article_topic_vector)
        #result_texts = [self.texts[i] for i in results[1][0]]
        rec_list = results[1][0]
        print ('number of results: ', len(rec_list))
        self._print_recommendations(article, rec_list)
        
        return rec_list

In [85]:
ticket_origin = pd.merge(ticket_reserve[['tweet_id', 'created_at', 'inbound', 'text']], ticket[['tweet_id', 'case_id']], left_on='tweet_id', right_on='tweet_id', how='right')
solution_origin = pd.merge(solution_reserve[['tweet_id', 'created_at', 'inbound', 'text']], solution[['tweet_id', 'case_id']], left_on='tweet_id', right_on='tweet_id', how='right')

In [86]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),  
                                   stop_words=my_stop_words, 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6)

count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6)


#tfidf_data = tfidf_vectorizer.fit_transform(list(df_ticket.text))

In [138]:
resolved_ticket = df_ticket[df_ticket['sentiment'] == 'POS']

In [162]:
engine_1 = RecommendationEngine(tfidf_vectorizer, n_components = 300, reducer = TruncatedSVD, ticket = ticket_origin, solution=solution_origin)
topic_data = engine_1.fit(df_ticket)

In [163]:
topic_data.shape

(28787, 300)

In [187]:
engine_2 = RecommendationEngine(count_vectorizer, n_components = 20, reducer = NMF, ticket = ticket_origin, solution=solution_origin)
engine_2.fit(resolved_ticket)

array([[1.94180905e-03, 0.00000000e+00, 1.57093790e-01, ...,
        1.32147808e-02, 0.00000000e+00, 0.00000000e+00],
       [7.29738585e-02, 0.00000000e+00, 5.14639909e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.39817108e-04, 0.00000000e+00, 1.55882332e-01, ...,
        0.00000000e+00, 1.71634402e-03, 0.00000000e+00],
       ...,
       [1.33665449e-04, 1.03610178e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.89677209e-02, 0.00000000e+00],
       [1.33434110e-01, 1.92435947e-03, 5.68934551e-04, ...,
        1.55217317e-02, 2.01762020e-03, 0.00000000e+00]])

In [188]:
engine_3 = RecommendationEngine(tfidf_vectorizer, n_components = 100, reducer = NMF, ticket = ticket_origin, solution=solution_origin)
engine_3.fit(resolved_ticket)

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.08495029e-07, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.75172016e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.51387386e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.65453942e-03],
       [0.00000000e+00, 0.00000000e+00, 1.75215421e-03, ...,
        0.00000000e+00, 0.00000000e+00, 9.09550852e-03]])

In [191]:
engine_dict = {1:engine_1, 2:engine_2, 3:engine_3}

In [192]:
with open('./data/processed/engines.pkl', 'wb') as picklefile:
    pickle.dump(engine_dict, picklefile)

In [142]:
cust_first_tweet = ticket.groupby('case_id')[['text']].first().reset_index()
cust_first_tweet.text[np.random.choice(cust_first_tweet.shape[0])]

In [152]:
engine_1.recommend(ticket.text[2], 3)

number of results:  3
NEW MESSAGE: cool cool cool i think i am finally ready to switch to apple music

---1st SIMILAR CONVERSATION---

CUSTOMER:    [31m@SpotifyCares Uh why is there a limit on how many songs I can "save" — how am I supposed to keep track of the albums I like? https://t.co/bk7Y7Jt93A[0m
AGENT:       [32m@1084 Hey Nick! There's currently a 10k limit in Your Music (Songs + Albums + Artists). More info here: https://t.co/59ABtm5Ftl /MO[0m
CUSTOMER:    [31m@SpotifyCares cool cool cool I think I'm finally ready to switch to Apple Music[0m
AGENT:       [32m@1084 Sorry to hear you feel that way. For now, you can vote for the idea here: https://t.co/8OOl8B2i69 and get support from other users /MO[0m



---2nd SIMILAR CONVERSATION---

CUSTOMER:    [31m@115888 You're cool, Spotify.[0m
AGENT:       [32m@641342 Hey Alice! Thanks. You're not wrong, things sure are cooling down around the office. Brr... ⛄ https://t.co/uvDmDXjQFv[0m



---3rd SIMILAR CONVERSATION---

CUSTO

array([    0, 21155, 24581])

In [157]:
engine_2.recommend(ticket.text[2], 3)

number of results:  3
NEW MESSAGE: cool cool cool i think i am finally ready to switch to apple music

---1st SIMILAR CONVERSATION---

CUSTOMER:    [31m@115888 how dare you erase Julion Alvarez?![0m
AGENT:       [32m@382004 Hey Selene. Thanks for reaching out! There's some info on Spotify content here: https://t.co/0i8GpimuDa /MG[0m
CUSTOMER:    [31m@SpotifyCares But he’s extremely popular and talented... you need to get him back ASAP[0m
AGENT:       [32m@382004 We do our best! But sometimes agreements are ongoing, or can't be reached with the content provider. Hopefully we'll have him back soon /RV[0m



---2nd SIMILAR CONVERSATION---

CUSTOMER:    [31m@115888 ok now its doing it to several albums by different artists including THIRD EYE BLIND's SELF TITLED CMON[0m
AGENT:       [32m@329305 1: Hey, that's not cool. Can you send us the Song Links of the tracks affected? Just tap the three dots &gt; Share &gt; Copy Link.[0m
AGENT:       [32m@329305 2: Also, what country is y

array([10136,  8215,  4807])

In [160]:
engine_3.recommend(ticket.text[2], 3)

number of results:  3
NEW MESSAGE: cool cool cool i think i am finally ready to switch to apple music

---1st SIMILAR CONVERSATION---

CUSTOMER:    [31m@SpotifyCares Uh why is there a limit on how many songs I can "save" — how am I supposed to keep track of the albums I like? https://t.co/bk7Y7Jt93A[0m
AGENT:       [32m@1084 Hey Nick! There's currently a 10k limit in Your Music (Songs + Albums + Artists). More info here: https://t.co/59ABtm5Ftl /MO[0m
CUSTOMER:    [31m@SpotifyCares cool cool cool I think I'm finally ready to switch to Apple Music[0m
AGENT:       [32m@1084 Sorry to hear you feel that way. For now, you can vote for the idea here: https://t.co/8OOl8B2i69 and get support from other users /MO[0m



---2nd SIMILAR CONVERSATION---

CUSTOMER:    [31m@115888 is there a way to remove an block devices from accessing my account because someone has connected there device to my account[0m
AGENT:       [32m@346030 Hey there! We’re sorry to hear that. Check out https://t.co

array([    0,  8760, 10224])

In [164]:
import gensim
google_vec_file = '~/Downloads/GoogleNews-vectors-negative300.bin'
google_model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

2018-08-21 15:27:24,512 : INFO : loading projection weights from ~/Downloads/GoogleNews-vectors-negative300.bin
2018-08-21 15:28:36,466 : INFO : loaded (3000000, 300) matrix from ~/Downloads/GoogleNews-vectors-negative300.bin


In [169]:
# Function to take a document as a list of words and return the document vector
def get_doc_vec(words, model):
    good_words = []
    for word in words:
        # Words not in the original model will fail
        try:
            if model.wv[word] is not None:
                good_words.append(word)
        except:
            continue
    # If no words are in the original model
    if len(good_words) == 0:
        return None
    # Return the mean of the vectors for all the good words
    return model.wv[good_words].mean(axis=0)

In [178]:
ticket_google_vecs = resolved_ticket.text.apply(lambda x: get_doc_vec(x.split(), google_model))

  import sys
  from ipykernel import kernelapp as app


In [183]:
google_model['the fat cat drank milk'.split()].shape

(5, 300)