**Project plan**

*Data preprocessing
1. Detect the english tweets
https://github.com/rfk/pyenchant

*Sentiment Analysis*
1. Find a sentiment analysis NN to rate last tweets
2. DONE - Extract the emoji's and incorporate the emoji sentiment <br>
https://stackoverflow.com/questions/43852668/using-collections-counter-to-count-emojis-with-different-colors<br>
https://stackoverflow.com/questions/48340622/extract-all-emojis-from-string-and-ignore-fitzpatrick-modifiers-skin-tones-etc

*Build a pipeline for similar tweet recommender*
1. Implement class for recommend engine
2. Build a pipeline for word embedding, vectorizing, and recommendation
3. Try out Glove, Google embedding.. etc

In [409]:
import string
import emoji
import regex
import pickle
import re
import enchant
import math

import numpy as np
import pandas as pd
import vaderSentiment

import matplotlib.pyplot as plt

from itertools import compress
from tqdm import tqdm_notebook
from termcolor import colored


# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF, TruncatedSVD
import nltk.sentiment.vader
from sklearn.neighbors import NearestNeighbors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
    
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
# logging for gensim (set to INFO)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

%matplotlib inline

In [118]:
twitter = pd.read_csv("./customer-support-on-twitter/twcs.csv")

### Exclude tweets with non-English characters

In [119]:
def fixContractions(text):
    with open('./data/helper/contraction_list.pkl', 'rb') as picklefile:
        cList = pickle.load(picklefile)
    for word in text.split():
        if word.lower() in cList:
            text = text.replace(word, cList[word.lower()])
    return text

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https:\/\/t.co\/\w{10}', '', text) # Remove URL link
    text = re.sub(r'@\w+', '', text) # Remove @account
    text = fixContractions(text) # Expand contractions
    return text

In [120]:
twitter['text'] = twitter['text'].apply(lambda x:clean_text(x))

In [99]:
twitter.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,i understand. i would like to assist you. we ...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,i have sent several private messages and no o...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,please send us a private message so that we c...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,i did.,4.0,6.0


In [121]:
def get_emoji_free_text(text):
    text = text.replace(u'\u200d', '')
    allchars = [str for str in text]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    if (len(emoji_list) == 0):
        return np.NaN, clean_text
    else:
        return emoji_list, clean_text

In [122]:
twitter[['emoji','text']] = twitter['text'].apply(lambda x:pd.Series(get_emoji_free_text(x)))

In [102]:
twitter.iloc[100:200,:]

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,emoji
100,180,115743,True,Tue Oct 31 08:17:37 +0000 2017,"so, what is the november ps plus free game?",178181,,
101,182,AskPlayStation,False,Tue Oct 31 22:09:50 +0000 2017,"glad to know that is downloading, please feel ...",,183.0,
102,183,115745,True,Tue Oct 31 22:06:44 +0000 2017,it was when i would go to download it. i just ...,182,184.0,
103,184,AskPlayStation,False,Tue Oct 31 22:05:06 +0000 2017,sorry for the inconvenience. do you see the da...,183,185.0,
104,185,115745,True,Tue Oct 31 08:14:24 +0000 2017,i bought lego star wars in the playstation sto...,184186,,
105,187,AskPlayStation,False,Tue Oct 31 22:06:34 +0000 2017,glad to help. please share the details of this...,,188.0,
106,188,115746,True,Tue Oct 31 07:58:24 +0000 2017,can i block a community from sending me invite...,187189,,
107,190,marksandspencer,False,Tue Oct 31 21:54:47 +0000 2017,"glad you had fun, sarah! we heard count collin...",,191.0,
108,191,115747,True,Tue Oct 31 18:29:42 +0000 2017,we had to have a count colin xx...,190,,[💛]
109,192,marksandspencer,False,Tue Oct 31 21:48:52 +0000 2017,"we have let our team know we can do better, jo...",,193.0,


In [110]:
test = twitter.loc[110, 'emoji']

In [111]:
type(test) == float

False

In [123]:
with open('./data/processed/tweets_step1_emoji_extracted.pkl', 'wb')as picklefile:
    pickle.dump(twitter, picklefile)

In [124]:
with open('./data/processed/tweets_step1_emoji_extracted.pkl', 'rb')as picklefile:
    twitter = pickle.load(picklefile)
twitter.head(100)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,emoji
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,i understand. i would like to assist you. we w...,2,3.0,
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,and how do you propose we do that,,1.0,
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,i have sent several private messages and no on...,1,4.0,
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,please send us a private message so that we ca...,3,5.0,
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,i did.,4,6.0,
5,6,sprintcare,False,Tue Oct 31 21:46:24 +0000 2017,"can you please send us a private message, so t...",57,8.0,
6,8,115712,True,Tue Oct 31 21:45:10 +0000 2017,is the worst customer service,9610,,
7,11,sprintcare,False,Tue Oct 31 22:10:35 +0000 2017,this is saddening to hear. please shoot us a d...,,12.0,
8,12,115713,True,Tue Oct 31 22:04:47 +0000 2017,you gonna magically change your connectivity f...,111314,15.0,"[🤥, 💯]"
9,15,sprintcare,False,Tue Oct 31 20:03:31 +0000 2017,we understand your concerns and we had like fo...,12,16.0,


### Retrieve sentiment score for emoji's
ref:http://kt.ijs.si/data/Emoji_sentiment_ranking/

In [125]:
def build_dict(emoji_table, emoji_sentiment_dict):
    type_set = set()
    for i in range(emoji_table.shape[0]):
        code = emoji_table.loc[i, 'Python Code']
        try:
            key = chr(int(code[1:], 16))
            emoji_sentiment_dict[key] = emoji_table.loc[i, 'Sentiment score']
        except:
            type_set.add(emoji_table.loc[i, 'Unicode block'])
    return emoji_sentiment_dict

In [126]:
emoji_sentiment = pd.read_csv('./data/raw/emoji_nodingbat.csv')
emoji_other = pd.read_csv('./data/raw/emoji_other.csv')
emoji_sentiment_dict = {}

emoji_sentiment_dict = build_dict(emoji_sentiment, emoji_sentiment_dict)
emoji_sentiment_dict = build_dict(emoji_other, emoji_sentiment_dict)

In [127]:
def get_emoji_sentiment(emoji_list):
    score = 0
    if (type(emoji_list) != float):
        for emoji_item in emoji_list:
            if (emoji_item in emoji_sentiment_dict.keys()):
                score += emoji_sentiment_dict[emoji_item]
        if (score == 0):
            return np.NaN
        else:
            return score/len(emoji_list)
    else:
        return np.NaN

In [169]:
#twitter['emoji_score'] = twitter['emoji'].apply(lambda x:get_emoji_sentiment(x))
print ('Number of tweets with emoji score: ', twitter[~twitter['emoji_score'].isnull()].shape[0])
print ('Number of tweets with emoji but not score: ', twitter[twitter['emoji_score'].isnull() & (~twitter['emoji'].isnull())].shape[0])

Number of tweets with emoji score:  161402
Number of tweets with emoji but not score:  33223


In [129]:
with open('./data/processed/tweets_step2_emoji_sentiment.pkl', 'wb')as picklefile:
    pickle.dump(twitter, picklefile)

In [167]:
with open('./data/processed/tweets_step2_emoji_sentiment.pkl', 'rb')as picklefile:
    twitter = pickle.load(picklefile)

### Remove non-English tweets

In [170]:
# Retrieve elements that are printable in English context - This is a 
# combination of digits, letters, punctuation, and whitespace.
valid_content = set(string.printable)

# Include some common non-English punctuations
non_English_punctuation = ['‘', '’', 'é', '–','—','“','”','、','。','`','️', '£', '…', '$']

# Keep tweets with common non-English punctuations
[valid_content.add(x) for x in (non_English_punctuation)]
    
def isEnglish(tweet):
    invalid_index = [x not in valid_content for x in tweet]
    if (np.sum(invalid_index)==0):
        return True
    else:
        return False

## Removed 78k tweets with non-English characters
english_tweets = twitter.text.apply(lambda x:isEnglish(x))
tweets = twitter.loc[english_tweets, :]

In [171]:
# A quick tool to return non-English words
def non_English(words):
    print (list(compress(list(words), [x not in valid_content for x in words])))

### Format the dates

In [173]:
# Format the dates
tweets['created_at'] = pd.to_datetime(tweets['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [174]:
tweets['in_response_to_tweet_id'] = tweets['in_response_to_tweet_id'].fillna(-1)
tweets['in_response_to_tweet_id'] = tweets['in_response_to_tweet_id'].astype(int)
tweets['response_tweet_id'] = tweets['response_tweet_id'].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [175]:
with open('./data/processed/tweets_step3_english.pkl', 'wb')as picklefile:
    pickle.dump(tweets, picklefile)

In [176]:
with open('./data/processed/tweets_step3_english.pkl', 'rb')as picklefile:
    tweets = pickle.load(picklefile)

### Add custom_id and brand_id columns

In [201]:
tweets.shape

(2710745, 10)

In [198]:
# Add custom_id column
tweets = pd.merge(tweets,pd.DataFrame(tweets[['tweet_id', 'author_id']]),left_on='in_response_to_tweet_id', right_on='tweet_id', how='left')
tweets.loc[tweets['inbound'] == True, 'author_id_y'] = tweets.loc[tweets['inbound'] == True, 'author_id_x']
tweets['author_id_y'] = tweets['author_id_y'].fillna(-1)
tweets.drop('tweet_id_y', axis=1, inplace=True)
tweets.rename(columns={'author_id_y':'cust_id', 
                       'author_id_x':'author_id',
                      'tweet_id_x':'tweet_id'}, inplace=True)

# Drop the tweets without customer id, those are usually promotional tweets initiated by
# brands agent or customers not in the datase: Number of such tweets: 71255
tweets = tweets[~tweets['cust_id'].isnull()]

In [227]:
# Add brand_id column

# Carve out the first tweet id when there are multiples in response_tweet_id
tweets['processed_response_id'] = tweets['response_tweet_id'].apply(
    lambda x: x if (isinstance(x,int)) else x.split(',')[0])
tweets['processed_response_id'] = tweets['processed_response_id'].astype(int)

# Retrieve the author id of the responde tweet
tweets = pd.merge(tweets,pd.DataFrame(tweets[['tweet_id', 'author_id']]),
                  left_on='processed_response_id', 
                  right_on='tweet_id', 
                  how='left')

tweets.drop('tweet_id_y', axis=1, inplace=True)
tweets.rename(columns={'author_id_y':'brand_id', 
                       'author_id_x':'author_id',
                      'tweet_id_x':'tweet_id'}, inplace=True)

tweets = pd.merge(tweets,pd.DataFrame(tweets[['tweet_id', 'author_id']]),left_on='in_response_to_tweet_id', right_on='tweet_id', how='left')
tweets.loc[(tweets['inbound'] & (tweets['processed_response_id'] == -1)), 'brand_id'] = (
    tweets.loc[(tweets['inbound'] & (tweets['processed_response_id'] == -1)), 'author_id_y'])
tweets.rename(columns={'tweet_id_x':'tweet_id', 
                       'author_id_x':'author_id'}, inplace=True)

tweets.loc[tweets['inbound'] == False, 'brand_id'] = tweets.loc[tweets['inbound'] == False, 'author_id']
tweets.drop(['processed_response_id', 'tweet_id_y', 'author_id_y'], axis=1, inplace=True)

# could use some more logic to extract the account name in tweet text

In [233]:
# Tweets with no response is 3880
print (tweets[tweets['brand_id'].isnull() & (tweets['response_tweet_id'] == -1)].shape)

# Tweets with response outside the dataset is 42764
print (tweets[tweets['brand_id'].isnull() & (tweets['response_tweet_id'] != -1)].shape)

# Drop the tweets without brand_id
tweets = tweets[~tweets['brand_id'].isnull()]

In [238]:
with open('./data/processed/tweets_step4_brand_cust.pkl', 'wb')as picklefile:
    pickle.dump(tweets, picklefile)

In [4]:
with open('./data/processed/tweets_step4_brand_cust.pkl', 'rb')as picklefile:
    tweets = pickle.load(picklefile)

In [5]:
tweets.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,emoji,emoji_score,cust_id,brand_id
0,1,sprintcare,False,2017-10-31 22:10:47,i understand. i would like to assist you. we w...,2,3,,,115712,sprintcare
1,2,115712,True,2017-10-31 22:11:45,and how do you propose we do that,-1,1,,,115712,sprintcare
2,3,115712,True,2017-10-31 22:08:27,i have sent several private messages and no on...,1,4,,,115712,sprintcare
3,4,sprintcare,False,2017-10-31 21:54:49,please send us a private message so that we ca...,3,5,,,115712,sprintcare
4,5,115712,True,2017-10-31 21:49:35,i did.,4,6,,,115712,sprintcare


In [6]:
# Identify which tweets are in the same conversation (customer case)
tweets.sort_values(['cust_id', 'created_at'], inplace=True)
tweets.reset_index(inplace=True)

# Calculate the time difference (in seconds) bwetween a tweet and its previous tweet
tweets['time_diff'] = tweets['created_at'].diff()
tweets['time_diff'] = tweets['time_diff'].apply(lambda x:x.total_seconds())
tweets['time_diff'] = tweets['time_diff'].apply(lambda x: 0 if x <0 else x)
tweets['time_diff'] = tweets['time_diff'].fillna(0)

tweets['case_id'] = 0
tweets['time_diff'] = tweets['time_diff'].fillna(0) 

### Carve out the spotify support data

In [7]:
def analyze_brand(tweets, brand_cc_name):
    ## Subset tweets according to brand
    brand = tweets[tweets['brand_id'] == brand_cc_name]
    brand.reset_index(inplace=True)
    
    ## Assign unique caseid for each conversation (interchange of tweets)
    ## To avoid grouping different cases raised by the same customer at different time
    ## We set the max threshold between subsequent tweets in a conversation as 2 days
    caseid = 1
    brand.loc[0, 'case_id'] = caseid
    for row in range(1, brand.shape[0]):
        if (row%1000 == 0):
            print (row)
        if ((brand.loc[row, 'cust_id'] != brand.loc[row-1, 'cust_id']) |
            (brand.loc[row, 'time_diff'] > 3600*24*2)):
            caseid = caseid + 1
            brand.loc[row, 'time_diff'] = 0 # Set time between conversations as 0
            brand.loc[row, 'case_id'] = caseid
        else:
            brand.loc[row, 'case_id'] = caseid
    return brand

In [8]:
spotify = analyze_brand(tweets, 'SpotifyCares')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000


In [9]:
with open('./data/processed/spotify.pkl', 'wb')as picklefile:
    pickle.dump(spotify, picklefile)

In [235]:
with open('./data/processed/spotify.pkl', 'rb')as picklefile:
    spotify = pickle.load(picklefile)

### Remove ticket without solution and solution without ticket

In [22]:
conv_count = spotify.groupby('case_id')['case_id'].count()
single_conv_index = conv_count[conv_count==1].index.tolist()
spotify['single_conv'] = spotify['case_id'].apply(
    lambda x: True if x in single_conv_index else False)

# Remove the conversations with only one tweet (424 ) 
spotify = spotify[~spotify['single_conv']]
spotify.drop('single_conv', axis=1, inplace=True)

spotify.drop(['level_0', 'index'], axis=1, inplace=True)

In [23]:
spotify.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,emoji,emoji_score,cust_id,brand_id,time_diff,case_id
0,2747480,1084,True,2017-11-20 22:40:16,uh why is there a limit on how many songs i ca...,2747478,-1,,,1084,SpotifyCares,0.0,1
1,2747478,SpotifyCares,False,2017-11-20 22:46:58,hey nick! there is currently a 10k limit in yo...,2747479,2747480,,,1084,SpotifyCares,402.0,1
2,2747479,1084,True,2017-11-20 22:55:34,cool cool cool i think i am finally ready to s...,2747481,2747478,,,1084,SpotifyCares,516.0,1
3,2747481,SpotifyCares,False,2017-11-20 23:12:33,"sorry to hear you feel that way. for now, you ...",-1,2747479,,,1084,SpotifyCares,1019.0,1
4,2606352,11543,True,2017-10-30 19:02:36,there used to be a useful feature in ios app t...,2606350,-1,,,11543,SpotifyCares,0.0,2


In [52]:
ticket = spotify[spotify['inbound'] == True]
solution = spotify[spotify['inbound'] == False]

In [58]:
# Remove ticket without solution or solution without ticket 
# conversations with tweets from only customer or customer agents

ticket = ticket.apply(
    lambda x: x if (x[12] in solution['case_id'].unique()) else None, axis = 1)
solution = solution.apply(
    lambda x: x if (x[12] in ticket['case_id'].unique()) else None, axis = 1)

In [59]:
#ticket.text.fillna('', inplace=True)
#solution.text.fillna('', inplace=True)
ticket_agg = ticket.groupby('case_id')['text'].agg(lambda x:' '.join(x))
solution_agg = solution.groupby('case_id')['text'].agg(lambda x:' '.join(x))

In [None]:
spotify = spotify.apply(
    lambda x: x if (x[10] in ticket['case_id'].unique()) else None, axis=1)

In [47]:
ticket['case_id'].unique().shape

(28788,)

### Perform very basic sentiment analysis on last tweet from customer

In [49]:
cust_last_tweet = ticket.groupby('case_id')['text'].last().reset_index()

In [104]:
cust_last_tweet = ticket.groupby('case_id')[['text','emoji', 'emoji_score']].last().reset_index()

In [105]:
vdanalyzer = vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer()
cust_last_tweet['vader_sentiment'] = cust_last_tweet['text'].apply(
        lambda x:vdanalyzer.polarity_scores(x)['compound'])

In [106]:
sentiment_analyzer = nltk.sentiment.vader.SentimentIntensityAnalyzer()
cust_last_tweet['ntlk_sentiment'] = cust_last_tweet['text'].apply(
        lambda x:sentiment_analyzer.polarity_scores(x)['compound'])

In [123]:
cust_last_tweet.to_csv('./data/processed/processed_last_tweet_sentiment.csv')

In [108]:
pd.set_option('display.max_colwidth', -1)
print (ticket[ticket['tweet_id'] == 2606351.0].text.astype(str))

6    taking features away will only make it easier for users to go. i’ve been a premium user for 4+ years... don’t let me leave.
Name: text, dtype: object


In [259]:
cust_last_tweet['sentiment'] = cust_last_tweet.apply(lambda
                        x:get_overall_sentiment_score(x), axis=1)

In [181]:
cust_last_tweet[['emoji_score', 'ntlk_sentiment']] = cust_last_tweet[['emoji_score', 'ntlk_sentiment']].astype(float)

In [206]:
def get_overall_sentiment_score(row):
    sentiment = ''
    if (not math.isnan(row[3])):
        if (row[3] >= 0.2):
            sentiment = 'POS'
        elif ((row[3] < 0.2) & (row[3] > -0.05)):
            sentiment = 'NEU'
        elif (row[3] <= -0.05):
            sentiment = 'NEG'
    elif (not math.isnan(row[5])):
        if (row[5] >= 0.2):
            sentiment = 'POS'
        elif ((row[5] < 0.2) & (row[5] > -0.05)):
            sentiment = 'NEU'
        elif (row[5] <= -0.05):
            sentiment = 'NEG'
    return sentiment

In [222]:
spotify.rename(columns ={'sentiment':'sentiment'}, inplace=True)

In [261]:
spotify = pd.merge(spotify, cust_last_tweet[['case_id', 'sentiment']],
                   left_on='case_id', right_on='case_id',
                   how='left')

In [263]:
spotify.drop('overall_sentiment_score', axis=1, inplace=True)

In [229]:
with open('./data/processed/tweets_step5_spotify_sentiment.pkl', 'wb')as picklefile:
    pickle.dump(spotify, picklefile)

In [230]:
with open('./data/processed/tweets_step5_spotify_sentiment.pkl', 'rb')as picklefile:
    spotify = pickle.load(picklefile)

### Tweets preprocessing

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re

In [266]:
df_ticket = pd.DataFrame(ticket_agg)
df_ticket.reset_index(inplace=True)

In [327]:
df_solution = pd.DataFrame(solution_agg)
df_solution.reset_index(inplace=True)

In [272]:
df_ticket = pd.merge(df_ticket, cust_last_tweet[['case_id', 'sentiment']], left_on='case_id', right_on='case_id',
                   how='left')

In [281]:
def text_preprocess(text):
    tokens = word_tokenize(text)
    lemmatize = WordNetLemmatizer()
    clean_text = [lemmatize.lemmatize(token.lower().strip(), pos='a') for token in tokens]
    clean_text = [x for x in clean_text if x not in my_stop_words]
    return ' '.join(clean_text)

In [282]:
df_ticket['text'] = df_ticket.text.apply(lambda x:text_preprocess(x))

### Perform word embedding 

In [284]:
custom_stopwords = ['http', 'https', 'spotify', 'help', 'hi', 'spotifycares']
my_stop_words = (text.ENGLISH_STOP_WORDS.union(custom_stopwords)
                 .union(string.punctuation))

In [286]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),  
                                   stop_words=my_stop_words, 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6)
tfidf_data = tfidf_vectorizer.fit_transform(list(df_ticket.text))

In [None]:
class RecommendationEngine:
    def __init__(self, vectorizer, n_components, reducer):
        self.vectorizer = vectorizer
        self.n_dim = n_components
        self.reducer = reducer(n_components)
        
    def fit(self, text):
        self.vector_data = self.vectorizer.fit_transform(text)
        self.topic_data = self.reducer.fit_transform(self.vector_data)
        self.texts = text
        
    def recommend(self, article, num_to_return):
        article_vectorized = self.vectorizer.transform([article])
        article_topic_vector = self.reducer.transform(article_vectorized)
        nn = NearestNeighbors(n_neighbors=num_to_return, metric='cosine', algorithm='brute')
        nn.fit(self.topic_data)
        results = nn.kneighbors(article_topic_vector)
        
        result_texts = [self.texts[i] for i in results[1][0]]
        
        return result_texts

In [288]:
n_comp = 300
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)

In [289]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [290]:
display_topics(lsa_tfidf,tfidf_vectorizer.get_feature_names(),10)


Topic  0
account, premium, family, student, just, need, email, premium account, app, thanks

Topic  1
account, premium, student, premium account, charged, email, family, hulu, facebook, hacked

Topic  2
dm, sent, sent dm, need, check, check dm, thanks, account, email, thank

Topic  3
premium, dm, student, charged, discount, family, sent, student discount, month, sent dm

Topic  4
app, iphone, update, student, charged, dm, app iphone, update app, discount, student discount

Topic  5
need, reputation, student, charged, thank, discount, album, student discount, available, hulu

Topic  6
need, family, iphone, app, update, premium, plan, family plan, need account, app iphone

Topic  7
reputation, iphone, thank, update, app, available, reputation available, app iphone, update app, family

Topic  8
thank, need, family, thank thank, dm thank, iphone, plan, account thank, family plan, thank guys

Topic  9
thanks, student, family, plan, email, discount, family plan, student discount, charged, u

username, wrong, sign, android, care, access, trying, paid, code, know

Topic  98
use, come, username, sign, android, payment, discount, getting, im, artist

Topic  99
username, waiting, im, come, using, payment, change username, isn, sign, getting

Topic  100
waiting, paid, come, paid premium, reply, playing, fuck, contact, sign, use

Topic  101
come, isn, address, contact, email address, isn working, trying, discount, year, using

Topic  102
username, use, tried, avail, feature, ve, wan, wan na, come, student

Topic  103
waiting, discount, contact, premium account, student discount, trying, hulu, ios, reply, wtf

Topic  104
address, waiting, wan na, wan, email address, contact, username, try, money, bring

Topic  105
address, waiting, im, phone, sign, albums, use, email address, feature, paid

Topic  106
isn, isn working, bring, did, using, trying, ve, android, know, waiting

Topic  107
feature, wan na, wan, keeps, preview, download, ios, shuffle, touch, touch preview

Topic  108
con

pesos, info, tell, fb, right, old, years, bad, number, day

Topic  191
happened, number, won, accounts, thing, great, phone number, offer, okay, mobile

Topic  192
thing, bad, okay, remove, people, search, years, sort, ago, week

Topic  193
thing, annoying, number, think, anymore, won, search, mobile, yes, fb

Topic  194
anymore, okay, let, thing, website, late, yes, factor, important like, supporttwofactorauth

Topic  195
late, member, message, family member, cool, old, used, send, card, eligible

Topic  196
happened, important, factor, security, factor auth, supporttwofactorauth, security important, important like, supported factor, auth supporttwofactorauth

Topic  197
info, late, security, factor, supporttwofactorauth, auth supporttwofactorauth, like supported, security important, important like, supported factor

Topic  198
tell, mobile, updated, dear, deleted, try, africa, bad, south, south africa

Topic  199
limit, possible, people, years, deal, ago, sure, great, credit, rap

To

removed, thx, devices, uk, wait, happen, upgraded, verified, response, open

Topic  279
look, used, dont, open, fucking, instead, premium subscription, lost, eligible, linked

Topic  280
paypal, messaged, idea, promo, removed, ll, wifi, happens, track, profile

Topic  281
instead, charging, log account, code, wifi, galaxy, student premium, removed, real, information

Topic  282
real, taken, uk, price, wait, response, streaming, paypal, linked, happen

Topic  283
real, ur, customers, upgraded, set, linked, information, tho, looks, customer

Topic  284
oh, real, removed, messaged, fucking, premium family, information, pretty, podcasts, online

Topic  285
dont, thx, removed, years, taken, fucking, ll, monthly, response, time capsule

Topic  286
lol, follow, isnt, pretty, actually, used, ll, cheers, looks, loading

Topic  287
messaged, taken, mixtapes, queue, computer, charging, customers, lost, dont, used

Topic  288
cc, loading, customers, happens, unable, account hacked, pretty, hear, p

In [376]:
def get_recommendations(first_article, model, vectorizer, training_vectors):
    '''
    first_article: (string) An article that we want to use to find similar articles
    model: (a fit dimensionality reducer) Projects vectorized words onto a subspace 
           (uses NMF or SVD/LSA typically)
    vectorizer: Vectorizes first_article 
    training_vectors: (numpy array shape) a (num_docs in training) x (NMF/SVD/LSA) dimensional array.
                      Used to train NearestNeighbors model
    '''
    new_vec = model.transform(
        vectorizer.transform([first_article]))
    nn = NearestNeighbors(n_neighbors=3, metric='cosine', algorithm='brute')
    nn.fit(training_vectors)
    results = nn.kneighbors(new_vec)
    return results[1][0] # 1st row is the score

In [401]:
ticket.head()

Unnamed: 0_level_0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,emoji,emoji_score,cust_id,brand_id,time_diff,case_id
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,2747480.0,1084,True,2017-11-20 22:40:16,"uh why is there a limit on how many songs i can ""save"" — how am i supposed to keep track of the albums i like?",2747478,-1.0,,,1084,SpotifyCares,0.0,1.0
1.0,2747479.0,1084,True,2017-11-20 22:55:34,cool cool cool i think i am finally ready to switch to apple music,2747481,2747478.0,,,1084,SpotifyCares,516.0,1.0
2.0,2606352.0,11543,True,2017-10-30 19:02:36,there used to be a useful feature in ios app that allowed user to hear sample of song by touch+hold on song. gone?,2606350,-1.0,,,11543,SpotifyCares,0.0,2.0
2.0,2606351.0,11543,True,2017-10-30 21:34:32,taking features away will only make it easier for users to go. i’ve been a premium user for 4+ years... don’t let me leave.,2606353,2606350.0,,,11543,SpotifyCares,2293.0,2.0
3.0,855.0,115887,True,2017-10-31 19:10:51,i’m pissed my shuffle and repeat button just don’t fucking work and i’m getting frustrated,854,-1.0,,,115887,SpotifyCares,0.0,3.0


In [406]:
df_ticket.text[0]

"uh limit songs `` save '' — supposed track albums like cool cool cool think finally ready switch apple music"

In [407]:
import inflect
p = inflect.engine()

def print_recommendations(first_tweet, recommend_list):
    print('NEW MESSAGE:', first_tweet)
    i = 1
    for resp in df_ticket.loc[rec_list]['case_id']:
        print(f'\n---{p.ordinal(i)} SIMILAR CONVERSATION---\n')
        display_conversation(resp)
        print('\n')
        i += 1

In [408]:
rec_list = get_recommendations(df_ticket.text[0], lsa_tfidf, tfidf_vectorizer, lsa_tfidf_data)
print_recommendations(df_ticket.text[0], rec_list)

NEW MESSAGE: uh limit songs `` save '' — supposed track albums like cool cool cool think finally ready switch apple music

---1st SIMILAR CONVERSATION---

CUSTOMER:    [31muh why is there a limit on how many songs i can "save" — how am i supposed to keep track of the albums i like?[0m
AGENT:       [32mhey nick! there is currently a 10k limit in your music (songs + albums + artists). more info here: /mo[0m
CUSTOMER:    [31mcool cool cool i think i am finally ready to switch to apple music[0m
AGENT:       [32msorry to hear you feel that way. for now, you can vote for the idea here: and get support from other users /mo[0m



---2nd SIMILAR CONVERSATION---

CUSTOMER:    [31myou are cool, spotify.[0m
AGENT:       [32mhey alice! thanks. you are not wrong, things sure are cooling down around the office. brr...[0m



---3rd SIMILAR CONVERSATION---

CUSTOMER:    [31mthis is cool[0m
AGENT:       [32mthanks for the feedback, sara! it is music to our ears if there is anything we can

In [374]:
def display_conversation (case_id):
    cust_conv = ticket[ticket['case_id'] == case_id][['created_at', 'inbound', 'text']]
    agent_conv = solution[solution['case_id'] == case_id][['created_at', 'inbound', 'text']]
    conv_list = pd.concat([cust_conv, agent_conv]).sort_values('created_at')
    for i in range(conv_list.shape[0]):
        if (conv_list.iloc[i]['inbound']):
            print ('CUSTOMER:   ', colored(conv_list.iloc[i]['text'], 'red'))
        else:
            print ('AGENT:      ', colored(conv_list.iloc[i]['text'], 'green'))