In [6]:
from langdetect import detect
import os
import re
from collections import defaultdict

In [7]:
import pandas as pd

data = defaultdict(lambda: {'topic': '', 'content': '', 'messages': []})

for file_path in os.listdir("cleaned_intercom_messages"):
    csv = pd.read_csv(f"cleaned_intercom_messages/{file_path}")
    csv = csv[csv['clean_body'].notna()]
    bodies = csv["clean_body"]
    ids = csv["key_intercomconversation"]
    for id, body in zip(ids, bodies):
        m = re.search(" Selected category: (.+) Content: (.+) uid:", body)
        if m is not None:
            data[id]['topic'] = m.group(1)
            data[id]['content'] = m.group(2)
        else:
            data[id]['messages'].append(body)

df = pd.DataFrame.from_dict(data, orient="index")
print(len(df[df["topic"] == '']))
print(len(df[df["topic"] != '']))

273417
10


In [8]:
df[df["topic"] != '']['messages'].iloc[0]

["question. I wanted to send out a group email. Just want to make sure; For the respondents, there is no difference for them if I sent the emails individually it's simply sending the same email again and again to a group of people, right?",
 "Pipedrive Support Bot here! Ask me any question about Pipedrive and I'll be happy to assist you 😊 Although I am a bot, I'll do as best as I can to solve your queries. If there's something I don't understand I'll direct you to my team members.",
 'You can send an email to large groups of your leads using our Group Emailing function. It is available in the following places: The List View of the Deals tab The List View of the Contacts tab The List View of the Activities tab The Timeline view of the Contacts tab You can select up to one hundred contacts when sending one email. Once multiple contacts are selected in any of the places listed above, the Send group email button will appear. Once you click this button, you will be prompted to provide the r

In [37]:
test_df = df[:10000]

In [39]:
test_df.head()

Unnamed: 0,topic,content,messages
13672140114,,,"[{{name}} im also talking ot this guy, lll, je..."
13675529033,,,"[Guys, please help the customer when he comes ..."
13675881994,,,[Record a video]
13850823877,,,"[Hello, I'm having trouble connecting Zapier. ..."
13895369856,,,"[View in browser, View in browser]"


In [81]:
import re

In [85]:
"{{name}} is sth".replace("{{.*}}", "")

'{{name}} is sth'

In [84]:
"{{name}} is sth".replace(str(re.match("{{.*}}", "{{name}} is sth")), "")

'{{name}} is sth'

In [93]:
re.sub(r'{{[^}]*}}', '', "{{name}} is {{number}} name").strip(" ")

'is  name'

In [94]:
messages = {}

for index, row in test_df.iterrows():
    msg = re.sub(r'{{[^}]*}}', '', " ".join(row['messages']))
    messages[index] = msg

In [117]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [105]:
messages[13675529033]

'Guys, please help the customer when he comes back'

In [110]:
len(messages[13672140114])

109

In [121]:
#nltk.download('stopwords')
stop_words = stopwords.words('english')

In [139]:
stop_words.extend(["wo", "n't", "'m", "ca", "'ll", "'re", "'ve", "'d"])

In [143]:
#words = word_tokenize(messages[13850823877])
#words = messages[13850823877].split(" ")
lemmatizer = WordNetLemmatizer()

lemmas = []

for key, value in messages.items():
    words = word_tokenize(value)
    lem = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        if len(lemma) > 1:
            lem.append(lemma.lower())
    lem = [w for w in lem if not w in stop_words]
    lemmas.append(lem)
    
#print(lemmas)

In [150]:
lemmas[7]

['need',
 'add',
 'user',
 'pipedrive',
 'account',
 'reaching',
 'pipedrive',
 'support',
 'please',
 'find',
 'information',
 'add',
 'user',
 'checking',
 'see',
 'question',
 'require',
 'help',
 'please',
 'feel',
 'free',
 'reply',
 'open',
 'new',
 'conversation',
 'get',
 'touch',
 'another',
 'support',
 'member',
 'great',
 'day']

In [151]:
from sklearn.feature_extraction.text import CountVectorizer

In [152]:
vectorizer = CountVectorizer()

In [153]:
texts = [" ".join(text) for text in lemmas]

In [158]:
X = vectorizer.fit_transform(texts)

In [163]:
from sklearn.decomposition import LatentDirichletAllocation

In [164]:
lda = LatentDirichletAllocation()

In [165]:
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [173]:
lda.transform(X[:5])

array([[0.00588294, 0.00588363, 0.1753002 , 0.00588639, 0.00588261,
        0.00588392, 0.55635633, 0.22715739, 0.00588237, 0.00588421],
       [0.01429144, 0.01428574, 0.01428746, 0.01429047, 0.01428584,
        0.01428583, 0.87141186, 0.0142899 , 0.01428571, 0.01428575],
       [0.03333527, 0.03333413, 0.03333583, 0.03333419, 0.03333354,
        0.03333423, 0.69998722, 0.03333763, 0.03333333, 0.03333464],
       [0.34859132, 0.00094342, 0.00094344, 0.00094413, 0.00094343,
        0.00094347, 0.51501567, 0.1297883 , 0.00094341, 0.00094342],
       [0.02000186, 0.02000112, 0.02000064, 0.02000008, 0.02000004,
        0.02000004, 0.81999219, 0.0200039 , 0.02      , 0.02000014]])

In [168]:
for i in lemmas[:5]:
    print(i)

['im', 'also', 'talking', 'ot', 'guy', 'lll', 'jejfiabfiabg', 'hm', 'riley', 'young', 'linked', 'se-', 'blank', 'email', 'thread', 'subject']
['guys', 'please', 'help', 'customer', 'come', 'back']
['record', 'video']
['hello', 'trouble', 'connecting', 'zapier', 'drag', 'across', 'email', 'mobile', 'number', 'pipedrive', 'zapier', 'issue', 'time', 'available', 'chat', 'please', 'advise', 'time', 'help', 'desk', 'open', 'australia', 'zapier', 'allow', 'transfer', 'field', 'data', 'first', 'last', 'name', 'phone', 'number', 'email', 'get', 'work', 'terminate', 'pipedrive', 'appears', 'either', 'set', 'person', 'deal', 'reply', 'could', 'find', 'one', 'recent', 'conversation', 'support', 'none', 'related', 'zapier', 'afraid', 'send', 'something', 'proactively', 'without', 'sure', 'leave', 'come', 'back', 'answer', 'automatic', 'work', 'flow', 'template', 'setup', 'please', 'advise', 'change', 'sending', 'email', 'contacting', 'technical', 'question', 'would', 'need', 'check', 'information'

In [172]:
print(lemmas[7])

['need', 'add', 'user', 'pipedrive', 'account', 'reaching', 'pipedrive', 'support', 'please', 'find', 'information', 'add', 'user', 'checking', 'see', 'question', 'require', 'help', 'please', 'feel', 'free', 'reply', 'open', 'new', 'conversation', 'get', 'touch', 'another', 'support', 'member', 'great', 'day']


---

In [1]:
import nltk

In [2]:
#nltk.download('crubadan')
tc = nltk.classify.textcat.TextCat() 

In [4]:
!pip install pycountry

Collecting pycountry
  Downloading https://files.pythonhosted.org/packages/76/73/6f1a412f14f68c273feea29a6ea9b9f1e268177d32e0e69ad6790d306312/pycountry-20.7.3.tar.gz (10.1MB)
Building wheels for collected packages: pycountry
  Building wheel for pycountry (setup.py): started
  Building wheel for pycountry (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Birgit\AppData\Local\pip\Cache\wheels\33\4e\a6\be297e6b83567e537bed9df4a93f8590ec01c1acfbcd405348
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3


In [5]:
import pycountry

In [11]:
naide = df[df["topic"] != '']['messages'].iloc[0]
naide

["question. I wanted to send out a group email. Just want to make sure; For the respondents, there is no difference for them if I sent the emails individually it's simply sending the same email again and again to a group of people, right?",
 "Pipedrive Support Bot here! Ask me any question about Pipedrive and I'll be happy to assist you 😊 Although I am a bot, I'll do as best as I can to solve your queries. If there's something I don't understand I'll direct you to my team members.",
 'You can send an email to large groups of your leads using our Group Emailing function. It is available in the following places: The List View of the Deals tab The List View of the Contacts tab The List View of the Activities tab The Timeline view of the Contacts tab You can select up to one hundred contacts when sending one email. Once multiple contacts are selected in any of the places listed above, the Send group email button will appear. Once you click this button, you will be prompted to provide the r

In [12]:
len(naide)

106

In [21]:
import time
start_time = time.time()

languages = []

for n in naide:
    #print(type(n))
    guess_one = tc.guess_language(n)
    guess_one_name = pycountry.languages.get(alpha_3=guess_one)
    if guess_one_name is None:
        continue
    languages.append(guess_one_name.name)
    
print("--- %s seconds ---" % (time.time() - start_time))

--- 57.480953216552734 seconds ---


In [30]:
start_time2 = time.time()

languages2 = []

for n in naide:
    try:
        lang = detect(n)
        languages2.append(lang)
    except:
        continue
    
print("--- %s seconds ---" % (time.time() - start_time2))

--- 0.600996732711792 seconds ---
