# 1. Intent and language recognition

In [1]:
from utils import *
from dialogue_manager import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/himanshug99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import numpy as np
import pandas as pd
import pickle
import re

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the model."""
    
    tfidf_vectorizer = TfidfVectorizer(token_pattern='(\S+)', min_df=5, max_df=0.9, ngram_range=(1,2))
    tfidf_vectorizer.fit(X_train)
    
    X_train = tfidf_vectorizer.transform(X_train)
    X_test = tfidf_vectorizer.transform(X_test)
    
    # wb: writing bytes mode
    with open(vectorizer_path,'wb') as f:
      pickle.dump(tfidf_vectorizer,f)
    
    return X_train, X_test

In [4]:
sample_size = 200000

dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\t').sample(sample_size, random_state=0)
stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\t').sample(sample_size, random_state=0)

In [5]:
dialogue_df.head()

Unnamed: 0,text,tag
82925,"Donna, you are a muffin.",dialogue
48774,He was here last night till about two o'clock....,dialogue
55394,"All right, then make an appointment with her s...",dialogue
90806,"Hey, what is this-an interview? We're supposed...",dialogue
107758,Yeah. He's just a friend of mine I was trying ...,dialogue


In [6]:
stackoverflow_df.head()

Unnamed: 0,post_id,title,tag
2168983,43837842,Efficient Algorithm to compose valid expressio...,python
1084095,15747223,Why does this basic thread program fail with C...,c_cpp
1049020,15189594,Link to scroll to top not working,javascript
200466,3273927,Is it possible to implement ping on windows ph...,c#
1200249,17684551,GLSL normal mapping issue,c_cpp


In [7]:
dialogue_df['text'] = dialogue_df['text'].apply(text_prepare)
stackoverflow_df['title'] = stackoverflow_df['title'].apply(text_prepare)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])
y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, RESOURCE_PATH['TFIDF_VECTORIZER'])

Train size = 360000, test size = 40000


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [11]:
intent_recognizer=LogisticRegression(penalty='l2',C=10,random_state=0, max_iter=1000)
intent_recognizer.fit(X_train_tfidf,y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.991575


In [13]:
pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))

### Programming language classification 

In [14]:
X = stackoverflow_df['title'].values
y = stackoverflow_df['tag'].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

Train size = 160000, test size = 40000


In [16]:
vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))

X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [17]:
from sklearn.multiclass import OneVsRestClassifier

In [18]:
tag_classifier=OneVsRestClassifier(LogisticRegression(C=5,penalty='l2',random_state=0, max_iter=10000))
tag_classifier.fit(X_train_tfidf,y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=10000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False),
                    n_jobs=None)

In [19]:
# Check test accuracy.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.8007


In [20]:
pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))

# 2. Ranking  questions with embeddings

In [21]:
def prepare_file(in_, out_):
    out = open(out_, 'w')
    for line in open(in_, encoding='utf8'):
        line = line.strip().split('\t')
        new_line = [text_prepare(q) for q in line]
        print(*new_line, sep='\t', file=out)
    out.close()

In [22]:
prepare_file('./data/train.tsv', 'data/prepared_train.tsv')

In [23]:
setup_starspace()
! ./Starspace/starspace train -trainFile "data/prepared_train.tsv" -model starspace_embedding \
-trainMode 3 -adagrad true -ngrams 1 -epoch 5 -dim 100 -similarity cosine -minCount 2 \
-verbose true -fileFormat labelDoc -negSearchLimit 10 -lr 0.05

Arguments: 
lr: 0.05
dim: 100
epoch: 5
maxTrainTime: 8640000
validationPatience: 10
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
batchSize: 5
thread: 10
minCount: 2
minCountLabel: 1
label: __label__
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
useWeight: 0
weightSep: :
Start to initialize starspace model.
Build dict from input file : data/prepared_train.tsv
Read 12M words
Number of words in dictionary:  95058
Number of labels in dictionary: 0
Loading data from file : data/prepared_train.tsv
Total number of examples loaded : 999740
Initialized model weights. Model size :
matrix : 95058 100
Training epoch 0: 0.05 0.01
Epoch: 100.0%  lr: 0.040000  loss: 0.044157  eta: 0h6m  tot: 0h1m31s  (20.0%)  lr: 0.049940  loss: 0.243255  eta: 0h8m  tot: 0h0m1s  (0.2%)2.3%  lr: 0.049840  loss: 0.180741  eta: 0h8m  tot: 0h0m2s  (0.5%)3.7%  lr: 0.049680  loss: 0.1

Epoch: 100.0%  lr: 0.020000  loss: 0.009264  eta: 0h2m  tot: 0h4m22s  (60.0%)8%  lr: 0.029910  loss: 0.008951  eta: 0h4m  tot: 0h2m59s  (40.2%)%  lr: 0.029820  loss: 0.010916  eta: 0h4m  tot: 0h2m59s  (40.3%)%  lr: 0.029730  loss: 0.010024  eta: 0h4m  tot: 0h3m0s  (40.5%)2.5%  lr: 0.029730  loss: 0.009930  eta: 0h4m  tot: 0h3m0s  (40.5%)5.4%  lr: 0.029399  loss: 0.010095  eta: 0h4m  tot: 0h3m2s  (41.1%)5.6%  lr: 0.029389  loss: 0.010124  eta: 0h4m  tot: 0h3m3s  (41.1%)5.8%  lr: 0.029379  loss: 0.010034  eta: 0h4m  tot: 0h3m3s  (41.2%)%  lr: 0.029339  loss: 0.009881  eta: 0h4m  tot: 0h3m3s  (41.2%)7.4%  lr: 0.029199  loss: 0.009557  eta: 0h4m  tot: 0h3m4s  (41.5%)  lr: 0.029179  loss: 0.009477  eta: 0h4m  tot: 0h3m4s  (41.5%)7.7%  lr: 0.029159  loss: 0.009587  eta: 0h4m  tot: 0h3m5s  (41.5%)7.8%  lr: 0.029149  loss: 0.009547  eta: 0h4m  tot: 0h3m5s  (41.6%)9.4%  lr: 0.028839  loss: 0.009512  eta: 0h4m  tot: 0h3m6s  (41.9%)%  lr: 0.028489  loss: 0.009462  eta: 0h4m  tot: 0h3m8s  (42.4%)1

Epoch: 100.0%  lr: 0.000000  loss: 0.006764  eta: <1min   tot: 0h7m15s  (100.0%) lr: 0.009830  loss: 0.004945  eta: 0h1m  tot: 0h5m52s  (80.4%)3.1%  lr: 0.009700  loss: 0.005417  eta: 0h1m  tot: 0h5m53s  (80.6%)4.9%  lr: 0.009570  loss: 0.005940  eta: 0h1m  tot: 0h5m54s  (81.0%)5.1%  lr: 0.009540  loss: 0.005801  eta: 0h1m  tot: 0h5m55s  (81.0%)5.3%  lr: 0.009520  loss: 0.005897  eta: 0h1m  tot: 0h5m55s  (81.1%)6.0%  lr: 0.009449  loss: 0.005709  eta: 0h1m  tot: 0h5m55s  (81.2%)7.0%  lr: 0.009319  loss: 0.005875  eta: 0h1m  tot: 0h5m56s  (81.4%)8.1%  lr: 0.009179  loss: 0.005792  eta: 0h1m  tot: 0h5m57s  (81.6%)10.0%  lr: 0.008999  loss: 0.005828  eta: 0h1m  tot: 0h5m59s  (82.0%)10.2%  lr: 0.008999  loss: 0.005857  eta: 0h1m  tot: 0h5m59s  (82.0%)10.7%  lr: 0.008979  loss: 0.005969  eta: 0h1m  tot: 0h5m59s  (82.1%)11.1%  lr: 0.008959  loss: 0.006031  eta: 0h1m  tot: 0h6m0s  (82.2%)11.4%  lr: 0.008939  loss: 0.005947  eta: 0h1m  tot: 0h6m0s  (82.3%)11.6%  lr: 0.008919  loss: 0.005951  e

In [70]:
starspace_embeddings = dict()
for line in open('word_embeddings.tsv', encoding='utf-8'):
    row = line.strip().split('\t')
    starspace_embeddings[row[0]]=np.array(row[1:],dtype=np.float32)

In [71]:
posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')

In [72]:
posts_df.head()

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#
2,39,Reliable timer in a console application,c#
3,42,Best way to allow plugins for a PHP application,php
4,59,"How do I get a distinct, ordered list of names...",c#


In [73]:
starspace_embeddings['c++'].shape

(100,)

In [74]:
counts_by_tag = posts_df.groupby(['tag']).count()['post_id']

In [75]:
posts_df.groupby(['tag']).count()

Unnamed: 0_level_0,post_id,title
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
c#,394451,394451
c_cpp,281300,281300
java,383456,383456
javascript,375867,375867
php,321752,321752
python,208607,208607
r,36359,36359
ruby,99930,99930
swift,34809,34809
vb,35044,35044


In [76]:
print(counts_by_tag.items)

<bound method Series.iteritems of tag
c#            394451
c_cpp         281300
java          383456
javascript    375867
php           321752
python        208607
r              36359
ruby           99930
swift          34809
vb             35044
Name: post_id, dtype: int64>


### Data Structure maintained for each tag
- tag_post_ids — a list of post_ids with shape (counts_by_tag[tag],). It will be needed to show the title and link to the thread;
- tag_vectors — a matrix with shape (counts_by_tag[tag], embeddings_dim (100) ) where embeddings for each answer are stored.

In [77]:
print(starspace_embeddings['c#'])
starspace_embeddings['c_cpp'] = list(starspace_embeddings['c++'])

[ 0.0164733  -0.0361109  -0.00115856  0.0310805  -0.0185563   0.0281558
 -0.0159352   0.00664095 -0.00725065 -0.0210924   0.00820591  0.0191486
 -0.00609667 -0.0122575  -0.0278109  -0.0981103   0.00487486 -0.00328123
 -0.0487754  -0.0963432   0.0124041  -0.012632    0.00735068 -0.0660114
 -0.0637082  -0.0250055   0.0367987  -0.0319628   0.0223168  -0.00589141
 -0.00610673 -0.0631503   0.0840909   0.00898059  0.0354006   0.029524
 -0.04271     0.0154982  -0.0125562   0.0087481  -0.030185    0.0697643
 -0.091609    0.017696    0.0376007   0.0537015   0.0448754  -0.0405891
 -0.00426119 -0.0879259  -0.0594563  -0.120163    0.00023353  0.00200339
  0.00507103 -0.032078    0.06827    -0.0411659  -0.019957    0.0091475
 -0.0143089  -0.00337774 -0.00582512 -0.06628    -0.0660085  -0.0678917
 -0.00311377 -0.0475585   0.0656223   0.0188918  -0.0702528   0.0592485
 -0.0769118  -0.0645789   0.113124   -0.0230251   0.0508459  -0.0825417
  0.00981109  0.0346618  -0.0324306  -0.0176928   0.0382613  -

In [78]:
print(starspace_embeddings['c#'])
starspace_embeddings['c_cpp'] = list(starspace_embeddings['c++'])

[ 0.0164733  -0.0361109  -0.00115856  0.0310805  -0.0185563   0.0281558
 -0.0159352   0.00664095 -0.00725065 -0.0210924   0.00820591  0.0191486
 -0.00609667 -0.0122575  -0.0278109  -0.0981103   0.00487486 -0.00328123
 -0.0487754  -0.0963432   0.0124041  -0.012632    0.00735068 -0.0660114
 -0.0637082  -0.0250055   0.0367987  -0.0319628   0.0223168  -0.00589141
 -0.00610673 -0.0631503   0.0840909   0.00898059  0.0354006   0.029524
 -0.04271     0.0154982  -0.0125562   0.0087481  -0.030185    0.0697643
 -0.091609    0.017696    0.0376007   0.0537015   0.0448754  -0.0405891
 -0.00426119 -0.0879259  -0.0594563  -0.120163    0.00023353  0.00200339
  0.00507103 -0.032078    0.06827    -0.0411659  -0.019957    0.0091475
 -0.0143089  -0.00337774 -0.00582512 -0.06628    -0.0660085  -0.0678917
 -0.00311377 -0.0475585   0.0656223   0.0188918  -0.0702528   0.0592485
 -0.0769118  -0.0645789   0.113124   -0.0230251   0.0508459  -0.0825417
  0.00981109  0.0346618  -0.0324306  -0.0176928   0.0382613  -

In [79]:
import os
os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)
embeddings_dim = 100
for tag, count in counts_by_tag.items():
    tag_posts = posts_df[posts_df['tag'] == tag]
    
    tag_post_ids = tag_posts['post_id']
    
    tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)
        
    # Dump post ids and vectors to a file.
    filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))
    pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))

### Dialog Manager

In [80]:
from dialogue_manager import DialogueManager
from utils import *

In [81]:
dialogue_manager = DialogueManager(paths=RESOURCE_PATH)

Loading resources...
Training ai.yml: [####################] 100%
Training botprofile.yml: [####################] 100%
Training computers.yml: [####################] 100%
Training conversations.yml: [####################] 100%
Training emotion.yml: [####################] 100%
Training food.yml: [####################] 100%
Training gossip.yml: [####################] 100%
Training greetings.yml: [####################] 100%
Training health.yml: [####################] 100%
Training history.yml: [####################] 100%
Training humor.yml: [####################] 100%
Training literature.yml: [####################] 100%
Training money.yml: [####################] 100%
Training movies.yml: [####################] 100%
Training politics.yml: [####################] 100%
Training psychology.yml: [####################] 100%
Training science.yml: [####################] 100%
Training sports.yml: [####################] 100%
Training trivia.yml: [####################] 100%


In [82]:
print(dialogue_manager.generate_answer('c++ vs java'))

thread_embeddings_by_tags java
thread_embeddings_by_tags/java.pkl
(100,) (383456, 100)
[35138]
(383456,)
['35138', '3614812']
35138 3614812
https://stackoverflow.com/questions/3614812
