In [1]:
import pandas as pd
from nlpia.data.loaders import get_data


In [4]:
# Read trainset.csv into a DataFrame, taking column names from its first row.
# NOTE(review): the path is absolute and machine-specific, and `df` is assigned
# again by the later get_data('ubuntu_dialog') cell — confirm which source the
# rest of the notebook is meant to use.
df = pd.read_csv('/data/ubuntu-dialog/trainset.csv', header=0)

In [5]:
# Preview the first rows of the frame that was just loaded.
df.head()

Unnamed: 0,Context,Utterance,Label
0,i think we could import the old comments via r...,basically each xfree86 upload will NOT force u...,1.0
1,I'm not suggesting all - only the ones you mod...,oh? oops. __eou__,1.0
2,afternoon all __eou__ not entirely related to ...,we'll have a BOF about this __eou__ so you're ...,1.0
3,interesting __eou__ grub-install worked with /...,i fully endorse this suggestion </quimby> __eo...,1.0
4,and because Python gives Mark a woody __eou__ ...,(i thought someone was going to make a joke ab...,1.0


In [None]:

# Load the 'ubuntu_dialog' dataset through nlpia's get_data loader, rebinding
# `df` (this replaces the frame read from the CSV file earlier).
df = get_data('ubuntu_dialog')
# Print the shape, then preview the first rows.
print(df.shape)
df.head()


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer with default settings and display its configuration.
tfidf = TfidfVectorizer()
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
# Fit the vectorizer on a two-document toy corpus and vectorize it in one step;
# the result is displayed as a sparse matrix.
vectors = tfidf.fit_transform(['hello world', 'another time'])
vectors

<2x4 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [6]:
# Densify the sparse result so the individual TF-IDF weights can be inspected.
# NOTE(review): `vectors` is rebound to its own converted value, so re-running
# this cell calls .todense() on the already-converted object rather than on the
# sparse matrix — the cell is not idempotent.
vectors = vectors.todense()
vectors

matrix([[ 0.        ,  0.70710678,  0.        ,  0.70710678],
        [ 0.70710678,  0.        ,  0.70710678,  0.        ]])

In [7]:
# Show the token -> column-index mapping the vectorizer learned during fitting.
tfidf.vocabulary_

{'another': 0, 'hello': 1, 'time': 2, 'world': 3}

In [9]:
# Rebind `tfidf` to a fresh, unfitted vectorizer with vocabulary pruning:
#   min_df=8            -> ignore terms that appear in fewer than 8 documents
#   max_df=.3           -> ignore terms that appear in more than 30% of documents
#   max_features=100000 -> upper bound on the vocabulary size
tfidf = TfidfVectorizer(min_df=8, max_df=.3, max_features=100000)
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=100000, min_df=8,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
# Learn the vocabulary and IDF weights from the first 11 columns of `df`,
# concatenated end-to-end into a single Series of documents.
# NOTE(review): 11 is a magic number — presumably the number of text columns
# in the dataset; confirm against df.columns. Indexing df.columns[i] raises
# IndexError if the frame has fewer than 11 columns.
tfidf.fit(pd.concat([df[df.columns[i]] for i in range(11)]))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=100000, min_df=8,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [57]:
# Print ten terms from the fitted vocabulary, then the vocabulary size.
print(list(tfidf.vocabulary_)[:10])
print(len(tfidf.vocabulary_))

['anyone', 'knows', 'why', 'my', 'stock', 'oneiric', 'exports', 'env', 'var', 'username']
12358


In [19]:
# Dense TF-IDF matrix with one row per entry of df.Context; vectorizing and
# densifying happen in a single expression so X is only ever bound to the
# final dense form.
X = tfidf.transform(df.Context).todense()

In [20]:
# Dense TF-IDF matrix for the 'Ground Truth Utterance' column, built with the
# same fitted vectorizer used for the contexts.
y = tfidf.transform(df['Ground Truth Utterance']).todense()

In [50]:
from sklearn.metrics.pairwise import cosine_distances

In [72]:
def get_statement(s='Hi'):
    """Find the stored dialog context that is closest to a query string.

    The query is vectorized with the fitted notebook-level ``tfidf`` and
    compared, by cosine distance, against every row of the notebook-level
    context matrix ``X``.

    Args:
        s: Query text.

    Returns:
        tuple: ``(context, best_i)`` where ``best_i`` is the positional index
        (a plain ``int``) of the row of ``X`` with the smallest cosine distance
        to the query, and ``context`` is ``df.Context.iloc[best_i]``. When
        several rows are equally close, the lowest index wins.
    """
    # Keep the query as a 2-D (1, n_features) array, the shape the pairwise
    # helper expects.
    q = tfidf.transform([s]).toarray()
    # Score the query against all rows in one pairwise call instead of calling
    # cosine_distances once per row from a Python loop.
    distances = cosine_distances(q, X)
    # Smallest distance == most similar; argmin returns the first minimum, so
    # ties resolve to the lowest row index.
    best_i = int(distances.argmin())
    return df.Context.iloc[best_i], best_i

In [73]:
# Try the lookup with a short greeting; returns (matched context, row index).
get_statement('Hello Ubuntu')

('hello __eou__ __eot__ hello, can i help you? __eou__ __eot__ ', 1022)

In [74]:
# Try the lookup with a longer, more specific query.
get_statement("Ubuntu doesn't work on my Macbook Pro!")

("good morning! :-) __eou__ i have a question .. would run ubuntu run better on an apple macbook pro, than on any other laptop? __eou__ because i had a job interview with a company who wants to buy macbook pro's to run ubuntu on .. but then i was wondering: what's the advantage of having a macbook pro then? __eou__ __eot__ that seems like a huge waste of money __eou__ __eot__ ",
 18430)

In [75]:
def get_reply(s='Hi'):
    """Return the recorded reply that goes with the best-matching context.

    The search itself is delegated to get_statement(); the positional index it
    returns is used to read the 'Ground Truth Utterance' column of ``df``.
    """
    replies = df['Ground Truth Utterance']
    best_i = get_statement(s)[1]
    return replies.iloc[best_i]
    

In [76]:
# Fetch the recorded reply for the context nearest to this query.
get_reply('anyone knows why my stock oneiric exports env')

'nice thanks! __eou__'

In [77]:
# Same reply lookup with a different query.
get_reply('i set up my hd such that i have to type a pass')

'so you dont know, ok, anyone else? __eou__ you are like, yah my mouse doesnt work, reinstall your os lolol what a joke __eou__'

In [92]:
from sklearn.decomposition import PCA

# Reduce the TF-IDF vectors to 200 principal components. random_state makes
# the fit reproducible across reruns when PCA selects a randomized SVD solver.
pca = PCA(n_components=200, random_state=0)
# Fit on the dense context matrix X that is already in memory; it is the same
# data as tfidf.transform(df.Context).todense(), so there is no need to
# vectorize and densify the contexts a second time.
pca = pca.fit(X)
# NOTE: despite the `_100d` suffix, these arrays have n_components (200)
# columns; the names are kept because later cells refer to them.
X_100d = pca.transform(X)
y_100d = pca.transform(y)

In [93]:
def get_statement_100d(s='Hi'):
    """Find the closest stored dialog context in the PCA-reduced space.

    The query is vectorized with the fitted notebook-level ``tfidf``, projected
    with the fitted notebook-level ``pca``, and compared by cosine distance
    against every row of the reduced context matrix ``X_100d``.

    Args:
        s: Query text.

    Returns:
        tuple: ``(context, best_i)`` where ``best_i`` is the positional index
        (a plain ``int``) of the row of ``X_100d`` with the smallest cosine
        distance to the projected query, and ``context`` is
        ``df.Context.iloc[best_i]``. When several rows are equally close, the
        lowest index wins.
    """
    # pca.transform returns a 2-D (1, n_components) array, which is already
    # the shape the pairwise helper expects — no pd.np wrapping is needed
    # (the pandas.np alias no longer exists in current pandas releases).
    q = pca.transform(tfidf.transform([s]).toarray())
    # Score the query against all reduced rows in one pairwise call.
    distances = cosine_distances(q, X_100d)
    # Smallest distance == most similar; argmin returns the first minimum, so
    # ties resolve to the lowest row index.
    best_i = int(distances.argmin())
    return df.Context.iloc[best_i], best_i

In [94]:
# Run the same Macbook query used with get_statement(), this time against the
# PCA-reduced vectors, so the two results can be compared.
get_statement_100d("Ubuntu doesn't work on my Macbook Pro!")

("how can you make pulseaudio stay dead?  /etc/pulse/client.conf autospawn=no doesn't work killall pulseaudio doesn't work pkill doesnt work pulseaudio --kill doesnt work service pulseaudio stop doesnt work __eou__ __eot__ you could just uninstall it __eou__ __eot__ ",
 17103)

In [95]:
# Probe the reduced-space search with an unusually worded query.
# NOTE(review): this appears to be a paraphrase of the query in the next cell —
# confirm the intent.
get_statement_100d("me just installed another serial port copier but don't know")

("I use cinnamon __eou__ But i don't know :P __eou__ __eot__ fair enough. I know Kazam has big issues in Gnome3 and cinnamon. __eou__ __eot__ ",
 10644)

In [96]:
# Probe the reduced-space search with a more naturally worded query
# (appears to be a rewording of the previous cell's query).
get_statement_100d("I just added a second usb printer but not sure")

('obi Its not working without USB stick. Without USB stick it asks to select a boot medium. With USB stick, it boots correctly. __eou__ obi Still the fdisk shows its not bootable. __eou__ __eot__ That display is irrelevant since 15 years :) __eou__ __eot__ ',
 7989)