In [1]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.corpus import wordnet 
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk import pos_tag 
from sklearn.metrics import pairwise_distances 
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer 

In [2]:
data=pd.read_csv('dialogs.txt',sep='\t',names=['question','answer'])
print(f'Размер: {len(data)}')
data.head(10)


Размер: 3725


Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
5,i've been good. i'm in school right now.,what school do you go to?
6,what school do you go to?,i go to pcc.
7,i go to pcc.,do you like it there?
8,do you like it there?,it's okay. it's a really big campus.
9,it's okay. it's a really big campus.,good luck with school.


In [3]:
data.isna().sum()

question    0
answer      0
dtype: int64

In [4]:
# nltk.download('stopwords')

In [5]:
def predobrabotka(text):
    text=str(text).lower() 
    tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
    tokens = tokenizer.tokenize(text)
    lematizer=nltk.wordnet.WordNetLemmatizer() 
    tag = nltk.pos_tag(tokens)[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tag = tag_dict.get(tag, wordnet.NOUN)
    lem_words = [lematizer.lemmatize(w, tag) for w in tokens]
    # stop = stopwords.words('english')
    # cleaned_tokens = [word for word in lem_words if not  word in stop]

    return " ".join(lem_words)

In [6]:
predobrabotka("hi, how are you doing?")

'hi how be you do'

In [7]:
data['lemmatized_question']=data['question'].apply(predobrabotka) 

In [8]:
data.head()

Unnamed: 0,question,answer,lemmatized_question
0,"hi, how are you doing?",i'm fine. how about yourself?,hi how be you do
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,i m fine how about yourself
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,i m pretty good thanks for asking
3,no problem. so how have you been?,i've been great. what about you?,no problem so how have you been
4,i've been great. what about you?,i've been good. i'm in school right now.,i ve been great what about you


In [9]:
vector = TfidfVectorizer()
tfidf_transformer = TfidfTransformer()

tfidf=vector.fit_transform(data['lemmatized_question']).toarray() 
data_tfidf=pd.DataFrame(tfidf,columns=vector.get_feature_names_out()) 
data_tfidf.head()
#нет смысла удалять цифры, ведь в dataset они часть ответов к конткретным вопорсам

Unnamed: 0,00,000,01,10,100,101,11,12,120,13,...,york,you,young,your,yours,yourself,yuck,yy,zip,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.22396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.605395,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.204695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.218266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
def chat():
    lemma=predobrabotka(text) # calling the function to perform text normalization
    tf=vector.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(data_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # getting index value 
    return data['answer'].loc[index_value]

In [11]:
chat("Do you like the famous actor Jonny Depp?")

'like real people with real problems.'

In [12]:
chat("Do you like magic?")

'oh, yes, i really like it.'

In [13]:
chat('Do you believe in a wizard word?')

'of course.'

In [14]:
chat('what language do you speak?')

"well, my english isn't perfect."

In [15]:
chat('what was the reason for your call?')

'i want to do something tomorrow with you.'

In [16]:
chat('Ohhh. What did you have in mind?')

'i was thinking about seeing a movie.'

In [17]:
chat('Which movie do you want to watch with me?')

'i have to say, my favorite movie is superbad.'

In [18]:
chat("It's doesn't matter, so let's wathch it tommorow")

"it's too much like other flags."

In [19]:
chat('do yoou thinl I am an abuser')

'i hope so. i have to go to the bathroom.'

In [20]:
chat('can you help me please')

'why me?'

In [21]:
chat('because tou are my friend')

'of course. i have lots of best friends.'

In [22]:
chat('is not it a reason')

"i just really don't want to go."

In [23]:
chat('fine')

"i'm having a party this friday."

In [24]:
chat("Why didn't you tell me?")

"i would've thought that somebody would have told you."

In [25]:
chat('but this celebration is your initiative')

'an andy warhol drawing.'

In [26]:
chat("You're insufferable.")

'well, listen to me. you need to buy a ticket now.'

In [27]:
chat('where')

'what do you mean?'

In [28]:
chat('what')

'people talk about current events.'

In [29]:
chat('which events?')

'deal or no deal'

In [30]:
chat('with what?')

"i'm taking a math course in school."

In [31]:
chat('good luck')

'thanks.'