In [1]:
# Load data preprocessing libs
import pandas as pd
import numpy as np

# Load vectorizer and similarity measure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Read the data

In [2]:
# Read data and drop examples that has no answer
df = pd.read_csv("aws_faq.csv")
df.dropna(inplace=True)
df

Unnamed: 0,Question,Answer
0,What is Amazon Elastic Compute Cloud (Amazon E...,Amazon Elastic Compute Cloud (Amazon EC2) is a...
1,What can I do with Amazon EC2?,Just as Amazon Simple Storage Service (Amazon ...
2,How can I get started with Amazon EC2?,"To sign up for Amazon EC2, click the “Sign up ..."
3,Why am I asked to verify my phone number when ...,Amazon EC2 registration requires you to have a...
4,What can developers now do that they could not...,"Until now, small developers did not have the c..."
5,How do I run systems in the Amazon EC2 environ...,Once you have set up your account and select o...
6,What is the difference between using the local...,When you launch your Amazon EC2 instances you ...
7,How quickly will systems be running?,It typically takes less than 10 minutes from t...
8,How do I load and store my systems with Amazon...,Amazon EC2 allows you to set up and configure ...
9,How do I access my systems?,The RunInstances call that initiates execution...


## Train the vectorizer

In [3]:
vectorizer = TfidfVectorizer()
vectorizer.fit(np.concatenate((df.Question, df.Answer)))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Vectorize questions

In [4]:
Question_vectors = vectorizer.transform(df.Question)


## Chat with the user

In [5]:
print("You can start chatting with me nowhi.")
while True:
    # Read user input
    input_question = input()

    # Locate the closest question
    input_question_vector = vectorizer.transform([input_question])

    # Compute similarities
    similarities = cosine_similarity(input_question_vector, Question_vectors)

    # Find the closest question
    closest = np.argmax(similarities, axis=1)

    # Print the correct answer
    print("BOT: " + df.Answer.iloc[closest].values[0])

You can start chatting with me nowhi.

BOT: Amazon Elastic Compute Cloud (Amazon EC2) is a web service that provides resizable compute capacity in 
the cloud
. It is designed to make web-scale computing easier for developers.
Good Morning
BOT: Hello, good morning.
are you the one
BOT: I’m fine thanks.
May I see Gdi？
BOT: The AWS Management Console makes a detailed billing report available which shows Spot instance start and termination/stop times for all instances. Customers can check the billing report against historical Spot prices via the API to verify that the Spot price they were billed is correct.
How quickly will systems be running
BOT: It typically takes less than 10 minutes from the issue of the RunInstances call to the point where all requested instances begin their boot sequences. This time depends on a number of factors including: the size of your AMI, the number of instances you are launching, and how recently you have launched that AMI. Images launched for the first time 

KeyboardInterrupt: 