In [2]:
import pandas as pd
import glob
import numpy as np
from rake_nltk import Rake
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

In [3]:
all_stopwords_gensim = STOPWORDS.union(set(['likes', 'explain', 'like', 'it\'s', 'tell']))

In [1]:
def dataframe_reading_multiple_file(path, file_type, column_list,header_type):
    path = path
    dir_files = glob.glob(path + f"/*.{file_type}")
    final_df = pd.DataFrame(columns = column_list)

    for file_name in dir_files:
        print(file_name)
        temp_df = pd.read_csv(file_name, header=header_type)
        final_df = pd.concat([final_df,temp_df])
    return final_df

In [4]:

def extract_keyword(sen):
    keyword_list = create_n_grams(sen, dictionary_list)
    return keyword_list
    

In [5]:
dictionary_directory = "chatbot/dictionaries"
complete_dictionary = dataframe_reading_multiple_file(dictionary_directory, "txt", [], None)
complete_dictionary = complete_dictionary.drop(1, axis=1)
de_dupe_dict = complete_dictionary.drop_duplicates
dictionary_list = complete_dictionary[0].tolist()
print(len(dictionary_list))

chatbot/dictionaries/deep-learning.txt
chatbot/dictionaries/machine-learning.txt
chatbot/dictionaries/question-tags.txt
chatbot/dictionaries/data-mining.txt
chatbot/dictionaries/statistics.txt
chatbot/dictionaries/artificial-intelligence.txt
1550


In [6]:

def create_n_grams(question, dictionary_directory):
  text = question
  tokens = word_tokenize(text.lower())
  tokens_without_sw = [word for word in tokens if not word in all_stopwords_gensim]
  filtered_unigram = [item for item in tokens_without_sw if item in dictionary_list]
  excluded_unigram = [item for item in tokens_without_sw if item not in dictionary_list]
  bigrams = list(ngrams(tokens,2))
  trigrams = list(ngrams(tokens,3))
  fourgrams = list(ngrams(tokens,4))
  fivegrams = list(ngrams(tokens,5))

  filterd_bigram = [" ".join(item) for item in bigrams if " ".join(item) in dictionary_list]
  filterd_trigram = [" ".join(item) for item in trigrams if " ".join(item) in dictionary_list]
  filterd_fourgram = [" ".join(item) for item in fourgrams if " ".join(item) in dictionary_list]

  combined_list = list(set(filtered_unigram + filterd_bigram + filterd_trigram + filterd_fourgram))
  return combined_list

In [7]:
question_path = "./csv-files/Question"
columns_list = ['line', 'question', 'question_type']
cdf = dataframe_reading_multiple_file(question_path, "csv", columns_list, 'infer')

./csv-files/Question/LogisticReg.csv
./csv-files/Question/neural_network.csv
./csv-files/Question/random_forest.csv
./csv-files/Question/PCA_SVD.csv
./csv-files/Question/statistics.csv
./csv-files/Question/naive_bayes.csv
./csv-files/Question/TS_forecasting.csv
./csv-files/Question/LinearReg.csv
./csv-files/Question/MBA_AR.csv
./csv-files/Question/KNN.csv
./csv-files/Question/clustering.csv
./csv-files/Question/code.csv
./csv-files/Question/CF_RS.csv
./csv-files/Question/text_mining.csv
./csv-files/Question/SVM.csv
./csv-files/Question/decision_tree.csv


In [8]:
cdf.head()

Unnamed: 0,line,question,question_type
0,1,When is Simple Linear Regression employed?,logistic regression
1,2,when is simple linear regression employed in m...,logistic regression
2,3,when is simple linear regression employed in d...,logistic regression
3,4,when is simple linear regression employed in R,logistic regression
4,5,when is simple linear regression employed in p...,logistic regression


In [9]:
cdf['keywords'] = cdf.apply(lambda item: extract_keyword(item['question']), axis=1)

In [10]:
cdf.head()

Unnamed: 0,line,question,question_type,keywords
0,1,When is Simple Linear Regression employed?,logistic regression,"[linear regression, regression]"
1,2,when is simple linear regression employed in m...,logistic regression,"[linear regression, regression, machine learni..."
2,3,when is simple linear regression employed in d...,logistic regression,"[linear regression, data science, regression, ..."
3,4,when is simple linear regression employed in R,logistic regression,"[linear regression, regression, r]"
4,5,when is simple linear regression employed in p...,logistic regression,"[linear regression, regression, python]"


In [1]:
# !pip install deepcorrect

Collecting deepcorrect
  Downloading deepcorrect-1.0.5-py2.py3-none-any.whl (14 kB)
Collecting txt2txt==1.0.9
  Downloading txt2txt-1.0.9-py2.py3-none-any.whl (5.4 kB)
Installing collected packages: txt2txt, deepcorrect
Successfully installed deepcorrect-1.0.5 txt2txt-1.0.9


In [10]:
from deepcorrect import DeepCorrect
corrector = DeepCorrect('pre-trained-models/deeppunct_params_en', 'pre-trained-models/deeppunct_checkpoint_tatoeba_cornell')

Loading the params file
Input encoding {'o': 2, '{': 3, '.': 4, 'J': 5, '0': 6, '1': 7, '<': 8, 'B': 9, 'd': 10, '£': 11, 'e': 12, '6': 13, '!': 14, 'O': 15, 'M': 16, 'X': 17, 'f': 18, 't': 19, 'C': 20, 'V': 21, 'z': 22, 'K': 23, '\\': 24, '9': 25, 'P': 26, 'S': 27, '/': 28, '₹': 29, 'F': 30, 'G': 31, '=': 32, '8': 33, ')': 34, '+': 35, ']': 36, 'U': 37, "'": 38, '"': 39, 'g': 40, 'N': 41, 'r': 42, 'u': 43, '&': 44, '$': 45, 'x': 46, '%': 47, ':': 48, '@': 49, '^': 50, 'I': 51, 'L': 52, 'Z': 53, 'h': 54, 'W': 55, 'A': 56, 'v': 57, '?': 58, '2': 59, '~': 60, 's': 61, 'T': 62, 'R': 63, ',': 64, '|': 65, '4': 66, '>': 67, 'y': 68, '(': 69, '[': 70, 'k': 71, 'H': 72, 'l': 73, 'j': 74, '7': 75, 'n': 76, 'i': 77, 'D': 78, 'Q': 79, ' ': 80, 'm': 81, 'Y': 82, '*': 83, '}': 84, '#': 85, 'p': 86, 'q': 87, '5': 88, 'c': 89, '`': 90, 'a': 91, 'b': 92, 'w': 93, '3': 94, 'E': 95, ';': 96, '-': 97}
Input decoding {2: 'o', 3: '{', 4: '.', 5: 'J', 6: '0', 7: '1', 8: '<', 9: 'B', 10: 'd', 11: '£', 12: '

In [14]:
y = corrector.correct("what is nrmal distribtion")
print(y)

[{'sequence': 'What is nrmal distribtion.', 'prob': 0.5575786205068979}]
