In [None]:
def flatten(A):
    """Return a flat list with the leaves of an arbitrarily nested list.

    Only ``list`` instances are descended into; any other element (including
    tuples and strings) is kept as a single item. Order is preserved.
    """
    flat = []
    # Stack of partially consumed iterators; the top one is the list being walked.
    pending = [iter(A)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend first; the parent iterator resumes once this one is exhausted.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import re
import numpy as np
import gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import fasttext

# English stop-word list from NLTK's `stopwords` corpus; preprocess_sentence
# drops every lower-cased token that appears in it.
eng_stopwords = stopwords.words('english')

In [None]:
# from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
# model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
# pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

In [None]:
# fastText classifier used for language identification, loaded from the local model file.
textmodel = fasttext.load_model('fasttext_model.bin')

def predict_lang(sentence):
    """Return the language code fastText predicts for ``sentence``.

    Args:
        sentence: a single piece of text. Newlines are replaced by spaces
            because fastText's ``predict`` rejects input containing ``'\\n'``.

    Returns:
        The predicted label with fastText's ``__label__`` prefix removed
        (e.g. ``'en'``). Codes longer than two letters are returned in full
        rather than being cut to their last two characters. If the label does
        not carry the standard prefix, the last two characters are returned,
        as before.
    """
    single_line = sentence.replace('\n', ' ')
    # predict() on a list returns (labels_per_text, probs_per_text); take the
    # label of the only text submitted.
    label = textmodel.predict([single_line])[0][0][-1]
    prefix = '__label__'
    if label.startswith(prefix):
        return label[len(prefix):]
    return label[-2:]

In [None]:
def clean_csv():
    """Write a copy of the counsel-chat CSV that keeps only English answers.

    Reads ``data/model-counsel-chat.csv``, drops the ``Unnamed: 0`` column,
    classifies the first 512 characters of every answer with ``predict_lang``
    and writes the rows labelled ``'en'`` to
    ``data/model-counsel-chat-eng.csv``.

    Language detection uses the fastText-based ``predict_lang`` defined in
    this notebook; the transformer pipeline (``pipe``) this function used to
    call is no longer defined. Answers that are missing or not strings are
    treated as non-English.
    """
    df = pd.read_csv("data/model-counsel-chat.csv", encoding='UTF8')
    df = df.drop(columns="Unnamed: 0")
    answers_in_english = []
    for a in df['answer']:
        if isinstance(a, str) and a.strip():
            # Collapse all whitespace (including newlines) so the classifier
            # receives a single line of text.
            label = predict_lang(" ".join(a[:512].split()))
        else:
            label = None
        answers_in_english.append(label == 'en')
        if label != 'en':
            print("Text not in english: {}".format(a))
        print("Processed {} texts".format(len(answers_in_english)))

    print(answers_in_english)
    # Boolean mask, one entry per row, in row order.
    df = df[answers_in_english]
    print(df)
    df.to_csv('data/model-counsel-chat-eng.csv', encoding='UTF8')

In [None]:
# Split every answer into sentence-like fragments and keep the English ones,
# each paired with the topic found in the first column of its row.
df = pd.read_csv("data/model-counsel-chat.csv", encoding='latin1')
df = df.drop(columns="Unnamed: 0")

# Zero-width split point between a full stop and a capital letter ("funny.I hate").
dot_caps_regex = re.compile(r'(?<=\.)(?=[A-Z])')
newline_regex = re.compile(r'\n|(\r\n)')
# NOTE(review): 'Â\xa0' is what a UTF-8 non-breaking space looks like when the
# bytes are decoded as latin1 — confirm whether this file should be read as UTF-8.
weird_regex = re.compile(r'Â\xa0')

file_docs = []
# Columns are taken by position (first = topic, second = text) via .iloc;
# integer keys on a string-labelled row Series are deprecated in pandas.
for topic, sentence in zip(df.iloc[:, 0], df.iloc[:, 1]):
    if not isinstance(sentence, str):
        # Missing text (NaN) cannot be tokenized.
        continue

    tokens = []
    for tok in sent_tokenize(sentence):
        tok = newline_regex.sub(" ", weird_regex.sub(" ", tok))
        tokens.extend(dot_caps_regex.split(tok))

    for line in tokens:
        if not line.strip():
            # Nothing to classify in a blank fragment.
            continue
        lang = predict_lang(line)
        if lang == 'en':
            file_docs.append((line, topic))
        else:
            print("Sentence not in english (in {}): {}".format(lang, line))

print("Number of documents:", len(file_docs))

In [None]:
# Gather the kept sentences per topic, preserving their original order ...
grouped_file_docs = {}
for line, topic in file_docs:
    grouped_file_docs.setdefault(topic, []).append(line)

# ... then merge each topic's sentences into one space-separated document.
grouped_file_docs = {
    topic: ' '.join(sentences)
    for topic, sentences in grouped_file_docs.items()
}

In [None]:
# From here on `file_docs` is the topic -> merged-text dict rather than the
# list of (sentence, topic) pairs, and `topics` fixes the topic order used below.
file_docs = grouped_file_docs
topics = list(file_docs)
topics

In [None]:
# Show the topic list (the name was previously misspelled with an accented "ì").
topics

In [None]:
# Display the topic -> merged-text mapping in full.
file_docs

In [None]:
lmtz = WordNetLemmatizer()
stmr = PorterStemmer()

def preprocess_sentence(sentence):
    """Turn raw text into a list of normalized word stems.

    Each token is lower-cased, dropped if it is an English stop word,
    lemmatized and then stemmed; only results made up entirely of the
    letters a-z are kept. Token order is preserved.
    """
    stems = []
    for token in word_tokenize(sentence):
        token = token.lower()
        if token in eng_stopwords:
            continue
        stem = stmr.stem(lmtz.lemmatize(token))
        # Discard punctuation, numbers and anything else that is not purely alphabetic.
        if re.match(r"^[a-z]+$", stem):
            stems.append(stem)
    return stems

In [None]:
# One preprocessed token list per topic, in the dict's (and therefore `topics`') order.
gen_docs = [preprocess_sentence(text) for text in file_docs.values()]

In [None]:
# Number of preprocessed documents (one per topic key in file_docs).
len(gen_docs)

In [None]:
# Build the gensim token -> integer-id mapping from the preprocessed documents
# and print the whole mapping.
dictionary = gensim.corpora.Dictionary(gen_docs)
print(dictionary.token2id)

# Save the mapping to a file in the working directory.
dictionary.save('text_to_context.dictionary')

In [None]:
# Bag-of-words representation of each preprocessed document, in gen_docs order.
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

In [None]:
# Display the full bag-of-words corpus.
corpus

In [None]:
# Fit TF-IDF on the bag-of-words corpus, then print every weighted document
# twice: raw (id, weight) pairs and a readable [token, rounded weight] form.
tf_idf = gensim.models.TfidfModel(corpus)
for doc in tf_idf[corpus]:
    print(doc)
    readable = []
    for term_id, weight in doc:
        readable.append([dictionary[term_id], np.around(weight, decimals=2)])
    print(readable)

In [None]:
# Save the fitted TF-IDF model to a file in the working directory.
tf_idf.save('text_to_context.tfidf')

In [None]:
# Similarity index over the TF-IDF-weighted corpus, with one feature per dictionary entry.
# NOTE(review): 'tfidf_model/' is passed as the first (output prefix) argument —
# presumably that directory must already exist; verify before a fresh run.
sims = gensim.similarities.Similarity('tfidf_model/',tf_idf[corpus], num_features=len(dictionary))
sims.save(fname="text_to_context.model")

In [None]:
# Smoke test: query the in-memory similarity index with a sample sentence.
file2_docs = [r"I've not been feeling well lately. I think it was my family, they always get annoying and demanding. Ugh."]

for line in file2_docs:
    # Apply the same preprocessing and dictionary that built the corpus.
    query_doc = preprocess_sentence(line)
    query_doc_bow = dictionary.doc2bow(query_doc)
    # perform a similarity query against the corpus
    query_doc_tf_idf = tf_idf[query_doc_bow]
    # print(document_number, document_similarity)
    # One similarity score per corpus document, i.e. per entry of `topics`.
    pred = sims[query_doc_tf_idf]
    print('Comparing Result:', pred)
    # Despite its name, `index` holds the best-matching topic label.
    # NOTE(review): if every score is 0, argmax picks position 0 and topics[0] is reported.
    index = topics[np.argmax(pred)]
    print(index)

In [None]:
# Load the similarity index back from the file written by sims.save.
loaded_model = gensim.similarities.Similarity.load('text_to_context.model')

In [None]:
# Smoke test: same query flow as the in-memory check, but scored with the
# index reloaded from disk (`loaded_model`). `dictionary`, `tf_idf` and
# `topics` still come from the current session.
file2_docs = [r"Lol that's funny.I hate it."]

for line in file2_docs:
    query_doc = preprocess_sentence(line)
    query_doc_bow = dictionary.doc2bow(query_doc)
    # perform a similarity query against the corpus
    query_doc_tf_idf = tf_idf[query_doc_bow]
    # print(document_number, document_similarity)
    # One similarity score per corpus document, i.e. per entry of `topics`.
    pred = loaded_model[query_doc_tf_idf]
    print('Comparing Result:', pred)
    # Despite its name, `index` holds the best-matching topic label.
    index = topics[np.argmax(pred)]
    print(index)

In [1]:
from TextToContext import TextToContext

In [2]:
# Instantiate the TextToContext class imported from the TextToContext module (not shown here).
ttc = TextToContext()

In [3]:
# Example query; the recorded output is a list of (topic, score) pairs, highest score first.
ttc.predict("I'm trying to learn blender and unity and Lua and it's really stressful when you can't figure something out. Any suggestion how to overcome it?")

[('anxiety', 0.035926), ('depression', 0.021193981), ('self-esteem', 0.02088036), ('relationship-dissolution', 0.016286997), ('marriage', 0.014989097), ('relationships', 0.014099595), ('intimacy', 0.013279834), ('addiction', 0.011866848), ('stress', 0.011655326), ('parenting', 0.010639798), ('self-harm', 0.010303146), ('trauma', 0.00989696), ('counseling-fundamentals', 0.009223869), ('family-conflict', 0.006978291), ('anger-management', 0.0056708725), ('behavioral-change', 0.0043245116), ('domestic-violence', 0.0041926396), ('spirituality', 0.0034696066)]
