In [None]:
import sklearn
import pandas as pd
import numpy as np
import nltk
import sklearn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import matplotlib
from sklearn.model_selection import cross_val_score
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.model_selection import train_test_split
import re
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import pickle
from wordcloud import WordCloud

In [None]:
# Load the labelled texts; the path is resolved relative to the kernel's working directory.
full_labels = pd.read_csv(
    filepath_or_buffer='../input_data/full_datasets/fully_labelled_texts.csv',
)

In [None]:
# Inspect the frame that was just read from the CSV.
full_labels

In [None]:
import pickle


def _load_pickled_object(pickle_path):
    """Unpickle and return the object stored at ``pickle_path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (the previous ``pickle.load(open(...))`` form never closed it).

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use it
    on files that come from a trusted source.
    """
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# One pickled model per substance; each is used later via decision_function / predict.
smoking_model = _load_pickled_object('../data_exploration/best_models/best_smoking_ex1.pkl')
drinking_model = _load_pickled_object('../data_exploration/best_models/best_drinking_ex1.pkl')
drugs_model = _load_pickled_object('../data_exploration/best_models/best_drugs_ex1.pkl')

In [None]:
# Module-level Dutch Snowball stemmer shared by the preprocessing helpers below.
stemmer = SnowballStemmer(language="dutch")
# Seed NumPy's global RNG so later unseeded random draws repeat on a fresh top-to-bottom run.
np.random.seed(seed=500)

def create_preprocess_corpus(input_corpus, content_name, label_name, smoking=False, second_experiment=False, skip_rows=[]):
    """Build a cleaned (text, label, processed_text) corpus from a labelled frame.

    Steps, in order:
      1. Keep the 'Unnamed: 0', ``content_name`` and ``label_name`` columns and
         rename the latter two to 'text' and 'label'.
      2. Create 'processed_text' from 'text' with tabs replaced by spaces, then
         drop fully duplicated rows.
      3. Cast 'processed_text' to str, lower-case it and pass each complete
         text to ``stemmer.stem``.
      4. If ``second_experiment`` is set, collapse labels: for smoking,
         'Niets gevonden' and 'Rookte' become 'Rookt niet'; otherwise
         'Niets gevonden' becomes 'Nee'.
      5. Remove rows labelled '--' or 'Onbekend' and rows whose processed text
         contains 'vertrouwelijk'.

    Args:
        input_corpus: DataFrame holding 'Unnamed: 0', ``content_name`` and
            ``label_name`` columns. It is not modified.
        content_name: Name of the column with the raw text.
        label_name: Name of the column with the label.
        smoking: Selects the smoking relabelling; only read when
            ``second_experiment`` is True.
        second_experiment: Enables the label collapsing of step 4.
        skip_rows: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(corpus, corpus_backup)`` where ``corpus_backup`` is an
        independent copy of ``corpus``.
    """
    corpus = (
        input_corpus[['Unnamed: 0', content_name, label_name]]
        .rename({content_name: 'text', label_name: 'label'}, axis=1)
        .assign(processed_text=lambda frame: frame['text'].str.replace('\t', ' ', regex=False))
        .drop_duplicates()
    )

    lowered_text = corpus['processed_text'].astype(str).str.lower()
    # NOTE(review): stem() is handed the whole text, not individual tokens. This is
    # kept as-is because the pickled models were presumably trained on text
    # preprocessed the same way -- confirm before changing.
    corpus = corpus.assign(processed_text=[stemmer.stem(text) for text in lowered_text])

    if second_experiment:
        nothing_found_target = 'Rookt niet' if smoking else 'Nee'
        labels = corpus['label'].str.replace('Niets gevonden', nothing_found_target, regex=False)
        if smoking:
            labels = labels.str.replace('Rookte', 'Rookt niet', regex=False)
        corpus = corpus.assign(label=labels)

    # Filter with boolean masks rather than drop(<index labels>): dropping by label
    # would also remove unrelated rows that share an index label with a discarded row.
    has_excluded_label = corpus['label'].isin(['--', 'Onbekend'])
    is_confidential = corpus['processed_text'].str.contains('vertrouwelijk', regex=False)
    corpus = corpus[~has_excluded_label & ~is_confidential]

    return corpus, corpus.copy()

def add_processed_text(input_corpus, text_column):
    """Return a de-duplicated copy of ``input_corpus`` with a 'processed_text' column.

    'processed_text' is ``text_column`` with tabs replaced by spaces, cast to
    str, lower-cased and then passed text-by-text through ``stemmer.stem``.
    Fully duplicated rows are dropped after the tab replacement. The input
    frame itself is left untouched.
    """
    deduplicated = input_corpus.assign(
        processed_text=input_corpus[text_column].str.replace('\t', ' ')
    ).drop_duplicates()
    lowered_text = deduplicated['processed_text'].astype(str).str.lower()
    stemmed_text = [stemmer.stem(entry) for entry in lowered_text]
    return deduplicated.assign(processed_text=stemmed_text)

In [None]:
# Build one preprocessed corpus per label column; the backup copy returned second is discarded.
smoking_corpus, _ = create_preprocess_corpus(
    full_labels,
    content_name='content',
    label_name='roken_answer_label',
    smoking=True,
)
drinking_corpus, _ = create_preprocess_corpus(
    full_labels,
    content_name='content',
    label_name='alcohol_answer_label',
)
drugs_corpus, _ = create_preprocess_corpus(
    full_labels,
    content_name='content',
    label_name='drugs_answer_label',
)

In [None]:
# Inspect the preprocessed smoking corpus.
smoking_corpus

In [None]:



# Inspect the preprocessed drinking corpus.
drinking_corpus

In [None]:
# Keep only the held-out 20% of each corpus; the training parts are discarded.
# NOTE(review): presumably test_size/random_state mirror the split used when the
# pickled models were trained, so these texts are unseen by the models -- confirm.
split_options = dict(test_size=0.2, random_state=50)

_, smoking_x_test, _, smoking_y_test = train_test_split(
    smoking_corpus['processed_text'], smoking_corpus['label'], **split_options
)
_, drinking_x_test, _, drinking_y_test = train_test_split(
    drinking_corpus['processed_text'], drinking_corpus['label'], **split_options
)
_, drugs_x_test, _, drugs_y_test = train_test_split(
    drugs_corpus['processed_text'], drugs_corpus['label'], **split_options
)

# Smoking

In [None]:
# Score the held-out smoking texts with the smoking model.
predicted_nb_df = smoking_model.decision_function(smoking_x_test)
predicted_nb_labels = smoking_model.predict(smoking_x_test)

x_test_probs = pd.DataFrame(smoking_x_test, columns=['processed_text'])
# Column i of the decision_function output is stored under the i-th name below.
# NOTE(review): the position-to-class mapping is assumed -- verify against smoking_model.classes_.
smoking_score_columns = ['nothing_found_eval', 'current_user_eval', 'non_user_eval', 'previous_user_eval']
for position, column_name in enumerate(smoking_score_columns):
    x_test_probs[column_name] = [scores[position] for scores in predicted_nb_df]
x_test_probs['predicted_label'] = predicted_nb_labels
x_test_probs['roken_answer_label'] = smoking_y_test

# Bring the raw text back in by joining on the processed text.
x_test_probs = x_test_probs.merge(
    smoking_corpus[['text', 'processed_text']],
    on='processed_text',
    how='left',
)

In [None]:
# Rows NOT predicted 'Rookt', ordered by current_user_eval (highest first).
not_predicted_current = x_test_probs['predicted_label'] != 'Rookt'
sorted_probs_current = x_test_probs[not_predicted_current].sort_values(by=['current_user_eval'], ascending=False)

# Rows NOT predicted 'Rookt niet' and not among the top-100 texts above, ordered by
# non_user_eval. Both conditions go into one mask so no mask has to be re-aligned
# to an already filtered frame.
in_top_current = x_test_probs['processed_text'].isin(sorted_probs_current.head(100)['processed_text'])
not_predicted_non = x_test_probs['predicted_label'] != 'Rookt niet'
sorted_probs_non = x_test_probs[~in_top_current & not_predicted_non].sort_values(by=['non_user_eval'], ascending=False)

smoking_columns = ['text', 'processed_text', 'roken_answer_label']
texts_to_label = sorted_probs_current[smoking_columns].head(100)
new_non_rows = sorted_probs_non[~sorted_probs_non['processed_text'].isin(texts_to_label['processed_text'])]
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
texts_to_label = pd.concat([texts_to_label, new_non_rows[smoking_columns].head(100)])

# Attach the alcohol and drugs labels recorded for the same processed text.
texts_to_label = texts_to_label.merge(
    drinking_corpus.rename(columns={'label': 'alcohol_answer_label'})[['processed_text', 'alcohol_answer_label']],
    on='processed_text',
    how='left',
)
texts_to_label = texts_to_label.merge(
    drugs_corpus.rename(columns={'label': 'drugs_answer_label'})[['processed_text', 'drugs_answer_label']],
    on='processed_text',
    how='left',
)
texts_to_label = texts_to_label.drop_duplicates()

In [None]:
# Inspect the texts selected from the smoking model's scores.
texts_to_label

# Drinking

In [None]:
# Score the held-out drinking texts with the drinking model.
predicted_nb_df = drinking_model.decision_function(drinking_x_test)
predicted_nb = drinking_model.predict(drinking_x_test)

x_test_probs = pd.DataFrame(drinking_x_test, columns=['processed_text'])
# Column i of the decision_function output is stored under the i-th name below.
# NOTE(review): the position-to-class mapping is assumed -- verify against drinking_model.classes_.
drinking_score_columns = ['current_user_eval', 'non_user_eval', 'nothing_found_eval']
for position, column_name in enumerate(drinking_score_columns):
    x_test_probs[column_name] = [scores[position] for scores in predicted_nb_df]
x_test_probs['predicted_label'] = predicted_nb
x_test_probs['alcohol_answer_label'] = drinking_y_test

# Bring the raw text back in by joining on the processed text.
x_test_probs = x_test_probs.merge(
    drinking_corpus[['text', 'processed_text']],
    on='processed_text',
    how='left',
)

In [None]:
# Rows NOT predicted 'Ja', ordered by current_user_eval (highest first).
not_predicted_current = x_test_probs['predicted_label'] != 'Ja'
sorted_probs_current = x_test_probs[not_predicted_current].sort_values(by=['current_user_eval'], ascending=False)

# Rows NOT predicted 'Nee' and not among the top-100 texts above, ordered by
# non_user_eval. Both conditions go into one mask so no mask has to be re-aligned
# to an already filtered frame.
in_top_current = x_test_probs['processed_text'].isin(sorted_probs_current.head(100)['processed_text'])
not_predicted_non = x_test_probs['predicted_label'] != 'Nee'
sorted_probs_non = x_test_probs[~in_top_current & not_predicted_non].sort_values(by=['non_user_eval'], ascending=False)

alcohol_columns = ['text', 'processed_text', 'alcohol_answer_label']

# Skip texts that the smoking selection (texts_to_label) already contains.
current_is_new = ~sorted_probs_current['processed_text'].isin(texts_to_label['processed_text'])
texts_to_label_ac = sorted_probs_current[current_is_new][alcohol_columns].drop_duplicates().head(100)

non_is_new = (
    ~sorted_probs_non['processed_text'].isin(texts_to_label_ac['processed_text'])
    & ~sorted_probs_non['processed_text'].isin(texts_to_label['processed_text'])
)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
texts_to_label_ac = pd.concat(
    [texts_to_label_ac, sorted_probs_non[non_is_new][alcohol_columns].drop_duplicates().head(100)]
)

# Attach the smoking and drugs labels recorded for the same processed text.
texts_to_label_ac = texts_to_label_ac.merge(
    smoking_corpus.rename(columns={'label': 'roken_answer_label'})[['processed_text', 'roken_answer_label']],
    on='processed_text',
    how='left',
)
texts_to_label_ac = texts_to_label_ac.merge(
    drugs_corpus.rename(columns={'label': 'drugs_answer_label'})[['processed_text', 'drugs_answer_label']],
    on='processed_text',
    how='left',
)

texts_to_label_merged = pd.concat([texts_to_label, texts_to_label_ac], ignore_index=True)
texts_to_label_merged = texts_to_label_merged.drop_duplicates()

In [None]:
# Inspect the combined smoking + drinking selection.
texts_to_label_merged

# Drugs

In [None]:
# Score the held-out drugs texts with the drugs model.
predicted_nb_df = drugs_model.decision_function(drugs_x_test)
predicted_nb = drugs_model.predict(drugs_x_test)

x_test_probs = pd.DataFrame(drugs_x_test, columns=['processed_text'])
# Column i of the decision_function output is stored under the i-th name below.
# NOTE(review): the position-to-class mapping is assumed -- verify against drugs_model.classes_.
drugs_score_columns = ['current_user_eval', 'non_user_eval', 'nothing_found_eval']
for position, column_name in enumerate(drugs_score_columns):
    x_test_probs[column_name] = [scores[position] for scores in predicted_nb_df]
x_test_probs['predicted_label'] = predicted_nb
x_test_probs['drugs_answer_label'] = drugs_y_test

# Bring the raw text back in by joining on the processed text.
x_test_probs = x_test_probs.merge(
    drugs_corpus[['text', 'processed_text']],
    on='processed_text',
    how='left',
)

In [None]:
# Rows NOT predicted 'Ja', ordered by current_user_eval (highest first).
not_predicted_current = x_test_probs['predicted_label'] != 'Ja'
sorted_probs_current = x_test_probs[not_predicted_current].sort_values(by=['current_user_eval'], ascending=False)

# Rows NOT predicted 'Nee' and not among the top-100 texts above, ordered by
# non_user_eval. Both conditions go into one mask so no mask has to be re-aligned
# to an already filtered frame.
in_top_current = x_test_probs['processed_text'].isin(sorted_probs_current.head(100)['processed_text'])
not_predicted_non = x_test_probs['predicted_label'] != 'Nee'
sorted_probs_non = x_test_probs[~in_top_current & not_predicted_non].sort_values(by=['non_user_eval'], ascending=False)

drugs_columns = ['text', 'processed_text', 'drugs_answer_label']

# Skip texts already selected for smoking or drinking (texts_to_label_merged).
# NOTE(review): unlike the drinking cell, no drop_duplicates() is applied before
# head(100) here -- confirm whether that difference is intended.
current_is_new = ~sorted_probs_current['processed_text'].isin(texts_to_label_merged['processed_text'])
texts_to_label_dr = sorted_probs_current[drugs_columns][current_is_new].head(100)

non_is_new = (
    ~sorted_probs_non['processed_text'].isin(texts_to_label_dr['processed_text'])
    & ~sorted_probs_non['processed_text'].isin(texts_to_label_merged['processed_text'])
)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
texts_to_label_dr = pd.concat(
    [texts_to_label_dr, sorted_probs_non[non_is_new][drugs_columns].drop_duplicates().head(100)]
)

# Attach the smoking and alcohol labels recorded for the same processed text.
texts_to_label_dr = texts_to_label_dr.merge(
    smoking_corpus.rename(columns={'label': 'roken_answer_label'})[['processed_text', 'roken_answer_label']],
    on='processed_text',
    how='left',
)
texts_to_label_dr = texts_to_label_dr.merge(
    drinking_corpus.rename(columns={'label': 'alcohol_answer_label'})[['processed_text', 'alcohol_answer_label']],
    on='processed_text',
    how='left',
)

texts_to_label_merged_merged = pd.concat([texts_to_label_merged, texts_to_label_dr], ignore_index=True)
texts_to_label_merged_merged = texts_to_label_merged_merged.drop_duplicates()

In [None]:
# Inspect the combined smoking + drinking + drugs selection.
texts_to_label_merged_merged

In [None]:
# Rebind texts_to_label to the full three-substance selection (de-duplicated once more).
# NOTE: this overwrites the smoking-only frame that the drinking cell reads under the
# same name, so re-running that cell after this one gives it a different input.
texts_to_label = texts_to_label_merged_merged.drop_duplicates()

In [None]:
# Inspect the final model-driven selection.
texts_to_label

In [None]:
# Show the full labelled frame again for comparison with the selection above.
full_labels

In [None]:
# Keep only the labelled rows whose raw content is not already part of the selection.
content_already_selected = full_labels['content'].isin(texts_to_label['text'])
full_labels_no_dup = full_labels[~content_already_selected]

In [None]:
# Inspect the rows that remain after removing the already selected texts.
full_labels_no_dup

In [None]:
# Top up the selection with randomly ordered, not-yet-selected rows so the
# labelling batch reaches 1000 rows in total.
# The count is clamped at 0: head() with a negative argument returns everything
# except the last n rows, which would add rows when the selection already exceeds 1000.
n_random_needed = max(1000 - len(texts_to_label.index), 0)
random_samples = full_labels_no_dup.sample(frac=1).head(n_random_needed)

In [None]:
# Give the random rows the same 'processed_text' column as the model-driven selection.
full_samples = add_processed_text(input_corpus=random_samples, text_column='content')

In [None]:
# Inspect the random top-up rows.
full_samples

In [None]:
# Columns of texts_to_label, kept as a note (the bare column names were not valid
# Python and raised a SyntaxError that stopped "Run All" at this cell):
# text, processed_text, roken_answer_label, alcohol_answer_label, drugs_answer_label

In [None]:
# Stack the model-driven selection and the random top-up rows into one frame with
# identical columns. pd.concat replaces DataFrame.append, which no longer exists
# in pandas >= 2.0; the original row index labels are kept, as before.
output_columns = ['text', 'roken_answer_label', 'alcohol_answer_label', 'drugs_answer_label']
full_texts_to_label = pd.concat([
    texts_to_label[output_columns],
    full_samples.rename(columns={'content': 'text'})[output_columns],
])

In [None]:
# Inspect the complete batch of texts to be labelled.
full_texts_to_label

In [None]:
# Write the batch to a CSV in the kernel's working directory (the index is written too).
full_texts_to_label.to_csv(
    path_or_buf='full_texts_to_label_full4.csv',
)

In [None]:
# Spot check: shuffle the not-yet-selected rows and show the first one.
full_labels_no_dup.sample(frac=1).head(1)

In [None]:
# Lift pandas' display limits: no column-width truncation and no row cap.
# Every frame rendered after this cell is shown in full.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [None]:
# Spot check again, now with the display limits lifted so the row's text is shown in full.
full_labels_no_dup.sample(frac=1).head(1)