In [None]:
import sklearn
import pandas as pd
import numpy as np
import nltk
import sklearn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import matplotlib
from sklearn.model_selection import cross_val_score
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.model_selection import train_test_split
import re
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import pickle
from wordcloud import WordCloud

In [None]:
# Load the hand-labelled texts; the path is relative to the notebook's working directory.
full_reviewed_data_drop = pd.read_csv('../../input_data/full_datasets/fully_hand_labelled_texts.csv')

In [None]:
# Replace every "_x000D_" + newline sequence in the text column with a tab
# (presumably an escaped carriage return left by a spreadsheet export — TODO confirm).
full_reviewed_data_drop.text = full_reviewed_data_drop.text.str.replace("_x000D_\n", "\t")


In [None]:
# Semicolon-delimited file; its 'index' column is used below to select the held-out rows.
indices = pd.read_csv('medrobertanl-haga_smoking_predictions.csv', delimiter=';')

In [None]:
# Held-out rows: the rows of the loaded frame whose index label is listed in indices['index'].
test_set = full_reviewed_data_drop.loc[indices['index']]

In [None]:
# Training rows: every row whose index label is not part of test_set.
train_set = full_reviewed_data_drop.loc[~full_reviewed_data_drop.index.isin(test_set.index)]

In [None]:
train_set

In [None]:
# Work on a copy so the loaded frame itself is not modified by later cells.
Corpus = full_reviewed_data_drop.copy()

In [None]:
# Cast the text column to str.
Corpus['text'] = Corpus['text'].astype(str)

# Roken

## Experiment 1

In [None]:
# Keep only the text and the "Roken" column, exposing the latter under the name "label".
rook_corpus = Corpus[["text", "Roken"]].rename(columns={"Roken":"label"})

In [None]:
rook_corpus

In [None]:
# Tabs become spaces, exact duplicate (text, label) rows are removed, and the
# text column is cast to str -- expressed as one chain instead of in-place edits.
rook_corpus = (
    rook_corpus
    .assign(text=rook_corpus['text'].str.replace('\t',' '))
    .drop_duplicates()
    .astype({'text': str})
)

In [None]:
# Lower-case the text and stem it word by word with the Dutch Snowball stemmer.
# SnowballStemmer.stem() handles a single word, so it is applied to every \w+ run
# instead of to the whole document (which would only alter the end of the string).
stemmer = SnowballStemmer("dutch")
rook_corpus['text'] = (
    rook_corpus['text']
    .str.lower()
    .str.replace(r"\w+", lambda match: stemmer.stem(match.group(0)), regex=True)
)
# Rows labelled '--' are discarded.
rook_corpus = rook_corpus.drop(rook_corpus[rook_corpus.label == '--'].index)
# Snapshot of the preprocessed corpus; a later cell copies it back into rook_corpus.
rook_corpus_backup = rook_corpus.copy()

In [None]:
# Words listed here are kept out of the reduced stopword list below, i.e. they
# stay in the text when that reduced list is used for filtering.
stopwords_filter = ['niet', 'niets', 'geen', 'zonder']

# One stopword per line; trailing whitespace (including the newline) is stripped.
with open('../../helping_files/stopwords.txt') as file:
    full_stopwords = list(map(str.rstrip, file))

# Same list, minus the words named in stopwords_filter.
filtered_stopwords = [word for word in full_stopwords if word not in stopwords_filter]

In [None]:
# Held-out rows: the preprocessed rows whose index label is listed in indices['index'].
test_set = rook_corpus.loc[indices['index']]

In [None]:
# Training rows: every preprocessed row that is not part of test_set.
train_set = rook_corpus.loc[~rook_corpus.index.isin(test_set.index)]

In [None]:
train_set

In [None]:
# Search space for RandomizedSearchCV over the 'tfidf' and 'clf' pipeline steps.
# Lists are sampled uniformly; sp_randFloat() is scipy.stats.uniform with its
# default bounds (0 to 1).
parameter_grid = {
    # The 'log' alias is left out: it names the same loss as 'log_loss', was
    # deprecated in scikit-learn 1.1 and removed in 1.3, so listing both
    # double-weights that loss and makes candidates fail on newer releases.
    'clf__loss':              ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'clf__penalty':           ['l2', 'l1'],
    # NOTE(review): l1_ratio is documented as used only with penalty='elasticnet',
    # which is not in the penalty list above -- confirm whether it should be added.
    'clf__l1_ratio':          sp_randFloat(),
    'clf__fit_intercept':     [True, False],
    # 10, 20, ..., 100
    'clf__max_iter':          [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
    'clf__tol':               sp_randFloat(),
    'clf__shuffle':           [True, False],
    'clf__epsilon':           sp_randFloat(),
    'clf__learning_rate':     ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__eta0':              sp_randFloat(),
    'clf__power_t':           sp_randFloat(),
    'clf__class_weight':      ['balanced', None],
    'clf__warm_start':        [True, False],
    'clf__average':           [True, False],
    'tfidf__max_df':          [0.90, 0.95],
    'tfidf__min_df':          [3, 5],
}

In [None]:
# Ngram 2 Stopwords kept
# TF-IDF over word bigrams feeding a linear SGD classifier; hyper-parameters are
# drawn from parameter_grid and ranked by macro-averaged F1.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(2,2), lowercase=True, token_pattern=r"(?u)\b\w\w+\b|\+|\-")),
    # random_state fixes the classifier's own randomness (shuffling, early-stopping
    # validation split) so that a re-run reproduces the same model.
    ('clf', SGDClassifier(early_stopping=True, n_iter_no_change=5, validation_fraction = 0.25, verbose=3, random_state=500)),
])
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=500, verbose=3, scoring='f1_macro')
random_search.fit(train_set['text'], train_set['label'])
predicted_nb = random_search.predict(test_set['text'])
# Plain accuracy on the held-out rows.
print(np.mean(predicted_nb == test_set['label']))

# Pin the label order to the classes seen during training so that the matrix,
# its tick labels and the report stay aligned even when a class does not occur
# in the held-out rows.
class_labels = random_search.classes_
cm = confusion_matrix(test_set['label'], predicted_nb, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)
disp.plot()

plt.show()
print(metrics.classification_report(test_set['label'], predicted_nb,
    labels=class_labels, target_names=class_labels))

In [None]:
from pickle import dump

# Persist the fitted search object. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('smoking_ml_model.pkl', 'wb') as model_file:
    dump(random_search, model_file)

In [None]:
# Store the predictions as a new 'predicted' column next to the true labels.
test_set['predicted'] = predicted_nb

In [None]:
test_set

In [None]:
# Write the held-out rows, including the 'predicted' column, to CSV.
test_set.to_csv('sgd_smoking_results.csv')

## Experiment 2

In [None]:
# Start again from the snapshot taken after preprocessing. That snapshot is
# already lower-cased and stemmed, so the text is not stemmed a second time
# (stemming is not guaranteed to be idempotent).
rook_corpus = rook_corpus_backup.copy()
# Merge 'Niets gevonden' and 'Voormalige gebruiker' into 'Geen gebruiker'.
rook_corpus['label'] = rook_corpus['label'].str.replace('Niets gevonden','Geen gebruiker')
rook_corpus['label'] = rook_corpus['label'].str.replace('Voormalige gebruiker','Geen gebruiker')
# Discard rows labelled '--' or 'Onbekend'.
rook_corpus = rook_corpus.drop(rook_corpus[rook_corpus.label == '--'].index)
rook_corpus = rook_corpus.drop(rook_corpus[rook_corpus.label == 'Onbekend'].index)
# NOTE: this replaces the earlier snapshot with the merged-label version.
rook_corpus_backup = rook_corpus.copy()

In [None]:
# Hold out the rows listed in indices['index']; train on all remaining rows.
test_set = rook_corpus.loc[indices['index']]
is_held_out = rook_corpus.index.isin(test_set.index)
train_set = rook_corpus.loc[~is_held_out]

In [None]:
# Search space for RandomizedSearchCV over the 'tfidf' and 'clf' pipeline steps.
# Lists are sampled uniformly; sp_randFloat() is scipy.stats.uniform with its
# default bounds (0 to 1).
parameter_grid = {
    # The 'log' alias is left out: it names the same loss as 'log_loss', was
    # deprecated in scikit-learn 1.1 and removed in 1.3, so listing both
    # double-weights that loss and makes candidates fail on newer releases.
    'clf__loss':              ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'clf__penalty':           ['l2', 'l1'],
    # NOTE(review): l1_ratio is documented as used only with penalty='elasticnet',
    # which is not in the penalty list above -- confirm whether it should be added.
    'clf__l1_ratio':          sp_randFloat(),
    'clf__fit_intercept':     [True, False],
    # 10, 20, ..., 100
    'clf__max_iter':          [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
    'clf__tol':               sp_randFloat(),
    'clf__shuffle':           [True, False],
    'clf__epsilon':           sp_randFloat(),
    'clf__learning_rate':     ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__eta0':              sp_randFloat(),
    'clf__power_t':           sp_randFloat(),
    'clf__class_weight':      ['balanced', None],
    'clf__warm_start':        [True, False],
    'clf__average':           [True, False],
    'tfidf__max_df':          [0.90, 0.95],
    'tfidf__min_df':          [3, 5],
}

In [None]:
# Ngram 2 Stopwords kept
# TF-IDF over word uni- and bigrams feeding a linear SGD classifier; hyper-parameters
# are drawn from parameter_grid and ranked by macro-averaged F1.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), lowercase=True, token_pattern=r"(?u)\b\w\w+\b|\+|\-")),
    # random_state fixes the classifier's own randomness (shuffling, early-stopping
    # validation split) so that a re-run reproduces the same model.
    ('clf', SGDClassifier(early_stopping=True, n_iter_no_change=5, validation_fraction = 0.25, verbose=3, random_state=500)),
])
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=500, verbose=3, scoring='f1_macro')
# Train and evaluate on the train_set / test_set built for this experiment.
# X_train / y_train / X_test / y_test are not defined at this point of a
# top-to-bottom run, and any leftover values would belong to another corpus.
random_search.fit(train_set['text'], train_set['label'])
predicted_nb = random_search.predict(test_set['text'])
# Plain accuracy on the held-out rows.
print(np.mean(predicted_nb == test_set['label']))

# Pin the label order to the classes seen during training so that the matrix,
# its tick labels and the report stay aligned even when a class does not occur
# in the held-out rows.
class_labels = random_search.classes_
cm = confusion_matrix(test_set['label'], predicted_nb, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)
disp.plot()

plt.show()
print(metrics.classification_report(test_set['label'], predicted_nb,
    labels=class_labels, target_names=class_labels))

# Alcohol

## Experiment 1

In [None]:
# Keep only the text and the "Alcohol" column, exposing the latter under the name "label".
alcohol_corpus = Corpus[["text", "Alcohol"]].rename(columns={"Alcohol":"label"})

In [None]:
alcohol_corpus

In [None]:
# Tabs become spaces, exact duplicate (text, label) rows are removed, and the
# text column is cast to str -- expressed as one chain instead of in-place edits.
alcohol_corpus = (
    alcohol_corpus
    .assign(text=alcohol_corpus['text'].str.replace('\t',' '))
    .drop_duplicates()
    .astype({'text': str})
)

In [None]:
# Lower-case the text and stem it word by word with the Dutch Snowball stemmer.
# SnowballStemmer.stem() handles a single word, so it is applied to every \w+ run
# instead of to the whole document (which would only alter the end of the string).
stemmer = SnowballStemmer("dutch")
alcohol_corpus['text'] = (
    alcohol_corpus['text']
    .str.lower()
    .str.replace(r"\w+", lambda match: stemmer.stem(match.group(0)), regex=True)
)
# Rows labelled '--' are discarded.
alcohol_corpus = alcohol_corpus.drop(alcohol_corpus[alcohol_corpus.label == '--'].index)
# Snapshot of the preprocessed corpus; a later cell copies it back into alcohol_corpus.
alcohol_corpus_backup = alcohol_corpus.copy()

In [None]:
# Stopword filters over whitespace-separated tokens. Membership is tested against
# sets so each lookup is constant-time instead of a scan of the stopword list.
_full_stopword_set = set(full_stopwords)
_filtered_stopword_set = set(filtered_stopwords)


def no_stopwords(x):
    """Return x with every token that appears in full_stopwords removed."""
    return ' '.join(item for item in x.split() if item not in _full_stopword_set)


def less_stopwords(x):
    """Return x with every token that appears in filtered_stopwords removed."""
    return ' '.join(item for item in x.split() if item not in _filtered_stopword_set)


# NOTE(review): this runs after the stemming cell, so tokens are compared in
# whatever form that cell left them -- confirm the stopword file matches that form.
alcohol_corpus["text"] = alcohol_corpus["text"].apply(less_stopwords)
# Random 80/20 split of the filtered corpus with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(alcohol_corpus['text'], alcohol_corpus['label'], test_size=0.2, random_state=50)

In [None]:
# Hold out the rows listed in indices['index']; train on all remaining rows.
test_set = alcohol_corpus.loc[indices['index']]
is_held_out = alcohol_corpus.index.isin(test_set.index)
train_set = alcohol_corpus.loc[~is_held_out]

In [None]:
# Ngram 2 Less stopwords
# TF-IDF over word uni- and bigrams feeding a linear SGD classifier; hyper-parameters
# are drawn from parameter_grid and ranked by macro-averaged F1.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), lowercase=True, token_pattern=r"(?u)\b\w\w+\b|\+|\-")),
    # random_state fixes the classifier's own randomness (shuffling, early-stopping
    # validation split) so that a re-run reproduces the same model.
    ('clf', SGDClassifier(early_stopping=True, n_iter_no_change=5, validation_fraction = 0.25, verbose=3, random_state=500)),
])
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=500, verbose=3, scoring='f1_macro')
# This experiment trains and evaluates on the X_/y_ variables, not on train_set / test_set.
random_search.fit(X_train, y_train)
predicted_nb = random_search.predict(X_test)
# Plain accuracy on the evaluation split.
print(np.mean(predicted_nb == y_test))

# Pin the label order to the classes seen during training so that the matrix,
# its tick labels and the report stay aligned even when a class does not occur
# in the evaluation split.
class_labels = random_search.classes_
cm = confusion_matrix(y_test, predicted_nb, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)
disp.plot()

plt.show()
print(metrics.classification_report(y_test, predicted_nb,
    labels=class_labels, target_names=class_labels))

## Experiment 2

In [None]:
# Start again from the snapshot taken after preprocessing. That snapshot is
# already lower-cased and stemmed, so the text is not stemmed a second time
# (stemming is not guaranteed to be idempotent).
alcohol_corpus = alcohol_corpus_backup.copy()
# Merge 'Niets gevonden' and 'Voormalige gebruiker' into 'Geen gebruiker'.
alcohol_corpus['label'] = alcohol_corpus['label'].str.replace('Niets gevonden','Geen gebruiker')
alcohol_corpus['label'] = alcohol_corpus['label'].str.replace('Voormalige gebruiker','Geen gebruiker')
# Discard rows labelled '--' or 'Onbekend'.
alcohol_corpus = alcohol_corpus.drop(alcohol_corpus[alcohol_corpus.label == '--'].index)
alcohol_corpus = alcohol_corpus.drop(alcohol_corpus[alcohol_corpus.label == 'Onbekend'].index)
# NOTE: this replaces the earlier snapshot with the merged-label version.
alcohol_corpus_backup = alcohol_corpus.copy()

In [None]:
# Hold out the rows listed in indices['index']; train on all remaining rows.
test_set = alcohol_corpus.loc[indices['index']]
is_held_out = alcohol_corpus.index.isin(test_set.index)
train_set = alcohol_corpus.loc[~is_held_out]

In [None]:
# Search space for RandomizedSearchCV over the 'tfidf' and 'clf' pipeline steps.
# Lists are sampled uniformly; sp_randFloat() is scipy.stats.uniform with its
# default bounds (0 to 1).
parameter_grid = {
    # The 'log' alias is left out: it names the same loss as 'log_loss', was
    # deprecated in scikit-learn 1.1 and removed in 1.3, so listing both
    # double-weights that loss and makes candidates fail on newer releases.
    'clf__loss':              ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'clf__penalty':           ['l2', 'l1'],
    # NOTE(review): l1_ratio is documented as used only with penalty='elasticnet',
    # which is not in the penalty list above -- confirm whether it should be added.
    'clf__l1_ratio':          sp_randFloat(),
    'clf__fit_intercept':     [True, False],
    # 10, 20, ..., 100
    'clf__max_iter':          [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
    'clf__tol':               sp_randFloat(),
    'clf__shuffle':           [True, False],
    'clf__epsilon':           sp_randFloat(),
    'clf__learning_rate':     ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__eta0':              sp_randFloat(),
    'clf__power_t':           sp_randFloat(),
    'clf__class_weight':      ['balanced', None],
    'clf__warm_start':        [True, False],
    'clf__average':           [True, False],
    'tfidf__max_df':          [0.90, 0.95],
    'tfidf__min_df':          [3, 5],
}

In [None]:
# Ngram 3 Stopwords kept
# TF-IDF over word uni-, bi- and trigrams feeding a linear SGD classifier;
# hyper-parameters are drawn from parameter_grid and ranked by macro-averaged F1.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,3), lowercase=True, token_pattern=r"(?u)\b\w\w+\b|\+|\-")),
    # random_state fixes the classifier's own randomness (shuffling, early-stopping
    # validation split) so that a re-run reproduces the same model.
    ('clf', SGDClassifier(early_stopping=True, n_iter_no_change=5, validation_fraction = 0.25, verbose=3, random_state=500)),
])
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=500, verbose=3, scoring='f1_macro')
random_search.fit(train_set['text'], train_set['label'])
predicted_nb = random_search.predict(test_set['text'])
# Plain accuracy on the held-out rows.
print(np.mean(predicted_nb == test_set['label']))

# Pin the label order to the classes seen during training so that the matrix,
# its tick labels and the report stay aligned even when a class does not occur
# in the held-out rows.
class_labels = random_search.classes_
cm = confusion_matrix(test_set['label'], predicted_nb, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)
disp.plot()

plt.show()
print(metrics.classification_report(test_set['label'], predicted_nb,
    labels=class_labels, target_names=class_labels))

# Drugs

## Experiment 1

In [None]:
# Keep only the text and the "Drugs" column, exposing the latter under the name "label".
drugs_corpus = Corpus[["text", "Drugs"]].rename(columns={"Drugs":"label"})

In [None]:
drugs_corpus

In [None]:
# Tabs become spaces, exact duplicate (text, label) rows are removed, and the
# text column is cast to str -- expressed as one chain instead of in-place edits.
drugs_corpus = (
    drugs_corpus
    .assign(text=drugs_corpus['text'].str.replace('\t',' '))
    .drop_duplicates()
    .astype({'text': str})
)

In [None]:
# Lower-case the text and stem it word by word with the Dutch Snowball stemmer.
# SnowballStemmer.stem() handles a single word, so it is applied to every \w+ run
# instead of to the whole document (which would only alter the end of the string).
stemmer = SnowballStemmer("dutch")
drugs_corpus['text'] = (
    drugs_corpus['text']
    .str.lower()
    .str.replace(r"\w+", lambda match: stemmer.stem(match.group(0)), regex=True)
)
# Rows labelled '--' are discarded.
drugs_corpus = drugs_corpus.drop(drugs_corpus[drugs_corpus.label == '--'].index)
# Snapshot of the preprocessed corpus; a later cell copies it back into drugs_corpus.
drugs_corpus_backup = drugs_corpus.copy()

In [None]:
# Hold out the rows listed in indices['index']; train on all remaining rows.
test_set = drugs_corpus.loc[indices['index']]
is_held_out = drugs_corpus.index.isin(test_set.index)
train_set = drugs_corpus.loc[~is_held_out]

In [None]:
# Search space for RandomizedSearchCV over the 'tfidf' and 'clf' pipeline steps.
# Lists are sampled uniformly; sp_randFloat() is scipy.stats.uniform with its
# default bounds (0 to 1).
parameter_grid = {
    # The 'log' alias is left out: it names the same loss as 'log_loss', was
    # deprecated in scikit-learn 1.1 and removed in 1.3, so listing both
    # double-weights that loss and makes candidates fail on newer releases.
    'clf__loss':              ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'clf__penalty':           ['l2', 'l1'],
    # NOTE(review): l1_ratio is documented as used only with penalty='elasticnet',
    # which is not in the penalty list above -- confirm whether it should be added.
    'clf__l1_ratio':          sp_randFloat(),
    'clf__fit_intercept':     [True, False],
    # 10, 20, ..., 100
    'clf__max_iter':          [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
    'clf__tol':               sp_randFloat(),
    'clf__shuffle':           [True, False],
    'clf__epsilon':           sp_randFloat(),
    'clf__learning_rate':     ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__eta0':              sp_randFloat(),
    'clf__power_t':           sp_randFloat(),
    'clf__class_weight':      ['balanced', None],
    'clf__warm_start':        [True, False],
    'clf__average':           [True, False],
    'tfidf__max_df':          [0.90, 0.95],
    'tfidf__min_df':          [3, 5],
}

In [None]:
# Ngram 2 Stopwords kept
# TF-IDF over word uni- and bigrams feeding a linear SGD classifier; hyper-parameters
# are drawn from parameter_grid and ranked by macro-averaged F1.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), lowercase=True, token_pattern=r"(?u)\b\w\w+\b|\+|\-")),
    # random_state fixes the classifier's own randomness (shuffling, early-stopping
    # validation split) so that a re-run reproduces the same model.
    ('clf', SGDClassifier(early_stopping=True, n_iter_no_change=5, validation_fraction = 0.25, verbose=3, random_state=500)),
])
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=500, verbose=3, scoring='f1_macro')
random_search.fit(train_set['text'], train_set['label'])
predicted_nb = random_search.predict(test_set['text'])
# Plain accuracy on the held-out rows.
print(np.mean(predicted_nb == test_set['label']))

# Pin the label order to the classes seen during training so that the matrix,
# its tick labels and the report stay aligned even when a class does not occur
# in the held-out rows.
class_labels = random_search.classes_
cm = confusion_matrix(test_set['label'], predicted_nb, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)
disp.plot()

plt.show()
print(metrics.classification_report(test_set['label'], predicted_nb,
    labels=class_labels, target_names=class_labels))

## Experiment 2

In [None]:
# Start again from the snapshot taken after preprocessing. That snapshot is
# already lower-cased and stemmed, so the text is not stemmed a second time
# (stemming is not guaranteed to be idempotent).
drugs_corpus = drugs_corpus_backup.copy()
# Merge 'Niets gevonden' and 'Voormalige gebruiker' into 'Geen gebruiker'.
drugs_corpus['label'] = drugs_corpus['label'].str.replace('Niets gevonden','Geen gebruiker')
drugs_corpus['label'] = drugs_corpus['label'].str.replace('Voormalige gebruiker','Geen gebruiker')
# Discard rows labelled '--' or 'Onbekend'.
drugs_corpus = drugs_corpus.drop(drugs_corpus[drugs_corpus.label == '--'].index)
drugs_corpus = drugs_corpus.drop(drugs_corpus[drugs_corpus.label == 'Onbekend'].index)
# NOTE: this replaces the earlier snapshot with the merged-label version.
drugs_corpus_backup = drugs_corpus.copy()

In [None]:
# Seeded 60/40 split, stratified on the label column.
drug_labels = drugs_corpus['label']
X_train, X_test, y_train, y_test = train_test_split(
    drugs_corpus['text'],
    drug_labels,
    test_size=0.4,
    random_state=50,
    stratify=drug_labels,
)

In [None]:
# Search space for RandomizedSearchCV over the 'tfidf' and 'clf' pipeline steps.
# Lists are sampled uniformly; sp_randFloat() is scipy.stats.uniform with its
# default bounds (0 to 1).
parameter_grid = {
    # The 'log' alias is left out: it names the same loss as 'log_loss', was
    # deprecated in scikit-learn 1.1 and removed in 1.3, so listing both
    # double-weights that loss and makes candidates fail on newer releases.
    'clf__loss':              ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'clf__penalty':           ['l2', 'l1'],
    # NOTE(review): l1_ratio is documented as used only with penalty='elasticnet',
    # which is not in the penalty list above -- confirm whether it should be added.
    'clf__l1_ratio':          sp_randFloat(),
    'clf__fit_intercept':     [True, False],
    # 10, 20, ..., 100
    'clf__max_iter':          [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
    'clf__tol':               sp_randFloat(),
    'clf__shuffle':           [True, False],
    'clf__epsilon':           sp_randFloat(),
    'clf__learning_rate':     ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__eta0':              sp_randFloat(),
    'clf__power_t':           sp_randFloat(),
    'clf__class_weight':      ['balanced', None],
    'clf__warm_start':        [True, False],
    'clf__average':           [True, False],
    'tfidf__max_df':          [0.90, 0.95],
    'tfidf__min_df':          [3, 5],
}

In [None]:
# Ngram 2 Stopwords kept
# TF-IDF over word uni- and bigrams feeding a linear SGD classifier; hyper-parameters
# are drawn from parameter_grid and ranked by macro-averaged F1.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), lowercase=True, token_pattern=r"(?u)\b\w\w+\b|\+|\-")),
    # random_state fixes the classifier's own randomness (shuffling, early-stopping
    # validation split) so that a re-run reproduces the same model.
    ('clf', SGDClassifier(early_stopping=True, n_iter_no_change=5, validation_fraction = 0.25, verbose=3, random_state=500)),
])
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=500, verbose=3, scoring='f1_macro')
# This experiment trains and evaluates on the X_/y_ variables, not on train_set / test_set.
random_search.fit(X_train, y_train)
predicted_nb = random_search.predict(X_test)
# Plain accuracy on the evaluation split.
print(np.mean(predicted_nb == y_test))

# Pin the label order to the classes seen during training so that the matrix,
# its tick labels and the report stay aligned even when a class does not occur
# in the evaluation split.
class_labels = random_search.classes_
cm = confusion_matrix(y_test, predicted_nb, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)
disp.plot()

plt.show()
print(metrics.classification_report(y_test, predicted_nb,
    labels=class_labels, target_names=class_labels))