In [None]:
# !pip install joblib
# !pip install threadpoolctl
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, Perceptron, PassiveAggressiveClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, StackingClassifier, BaggingClassifier 
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB, ComplementNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading, SelfTrainingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.ensemble import IsolationForest

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

from sklearn.multiclass import OneVsRestClassifier

from collections import Counter
from scipy.sparse import csr_matrix
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os
import re

# Install and bind the URL extractor; `find_urls` is the bound `find_urls`
# method of a single shared URLExtract instance.
!pip install urlextract
import urlextract
find_urls = urlextract.URLExtract().find_urls

# Install Empath and import its analyzer class.
!pip install empath
from empath import Empath

# NLTK resources: the stop-word corpus and the 'punkt' tokenizer data.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# `stem` is the bound stem method of one PorterStemmer instance.
stem = nltk.PorterStemmer().stem
from nltk.corpus import stopwords
# English stop words as a set.
# NOTE(review): `stop_words` is not referenced by the other visible cells,
# and the name `stopwords` is rebound to a plain list in a later cell.
stop_words = set(stopwords.words('english'))

# Absolute local data directory.
# NOTE(review): `path` is reassigned to a Drive location in a later cell.
path = '/Users/hashem/Python/TDI/capstone/data/'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
class get_text(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts the ``text`` attribute of its input."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X.text`` as-is."""
        texts = X.text
        return texts

In [None]:
# to merge two-part texts (text + selftext in submissions)
def merge_text(t1, t2):
    """Join the two text parts of a submission into one string.

    A part is used only if it is a ``str`` and is not the literal
    placeholder ``'[removed]'``; anything else (``None``, NaN, numbers)
    is ignored.

    Parameters
    ----------
    t1, t2 : object
        First and second text part (e.g. title and selftext).

    Returns
    -------
    str
        ``t1 + ' ' + t2`` when both parts are usable, the single usable
        part when only one is, and ``''`` when neither is.
    """
    def _usable(part):
        # The check must be on the value itself: `type(part)` is a class
        # object and is never an instance of `str`.
        return isinstance(part, str) and part != '[removed]'

    return ' '.join(part for part in (t1, t2) if _usable(part))

In [None]:
# make texts lower-case
class lower_case(BaseEstimator, TransformerMixin):
    """Lower-case the ``text`` column, carrying ``author`` along."""

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new two-column frame (``author``, ``text``) with every
        text passed through ``str.lower``."""
        lowered = X['text'].map(str.lower)
        table = np.c_[X['author'], lowered]
        return pd.DataFrame(table, columns=['author', 'text'])

In [None]:
# replace all urls with 'URL'
class replace_urls(BaseEstimator, TransformerMixin):
    """Substitute every URL found in ``text`` with the token ``'URL'``."""

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    @staticmethod
    def _mask_urls(text):
        """Replace each distinct URL in ``text``, longest URLs first."""
        for url in sorted(set(find_urls(text)), key=len, reverse=True):
            text = text.replace(url, 'URL')
        return text

    def transform(self, X, y=None):
        """Return a new (``author``, ``text``) frame with URLs masked."""
        masked = [self._mask_urls(text) for text in X['text']]
        table = np.c_[X['author'], masked]
        return pd.DataFrame(table, columns=['author', 'text'])

In [None]:
# replace all numbers with 'NUM'
class replace_numbers(BaseEstimator, TransformerMixin):
    """Substitute every numeric literal in ``text`` with the token ``'NUM'``."""

    # Digits, optional fractional part, optional exponent.
    _NUMBER = re.compile(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?')

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new (``author``, ``text``) frame with numbers masked."""
        masked = [self._NUMBER.sub('NUM', text) for text in X['text']]
        table = np.c_[X['author'], masked]
        return pd.DataFrame(table, columns=['author', 'text'])

In [None]:
# limit size of the text
class limit_size(BaseEstimator, TransformerMixin):
    """Keep only the rows whose ``text`` is shorter than ``size`` characters.

    Parameters
    ----------
    size : int
        Exclusive upper bound on ``len(text)``.
    """

    def __init__(self, size):
        self.size = size

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the subset of ``X`` with short-enough texts.

        The surviving rows keep their original index labels.
        """
        # An explicit boolean ndarray is always treated as a row mask. A plain
        # empty list would be read by pandas as "select no columns", so an
        # empty input frame would lose its columns.
        keep = np.array([len(text) < self.size for text in X['text']], dtype=bool)
        return X[keep]

In [None]:
# remove punctuations
class remove_punctuation(BaseEstimator, TransformerMixin):
    """Collapse every run of non-word characters in ``text`` to one space."""

    _NON_WORD = re.compile(r'\W+', flags=re.M)

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new (``author``, ``text``) frame with punctuation removed."""
        stripped = [self._NON_WORD.sub(' ', text) for text in X['text']]
        table = np.c_[X['author'], stripped]
        return pd.DataFrame(table, columns=['author', 'text'])

In [None]:
# remove key words
class remove_key_words(BaseEstimator, TransformerMixin):
    """Delete every occurrence of the given substrings from ``text``.

    Parameters
    ----------
    remove_list : iterable of str
        Substrings to strip, applied in the given order.
    """

    def __init__(self, remove_list):
        self.remove_list = remove_list

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def _strip(self, text):
        """Remove each entry of ``remove_list`` from ``text`` in turn."""
        for key_word in self.remove_list:
            text = text.replace(key_word, '')
        return text

    def transform(self, X, y=None):
        """Return a new (``author``, ``text``) frame with key words removed."""
        stripped = [self._strip(text) for text in X['text']]
        table = np.c_[X['author'], stripped]
        return pd.DataFrame(table, columns=['author', 'text'])

In [None]:
# remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
class remove_stopwords(BaseEstimator, TransformerMixin):
    """Drop stop words from ``text``.

    Parameters
    ----------
    stopwords : iterable of str, default=NLTK English stop words
        Tokens to discard. Stored unmodified, as scikit-learn expects of
        constructor parameters.
    """

    def __init__(self, stopwords=stopwords):
        self.stopwords = stopwords

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new (``author``, ``text``) frame without stop words.

        Each text is tokenized with ``nltk.tokenize.word_tokenize``; the kept
        tokens are emitted each preceded by a single space, so a non-empty
        result starts with a space, as it always has.
        """
        # Build the lookup set once per call: membership tests against the
        # raw list cost O(len(stopwords)) for every token.
        blocked = set(self.stopwords)
        X_transformed = []
        for text in X['text']:
            kept = (word for word in nltk.tokenize.word_tokenize(text)
                    if word not in blocked)
            # Single join instead of re-building the string on every token.
            X_transformed.append(''.join(' ' + word for word in kept))

        return pd.DataFrame(np.c_[X['author'], X_transformed], columns = ['author', 'text'])

In [None]:
# counting the number of words in a text
class wordCount(BaseEstimator, TransformerMixin):
    """Turn each text into a ``Counter`` of its whitespace-separated words.

    Parameters
    ----------
    stemming : bool, default=True
        When true, words are reduced with the module-level ``stem`` function
        and the counts of words sharing a stem are summed.
    """

    def __init__(self, stemming=True):
        self.stemming = stemming

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def _count(self, text):
        """Return the (optionally stemmed) word counts of one text."""
        raw_counts = Counter(text.split())
        if not self.stemming:
            return raw_counts
        merged = Counter()
        for token, occurrences in raw_counts.items():
            merged[stem(token)] += occurrences
        return merged

    def transform(self, X, y=None):
        """Return a new frame with columns ``author`` and ``word_counts``."""
        counters = [self._count(text) for text in X['text']]
        table = np.c_[X['author'], counters]
        return pd.DataFrame(table, columns=['author', 'word_counts'])

In [None]:
# transfering words to vectors
class toVector(BaseEstimator, TransformerMixin):
    """Convert per-document word counters into a sparse bag-of-words frame.

    Parameters
    ----------
    vocabulary_size : int, default=10000
        Number of most frequent words that get their own column.

    Attributes
    ----------
    vocabulary_ : dict
        Maps each retained word to its column index (starting at 1).
        Column 0 collects every out-of-vocabulary word.
    """

    def __init__(self, vocabulary_size=10000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn the vocabulary from ``X['word_counts']``."""
        total_count = Counter()
        for word_count in X['word_counts']:
            for word, count in word_count.items():
                # Cap the per-document contribution at 10 so a single document
                # cannot dominate the ranking.
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index
                            for index, (word, _) in enumerate(most_common, start=1)}
        return self

    def transform(self, X, y=None):
        """Return a frame with an ``author`` column followed by one sparse
        column per vocabulary slot (slot 0 = unknown words)."""
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X['word_counts']):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        csr = csr_matrix((data, (rows, cols)), shape=(len(X), len(self.vocabulary_) + 1))
        df = pd.DataFrame.sparse.from_spmatrix(csr)
        # Insert by position, not by index label: `df` has a fresh 0..n-1
        # index, so passing the Series itself would misalign (or produce NaN)
        # whenever X carries a non-default index.
        df.insert(0, 'author', X['author'].to_numpy())
        return df

In [None]:
class empath(BaseEstimator, TransformerMixin):
    """Score each text with the Empath lexicon.

    The output frame has an ``author`` column followed by one column per key
    of the dictionaries returned by ``Empath().analyze``.
    """

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the normalized Empath scores of every ``X['text']`` entry."""
        # One analyzer for the whole batch instead of a new one per text.
        lexicon = Empath()
        scores = [lexicon.analyze(text, normalize=True) for text in X['text']]
        df = pd.DataFrame(scores)
        # Insert by position: `df` has a fresh 0..n-1 index, so passing the
        # Series would misalign authors when X has a non-default index.
        df.insert(0, 'author', X['author'].to_numpy())
        return df

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess

# Root data directory on the mounted Drive.
path = '/content/drive/MyDrive/data_23/'

# Maps each class label to a list of source names. Later cells use the keys
# as classifier class labels and scan `path + name` for each listed name.
# NOTE(review): 'aspergaers' under 'autism' looks like a possible misspelling
# of 'aspergers' — confirm it matches the actual folder name.
groups = {'general': ['MentalHealthSupport','mentalhealth','mental','personalitydisorders',
                      'mentalillness','MentalHealthPH'],
          
          'control': ['askscience','askscience2','LifeProTips','LifeProTips2','AskReddit','AskReddit2',
                      'answers','answers2', 'AskScienceFiction','AskScienceFiction2','TrueAskReddit',
                      'TrueAskReddit2'],

          'adhd': ['ADHD','ADHD2'],

          'autism': ['aspergaers','autism2','AutisticQueers','AutismInWomen','Aspergers_Elders',
                     'asperger','AutisticPride','autism','AutismTranslated','aspergers_dating',
                     'aspergirls','AutisticAdults'],
          
          'anxiety': ['anxiety'],

          'ocd': ['OCD'],

          'ptsd': ['ptsd','CPTSD'],

          'phobia': ['Phobia','emetophobia','Agoraphobia'],

          'socialanxiety':['socialanxiety','socialanxiety2'],
          
          'depression': ['depression1','depression2','depression3'],

          'sadness': ['sad11','sad22','sad33'],
          
          'bipolar': ['bipolar','BipolarReddit'],
          
          'schizophrenia': ['schizophrenia','paranoidschizophrenia','schizoaffective','Psychosis'],

          'cluster_a': ['Schizoid','Schizotypal','ParanoidPersonality',
                        'Paranoid','ParanoiaCheck','Paranoia'],
          'cluster_b': ['BorderlinePDisorder','BPD','Borderline','hpd','NPD','narcissism',
                        'sociopath', 'psychopath','Psychopathy','aspd'],
          'cluster_c': ['OCPD','AvPD','Avoidant', 'DPD'],

          'selfharm': ['selfharm','StopSelfHarm','AdultSelfHarm',
                       'SuicideWatch11','SuicideWatch22','SuicideWatch33'],
          
          'addiction': ['addiction','alcoholism'],

          'eating': ['ARFID', 'bulimia','eating_disorders','EDAnonymous','EatingDisorders'],

          'dpdr': ['dpdr'],
          'dysmorphic': ['DysmorphicDisorder', 'BodyAcceptance'],
          'tourettes': ['Tourettes'],
          'anger': ['Anger'],
          }


In [None]:
# Load the pickled word-vector model used by the vectorizer below.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The context manager makes sure the file handle is closed after loading.
with open('/content/drive/MyDrive/model/w2v.pkl','rb') as w2v_file:
    w2v = pickle.load(w2v_file)

In [None]:
def vectorize(data, maxlen=100, embedding_dim=100):
    """
    Embed a sequence of texts with the module-level ``w2v`` lookup.

    Every text is tokenized with ``simple_preprocess`` and cut to its first
    `maxlen` tokens. Each token that ``w2v`` knows fills one slot with the
    first `embedding_dim` components of its vector; unknown tokens and unused
    slots stay zero. The per-text slots are flattened, so the result has
    shape ``(len(data), maxlen * embedding_dim)``.
    """
    n_texts = len(data)
    embedded = np.zeros(shape=(n_texts, maxlen, embedding_dim))

    for text_idx, text in enumerate(data):
        # Slicing is a no-op when the text already has <= maxlen tokens.
        for slot, token in enumerate(simple_preprocess(text)[:maxlen]):
            try:
                token_vector = w2v[token]
            except KeyError:
                # Out-of-vocabulary token: leave its slot as zeros.
                continue
            embedded[text_idx, slot] = token_vector[:embedding_dim]

    return embedded.reshape(n_texts, -1)

In [None]:
# Incrementally train an SGD classifier on the first ten files, 1000 rows at a
# time, then report per-batch accuracy on the eleventh file.
clf = SGDClassifier()
classes = list(groups.keys())
files = os.listdir(path)

for f in files[:10]:
    df = pd.read_csv(path+f)
    df = df.dropna()
    # Step through the frame in fixed-size batches. The final, shorter batch
    # is included, so the tail of each file is trained on as well and no
    # stale batch from a previous iteration is ever re-fitted.
    for start in range(0, len(df), 1000):
        df_ = df.iloc[start:start+1000]
        x_b = df_.x.values
        y_b = df_.y.values
        x_tr = vectorize(x_b)
        clf.partial_fit(x_tr, y_b, classes=classes)

f = files[10]
df = pd.read_csv(path+f)
# Same NaN handling as for the training files; vectorize() needs strings.
df = df.dropna()
for start in range(0, len(df), 1000):
    df_ = df.iloc[start:start+1000]
    x_test = df_.x.values
    y_test = df_.y.values
    x_test_tr = vectorize(x_test)
    y_pred = clf.predict(x_test_tr)
    print(accuracy_score(y_test,y_pred))


In [None]:
# Scratch cell: reads the first ten files listed under `path`.
# NOTE(review): the loaded `df` is never used, and `w2v.wv()` calls `wv` with
# no arguments — in gensim `wv` is normally an attribute, not a callable, so
# this line presumably raises. Confirm the intent or delete the cell.
for f in os.listdir(path)[:10]:
    df = pd.read_csv(path+f, lineterminator='\n', engine='c')
    x_tr = w2v.wv()

In [None]:
# Read at most 200 files from `path`, keep only rows whose text `x` is longer
# than 500 characters, and stack them into one frame.
# The filtered frames are collected in a list and concatenated once; calling
# pd.concat inside the loop re-copies the growing frame for every file.
frames = [pd.DataFrame(columns=['x','y'])]
for n_files, f in enumerate(os.scandir(path), start=1):
    df_ = pd.read_csv(f, lineterminator='\n', engine='c')
    frames.append(df_[[len(text)>500 for text in df_.x]])
    if n_files==200: break
df = pd.concat(frames, ignore_index=True)
len(df)

1802816

In [None]:
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split is not pinned to a seed.
x_train, x_test, y_train, y_test = train_test_split(df.x.values,df.y.values, test_size=.2)
# Cell output: (number of training rows, number of test rows).
len(x_train), len(x_test)

(1442252, 360564)

In [None]:
# TF-IDF features followed by multinomial Naive Bayes (alpha=0.01).
pipe = Pipeline([('vec', TfidfVectorizer()),
                 ('clf', MultinomialNB(alpha=.01))])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
# Persist the fitted pipeline; the context manager guarantees the file is
# flushed and closed.
with open('/content/drive/MyDrive/model/pipe_N200_L500_NBalpha01.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)
accuracy_score(y_test,y_pred)

0.25356940792758015

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess

# Root data directory on the mounted Drive.
# NOTE(review): this cell repeats the `path` / `groups` definitions of an
# earlier cell verbatim; keep the two in sync or drop one.
path = '/content/drive/MyDrive/data_23/'

# Maps each class label to a list of source names. Later cells use the keys
# as classifier class labels and scan `path + name` for each listed name.
# NOTE(review): 'aspergaers' under 'autism' looks like a possible misspelling
# of 'aspergers' — confirm it matches the actual folder name.
groups = {'general': ['MentalHealthSupport','mentalhealth','mental','personalitydisorders',
                      'mentalillness','MentalHealthPH'],
          
          'control': ['askscience','askscience2','LifeProTips','LifeProTips2','AskReddit','AskReddit2',
                      'answers','answers2', 'AskScienceFiction','AskScienceFiction2','TrueAskReddit',
                      'TrueAskReddit2'],

          'adhd': ['ADHD','ADHD2'],

          'autism': ['aspergaers','autism2','AutisticQueers','AutismInWomen','Aspergers_Elders',
                     'asperger','AutisticPride','autism','AutismTranslated','aspergers_dating',
                     'aspergirls','AutisticAdults'],
          
          'anxiety': ['anxiety'],

          'ocd': ['OCD'],

          'ptsd': ['ptsd','CPTSD'],

          'phobia': ['Phobia','emetophobia','Agoraphobia'],

          'socialanxiety':['socialanxiety','socialanxiety2'],
          
          'depression': ['depression1','depression2','depression3'],

          'sadness': ['sad11','sad22','sad33'],
          
          'bipolar': ['bipolar','BipolarReddit'],
          
          'schizophrenia': ['schizophrenia','paranoidschizophrenia','schizoaffective','Psychosis'],

          'cluster_a': ['Schizoid','Schizotypal','ParanoidPersonality',
                        'Paranoid','ParanoiaCheck','Paranoia'],
          'cluster_b': ['BorderlinePDisorder','BPD','Borderline','hpd','NPD','narcissism',
                        'sociopath', 'psychopath','Psychopathy','aspd'],
          'cluster_c': ['OCPD','AvPD','Avoidant', 'DPD'],

          'selfharm': ['selfharm','StopSelfHarm','AdultSelfHarm',
                       'SuicideWatch11','SuicideWatch22','SuicideWatch33'],
          
          'addiction': ['addiction','alcoholism'],

          'eating': ['ARFID', 'bulimia','eating_disorders','EDAnonymous','EatingDisorders'],

          'dpdr': ['dpdr'],
          'dysmorphic': ['DysmorphicDisorder', 'BodyAcceptance'],
          'tourettes': ['Tourettes'],
          'anger': ['Anger'],
          }


In [None]:
# Experiment: TF-IDF + MultinomialNB on at most 250 files, texts > 500 chars.
# Filtered frames are collected and concatenated once; concatenating inside
# the loop re-copies the growing frame for every file.
frames = [pd.DataFrame(columns=['x','y'])]
for n_files, f in enumerate(os.scandir(path), start=1):
    df_ = pd.read_csv(f, lineterminator='\n', engine='c')
    frames.append(df_[[len(text)>500 for text in df_.x]])
    if n_files==250: break
df = pd.concat(frames, ignore_index=True)

# 80/20 split (unseeded).
x_train, x_test, y_train, y_test = train_test_split(df.x.values,df.y.values, test_size=.2)
pipe = Pipeline([('vec', TfidfVectorizer()),
                 ('clf', MultinomialNB(alpha=.01))])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
# Persist the fitted pipeline; the context manager closes the file reliably.
with open('/content/drive/MyDrive/model/pipe_N250_L500_NBalpha01.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)
accuracy_score(y_test,y_pred)

0.25008498810166574

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Experiment: TF-IDF + RandomForest on at most 50 files, texts > 500 chars.
# Filtered frames are collected and concatenated once; concatenating inside
# the loop re-copies the growing frame for every file.
frames = [pd.DataFrame(columns=['x','y'])]
for n_files, f in enumerate(os.scandir(path), start=1):
    df_ = pd.read_csv(f, lineterminator='\n', engine='c')
    frames.append(df_[[len(text)>500 for text in df_.x]])
    if n_files==50: break
df = pd.concat(frames, ignore_index=True)

# 80/20 split (unseeded).
x_train, x_test, y_train, y_test = train_test_split(df.x.values,df.y.values, test_size=.2)
pipe = Pipeline([('vec', TfidfVectorizer()),
                 ('clf', RandomForestClassifier())])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

accuracy_score(y_test,y_pred)

0.1908152054493518

In [None]:
# Experiment: TF-IDF + MultinomialNB on at most 400 files, texts > 500 chars.
# Filtered frames are collected and concatenated once; concatenating inside
# the loop re-copies the growing frame for every file.
frames = [pd.DataFrame(columns=['x','y'])]
for n_files, f in enumerate(os.scandir(path), start=1):
    df_ = pd.read_csv(f, lineterminator='\n', engine='c')
    frames.append(df_[[len(text)>500 for text in df_.x]])
    if n_files==400: break
df = pd.concat(frames, ignore_index=True)

# 80/20 split (unseeded).
x_train, x_test, y_train, y_test = train_test_split(df.x.values,df.y.values, test_size=.2)
pipe = Pipeline([('vec', TfidfVectorizer()),
                 ('clf', MultinomialNB(alpha=.01))])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
accuracy_score(y_test,y_pred)

In [None]:
# Experiment: hashing + TF-IDF + MultinomialNB on at most 200 files,
# texts > 500 chars.
# Filtered frames are collected and concatenated once; concatenating inside
# the loop re-copies the growing frame for every file.
frames = [pd.DataFrame(columns=['x','y'])]
for n_files, f in enumerate(os.scandir(path), start=1):
    df_ = pd.read_csv(f, lineterminator='\n', engine='c')
    frames.append(df_[[len(text)>500 for text in df_.x]])
    if n_files==200: break
df = pd.concat(frames, ignore_index=True)

# 80/20 split (unseeded).
x_train, x_test, y_train, y_test = train_test_split(df.x.values,df.y.values, test_size=.2)
# alternate_sign=False keeps the hashed features non-negative, which
# MultinomialNB requires.
pipe = Pipeline([('hash', HashingVectorizer(alternate_sign=False)),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB(alpha=.01))])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
accuracy_score(y_test,y_pred)

0.2467467634040004

In [None]:
# Experiment: hashing + TF-IDF + MultinomialNB on at most 400 files,
# texts > 500 chars.
# Filtered frames are collected and concatenated once; concatenating inside
# the loop re-copies the growing frame for every file.
frames = [pd.DataFrame(columns=['x','y'])]
for n_files, f in enumerate(os.scandir(path), start=1):
    df_ = pd.read_csv(f, lineterminator='\n', engine='c')
    frames.append(df_[[len(text)>500 for text in df_.x]])
    if n_files==400: break
df = pd.concat(frames, ignore_index=True)

# 80/20 split (unseeded).
x_train, x_test, y_train, y_test = train_test_split(df.x.values,df.y.values, test_size=.2)
# alternate_sign=False keeps the hashed features non-negative, which
# MultinomialNB requires.
pipe = Pipeline([('hash', HashingVectorizer(alternate_sign=False)),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB(alpha=.01))])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
accuracy_score(y_test,y_pred)

0.23536583358932725

In [None]:
# Build the labelled corpus: for every group, shuffle the files found in its
# folders, read at most 16 of them, and keep the texts longer than 150
# characters, labelled with the group name.
text_parts = []
label_parts = []
for g in groups:
    files=[]
    for folder in groups[g]:
        files.extend(os.scandir(path+folder))
    shuffle(files)  # unseeded: the sampled files differ between runs
    # The old counter broke out when it reached 17, i.e. after 16 files.
    for f in files[:16]:
        df = pd.read_csv(f, usecols=['text'], lineterminator='\n', engine='c')
        idx=[len(x)>150 for x in df.text]
        txt = df.text[idx]
        text_parts.append(np.array(txt))
        label_parts.append(np.full(len(txt), g))

# One concatenation at the end; concatenating inside the loop re-copies the
# growing arrays for every file.
X = np.concatenate(text_parts)
y = np.concatenate(label_parts)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.15)
vec = HashingVectorizer()
X_train_tr = vec.fit_transform(X_train)
X_test_tr = vec.transform(X_test)

# Persist the vectorizer and the split; each file is closed as soon as it is
# written.
artifacts = {'vec': vec,
             'X_train_tr': X_train_tr,
             'X_test_tr': X_test_tr,
             'y_train': y_train,
             'y_test': y_test}
for artifact_name, artifact in artifacts.items():
    with open('/content/drive/MyDrive/model/' + artifact_name + '.pkl', 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
import pickle

def _load_model_artifact(name):
    """Unpickle ``/content/drive/MyDrive/model/<name>.pkl`` and close the file.

    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    """
    with open('/content/drive/MyDrive/model/' + name + '.pkl', 'rb') as artifact_file:
        return pickle.load(artifact_file)

# vec = _load_model_artifact('vec')
X_train_tr = _load_model_artifact('X_train_tr')
X_test_tr = _load_model_artifact('X_test_tr')
y_train = _load_model_artifact('y_train')
y_test = _load_model_artifact('y_test')

In [None]:
# One-vs-rest linear classifier trained incrementally in batches of 1000 rows,
# checkpointing the model and the last batch offset after every batch.
clf = OneVsRestClassifier(SGDClassifier(), n_jobs=-1)

k=['general','control','adhd','autism','anxiety','depression','sadness','bipolar','schizophrenia','anger',
 'cluster_a','cluster_b','cluster_c','selfharm','addiction','eating','dpdr','dysmorphic','tourettes']
# partial_fit must be told about every label it can ever see. Add any label
# that actually occurs in y_train but is missing from the hand-written list.
k = sorted(set(k) | set(np.unique(y_train).tolist()))

# Walk over ALL rows in steps of 1000. The previous condition compared the
# row offset `i` with the number of batches (len // 1000), so the loop
# stopped after only a couple of batches.
for i in range(0, X_train_tr.shape[0], 1000):
    clf.partial_fit(X_train_tr[i:i+1000], y_train[i:i+1000], classes=k)
    with open('/content/drive/MyDrive/model/clf.pkl', 'wb') as clf_file:
        pickle.dump(clf, clf_file)
    with open('/content/drive/MyDrive/model/last_i.pkl', 'wb') as offset_file:
        pickle.dump(i, offset_file)

y_pred = clf.predict(X_test_tr)
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))

In [None]:
# Scratch inspection of a single row of the training matrix; depends on
# whatever value `i` was left with by a previously executed cell.
X_train_tr[i+3:i+4]

<1x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 29 stored elements in Compressed Sparse Row format>

In [None]:
# Fit a TF-IDF vocabulary on the training texts only, then apply the same
# fitted vectorizer to the test texts.
vec = TfidfVectorizer()
X_train_tr = vec.fit_transform(X_train)
X_test_tr  = vec.transform(X_test)

In [None]:
# Multinomial Naive Bayes with smoothing alpha=0.01 on the TF-IDF features.
# NOTE(review): the recorded cell output shows alpha=0.001, so that output was
# produced by an earlier version of this cell.
clf = MultinomialNB(alpha=.01)
clf.fit(X_train_tr,y_train)

MultinomialNB(alpha=0.001)

In [None]:
# Predicted labels for the held-out test features.
y_pred = clf.predict(X_test_tr)

In [None]:
# Accuracy first, then precision / recall / F1 under micro and macro averaging.
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))
for averaging in ('micro', 'macro'):
    print('Precision ({}): {:.2f}%'.format(
        averaging, 100*precision_score(y_test, y_pred, average=averaging)))
    print('Recall ({}): {:.2f}%'.format(
        averaging, 100*recall_score(y_test, y_pred, average=averaging)))
    # A blank line separates the micro block from the macro block.
    block_end = '\n' if averaging == 'micro' else ''
    print('F1 ({}): {:.2f}%{}'.format(
        averaging, 100*f1_score(y_test, y_pred, average=averaging), block_end))

Accuracy: 21.66%

Precision (micro): 21.66%
Recall (micro): 21.66%
F1 (micro): 21.66%

Precision (macro): 23.48%
Recall (macro): 22.77%
F1 (macro): 22.56%


In [None]:
# Persist the fitted classifier and vectorizer under `path`; the context
# managers make sure both files are flushed and closed.
with open(path+'clf.pkl', 'wb') as clf_file:
    pickle.dump(clf, clf_file)
with open(path+'vec.pkl', 'wb') as vec_file:
    pickle.dump(vec, vec_file)

In [None]:
# Smoke test on one encyclopedia-style paragraph: vectorize it with the fitted
# `vec` and show the classifier's per-class probabilities.
xx = ["Wikipedia has received praise for its enablement of the democratization of knowledge, extent of coverage, unique structure, culture, and reduced degree of commercial bias; but criticism for exhibiting systemic bias, particularly gender bias against women and alleged ideological bias.[13][14] The reliability of Wikipedia was frequently criticized in the 2000s but has improved over time, as Wikipedia has been generally praised in the late 2010s and early 2020s.[3][13][15] The website's coverage of controversial topics such as American politics and major events like the COVID-19 pandemic has received substantial media attention. It has been censored by world governments, ranging from specific pages to the entire site. Nevertheless, Wikipedia has become an element of popular culture, with references in books, films, and academic studies. In April 2018, Facebook and YouTube announced that they would help users detect fake news by suggesting fact-checking links to related Wikipedia articles.[16][17] Articles on breaking news are often accessed as a source of frequently updated information about those events"]
xx_tr = vec.transform(xx)
clf.predict_proba(xx_tr)

array([[3.81853967e-10, 3.62916963e-05, 2.85033089e-10, 8.00661799e-18,
        1.26523161e-07, 8.90305929e-14, 5.77042579e-02, 1.22176247e-03,
        1.14104877e-09, 4.06144758e-15, 1.21280732e-10, 2.74370014e-44,
        9.41037549e-01, 9.31560767e-10, 4.73358119e-14, 9.73188602e-09,
        1.71139763e-23]])

In [None]:
# Class labels in the classifier's own order, and the probabilities predicted
# for the single sample in `xx_tr` (row 0), in that same order.
labels = list(clf.classes_)
pred = list(clf.predict_proba(xx_tr)[0])

In [None]:
# Pair each label with its probability, most probable first.
lst = sorted(zip(labels, pred), key=lambda pair: pair[1], reverse=True)

In [None]:
# Split the ranked (label, probability) pairs back into two parallel lists.
pred = [y for x,y in lst]
labels = [x for x,y in lst]
# Cell output: labels ordered from most to least probable.
labels

['aspergers',
 'OCPD',
 'Schizoid',
 'ARFID',
 'DysmorphicDisorder',
 'schizophrenia',
 'Tourettes',
 'depression1',
 'ADHD',
 'Anger',
 'anxiety',
 'MentalHealthSupport',
 'dpdr',
 'addiction',
 'BorderlinePDisorder',
 'selfharm',
 'askscience']

In [None]:
# Display the probabilities currently held in `pred`.
list(pred)

[array([3.81853967e-10, 3.62916963e-05, 2.85033089e-10, 8.00661799e-18,
        1.26523161e-07, 8.90305929e-14, 5.77042579e-02, 1.22176247e-03,
        1.14104877e-09, 4.06144758e-15, 1.21280732e-10, 2.74370014e-44,
        9.41037549e-01, 9.31560767e-10, 4.73358119e-14, 9.73188602e-09,
        1.71139763e-23])]

In [None]:
# Baseline: accuracy (in percent) of guessing uniformly among 17 classes.
by_chance = 1/17 * 100
# '%' needs no escaping in str.format; the former '\%' was an invalid escape
# sequence and printed a literal backslash.
print('classifying by chance would result in {:.2f}% accuracy.\nSo comparing to that, the model is doing a good job?!'.format(by_chance))
      

classifying by chance would result in 5.88\% accuracy.
So comparing to that, the model is doing a good job?!


In [None]:
# Logistic regression with default settings on the vectorized texts.
clf = LogisticRegression()
clf.fit(X_train_tr,y_train)
y_pred = clf.predict(X_test_tr)

# Accuracy first, then precision / recall / F1 under micro and macro averaging.
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))
for averaging in ('micro', 'macro'):
    print('Precision ({}): {:.2f}%'.format(
        averaging, 100*precision_score(y_test, y_pred, average=averaging)))
    print('Recall ({}): {:.2f}%'.format(
        averaging, 100*recall_score(y_test, y_pred, average=averaging)))
    # A blank line separates the micro block from the macro block.
    block_end = '\n' if averaging == 'micro' else ''
    print('F1 ({}): {:.2f}%{}'.format(
        averaging, 100*f1_score(y_test, y_pred, average=averaging), block_end))

In [None]:
# Histogram gradient boosting with default settings.
# NOTE(review): X_train_tr comes from a text vectorizer and is presumably a
# scipy sparse matrix; confirm this estimator accepts sparse input in the
# installed scikit-learn version.
clf = HistGradientBoostingClassifier()
clf.fit(X_train_tr,y_train)
y_pred = clf.predict(X_test_tr)

# Accuracy first, then precision / recall / F1 under micro and macro averaging.
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))
for averaging in ('micro', 'macro'):
    print('Precision ({}): {:.2f}%'.format(
        averaging, 100*precision_score(y_test, y_pred, average=averaging)))
    print('Recall ({}): {:.2f}%'.format(
        averaging, 100*recall_score(y_test, y_pred, average=averaging)))
    # A blank line separates the micro block from the macro block.
    block_end = '\n' if averaging == 'micro' else ''
    print('F1 ({}): {:.2f}%{}'.format(
        averaging, 100*f1_score(y_test, y_pred, average=averaging), block_end))

In [None]:
# Multi-layer perceptron with default settings on the vectorized texts.
clf = MLPClassifier()
clf.fit(X_train_tr,y_train)
y_pred = clf.predict(X_test_tr)

# Accuracy first, then precision / recall / F1 under micro and macro averaging.
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))
for averaging in ('micro', 'macro'):
    print('Precision ({}): {:.2f}%'.format(
        averaging, 100*precision_score(y_test, y_pred, average=averaging)))
    print('Recall ({}): {:.2f}%'.format(
        averaging, 100*recall_score(y_test, y_pred, average=averaging)))
    # A blank line separates the micro block from the macro block.
    block_end = '\n' if averaging == 'micro' else ''
    print('F1 ({}): {:.2f}%{}'.format(
        averaging, 100*f1_score(y_test, y_pred, average=averaging), block_end))

In [None]:
# Random forest with default settings on the vectorized texts.
# The class is spelled `RandomForestClassifier`; the previous lower-case "f"
# raised NameError before anything was trained.
clf = RandomForestClassifier()
clf.fit(X_train_tr,y_train)
y_pred = clf.predict(X_test_tr)

# Accuracy first, then precision / recall / F1 under micro and macro averaging.
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))
for averaging in ('micro', 'macro'):
    print('Precision ({}): {:.2f}%'.format(
        averaging, 100*precision_score(y_test, y_pred, average=averaging)))
    print('Recall ({}): {:.2f}%'.format(
        averaging, 100*recall_score(y_test, y_pred, average=averaging)))
    # A blank line separates the micro block from the macro block.
    block_end = '\n' if averaging == 'micro' else ''
    print('F1 ({}): {:.2f}%{}'.format(
        averaging, 100*f1_score(y_test, y_pred, average=averaging), block_end))

In [None]:
# Stacked ensemble of four base classifiers combined by a ridge classifier.
# Fixes relative to the previous cell, which could not even be parsed:
#   * the StackingClassifier(...) call was missing its closing parenthesis;
#   * `RandomforestClassifier` is spelled `RandomForestClassifier`;
#   * `Ridge` was never imported and is a regressor; the final estimator of a
#     StackingClassifier must be a classifier, so the already imported
#     `RidgeClassifier` is used (presumably what was intended — confirm).
# NOTE(review): confirm HistGradientBoostingClassifier accepts the sparse
# features produced by the text vectorizer in the installed version.
clf = StackingClassifier(estimators=[('forest',  RandomForestClassifier()),
                                     ('gboost',  HistGradientBoostingClassifier()),
                                     ('MLP',     MLPClassifier()),
                                     ('passAgg', PassiveAggressiveClassifier())],
                         final_estimator=RidgeClassifier())

clf.fit(X_train_tr,y_train)
y_pred = clf.predict(X_test_tr)

# Accuracy first, then precision / recall / F1 under micro and macro averaging.
print('Accuracy: {:.2f}%\n'.format(100*accuracy_score(y_test, y_pred)))
for averaging in ('micro', 'macro'):
    print('Precision ({}): {:.2f}%'.format(
        averaging, 100*precision_score(y_test, y_pred, average=averaging)))
    print('Recall ({}): {:.2f}%'.format(
        averaging, 100*recall_score(y_test, y_pred, average=averaging)))
    # A blank line separates the micro block from the macro block.
    block_end = '\n' if averaging == 'micro' else ''
    print('F1 ({}): {:.2f}%{}'.format(
        averaging, 100*f1_score(y_test, y_pred, average=averaging), block_end))