In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from random import shuffle
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier,LogisticRegression
import gensim
import gensim.downloader as gensim_api

# Working directory for the data files; the training cell chdir()s into it
# and reads every entry it lists as a CSV.
path = '/content/drive/MyDrive/data_23/'

# Maps each class label to the list of source names folded into that label
# (the names look like subreddit names -- presumably where the posts were
# collected from; confirm against the data-collection step).
# The keys, in this order, are handed to the classifier as its label set.
groups = {
    'general': [
        'MentalHealthSupport', 'mentalhealth', 'mental',
        'personalitydisorders', 'mentalillness', 'MentalHealthPH',
    ],
    'control': [
        'askscience', 'askscience2',
        'LifeProTips', 'LifeProTips2',
        'AskReddit', 'AskReddit2',
        'answers', 'answers2',
        'AskScienceFiction', 'AskScienceFiction2',
        'TrueAskReddit', 'TrueAskReddit2',
    ],
    'adhd': ['ADHD', 'ADHD2'],
    'autism': [
        'aspergers', 'autism2', 'AutisticQueers', 'AutismInWomen',
        'Aspergers_Elders', 'asperger', 'AutisticPride', 'autism',
        'AutismTranslated', 'aspergers_dating', 'aspergirls',
        'AutisticAdults',
    ],
    'anxiety': ['anxiety'],
    'ocd': ['OCD'],
    'ptsd': ['ptsd', 'CPTSD'],
    'phobia': ['Phobia', 'emetophobia', 'Agoraphobia'],
    'socialanxiety': ['socialanxiety', 'socialanxiety2'],
    'depression': ['depression1', 'depression2', 'depression3'],
    'sadness': ['sad11', 'sad22', 'sad33'],
    'bipolar': ['bipolar', 'BipolarReddit'],
    'schizophrenia': [
        'schizophrenia', 'paranoidschizophrenia', 'schizoaffective',
        'Psychosis',
    ],
    'cluster_a': [
        'Schizoid', 'Schizotypal', 'ParanoidPersonality',
        'Paranoid', 'ParanoiaCheck', 'Paranoia',
    ],
    'cluster_b': [
        'BorderlinePDisorder', 'BPD', 'Borderline', 'hpd', 'NPD',
        'narcissism', 'sociopath', 'psychopath', 'Psychopathy', 'aspd',
    ],
    'cluster_c': ['OCPD', 'AvPD', 'Avoidant', 'DPD'],
    'selfharm': [
        'selfharm', 'StopSelfHarm', 'AdultSelfHarm',
        'SuicideWatch11', 'SuicideWatch22', 'SuicideWatch33',
    ],
    'addiction': ['addiction', 'alcoholism'],
    'eating': [
        'ARFID', 'bulimia', 'eating_disorders', 'EDAnonymous',
        'EatingDisorders',
    ],
    'dpdr': ['dpdr'],
    'dysmorphic': ['DysmorphicDisorder', 'BodyAcceptance'],
    'tourettes': ['Tourettes'],
    'anger': ['Anger'],
}

# Pretrained word-vector model fetched by name through gensim's downloader API.
# embedding_feats() uses it for `token in nlp` membership tests and `nlp[token]`
# vector lookups, and assumes 300-dimensional vectors.
nlp = gensim_api.load("word2vec-google-news-300")

def embedding_feats(list_of_sents, model=None):
    """Embed each sentence as the mean of its in-vocabulary word vectors.

    Each sentence is split on whitespace; tokens missing from the model are
    ignored. A sentence with no known token (including an empty string) maps
    to an all-zero vector.

    Args:
        list_of_sents: Iterable of strings, one per document.
        model: Optional word-vector lookup supporting ``token in model`` and
            ``model[token]``. When omitted, the module-level ``nlp`` model is
            used, so existing single-argument callers keep working.

    Returns:
        A list with one 1-D float numpy array per input sentence, in order.
    """
    DIMENSION = 300
    if model is None:
        model = nlp
    # Use the model's own dimensionality when it advertises one (gensim
    # KeyedVectors expose ``vector_size``); otherwise fall back to 300.
    dimension = getattr(model, "vector_size", DIMENSION)

    feats = []
    for sents in list_of_sents:
        # Fresh accumulator per sentence so results never share storage.
        feat_for_this = np.zeros(dimension)
        count_for_this = 0
        for token in sents.split():
            if token in model:
                feat_for_this += model[token]
                count_for_this += 1
        # Divide by the exact token count. The guard (rather than an epsilon
        # added to the count) avoids division by zero without biasing the mean.
        if count_for_this:
            feat_for_this /= count_for_this
        feats.append(feat_for_this)
    return feats




In [None]:
# Incrementally train an SGD classifier on averaged word-vector features.
# The first N_TEST_FILES files (after shuffling) form a fixed held-out test set;
# every remaining file is one partial_fit batch, scored against that test set.
MIN_CHARS = 500          # keep only posts longer than this many characters
N_TEST_FILES = 20        # number of shuffled files reserved for evaluation
CHECKPOINT_EVERY = 1000  # pickle the model every this many training files
MODEL_DIR = '/content/drive/MyDrive/model/'


def _load_long_posts(csv_file, min_chars=MIN_CHARS):
    """Read one CSV and keep the rows whose ``x`` text exceeds ``min_chars``.

    Non-string cells (e.g. NaN from an empty field) are dropped instead of
    raising on ``len()``.
    """
    frame = pd.read_csv(csv_file, lineterminator='\n', engine='c')
    # A real boolean ndarray is always treated as a row mask, even when the
    # frame is empty (an empty Python list would select zero columns instead).
    keep = np.fromiter(
        (isinstance(text, str) and len(text) > min_chars for text in frame.x),
        dtype=bool,
        count=len(frame),
    )
    return frame.loc[keep]


def _save_checkpoint(model, step, score):
    """Pickle ``model`` under a name encoding the step and its test score."""
    target = '{}SGD_N{}_L{}_W2v_{}.pkl'.format(
        MODEL_DIR, step, MIN_CHARS, str(round(score, 4)))
    # Context manager guarantees the file is flushed and closed.
    with open(target, 'wb') as fh:
        pickle.dump(model, fh)


os.chdir(path)
lst = os.listdir()
shuffle(lst)
embed = FunctionTransformer(embedding_feats)
clf = SGDClassifier()

# Build the held-out set with a single concat instead of growing a frame
# inside the loop.
test_frames = [_load_long_posts(f) for f in lst[:N_TEST_FILES]]
df = pd.concat(test_frames, ignore_index=True)
y_test = df.y.values
x_test = embed.fit_transform(df.x.values)
del df, test_frames

classes = list(groups.keys())  # full label set, required by partial_fit
s = None                       # last test score; None until a batch is fitted
for i, f in enumerate(lst[N_TEST_FILES:]):
    df = _load_long_posts(f)
    if df.empty:
        # Nothing survived the length filter; a zero-sample batch cannot be
        # fitted, so skip this file.
        continue
    x_train = embed.fit_transform(df.x.values)
    clf.partial_fit(x_train, df.y.values, classes=classes)
    s = clf.score(x_test, y_test)
    print(i, ', score:', s)
    if not i % CHECKPOINT_EVERY:
        _save_checkpoint(clf, i, s)

# Final snapshot -- only if at least one batch was actually trained on.
if s is not None:
    _save_checkpoint(clf, i, s)

0 , score: 0.11650638928867373
1 , score: 0.13601770683414016
2 , score: 0.1490961228139038
3 , score: 0.14148022155349121
4 , score: 0.14495834968816781
5 , score: 0.15078612237777486
6 , score: 0.13365715905621703
7 , score: 0.09955187753499935
8 , score: 0.08299533342055912
9 , score: 0.1276549347987265
10 , score: 0.03092699201884077
11 , score: 0.13840006105804875
12 , score: 0.021681058921017054
13 , score: 0.07570107723843168
14 , score: 0.11228139037899604
15 , score: 0.12533254830127785
16 , score: 0.14390073705787432
17 , score: 0.11590671202407432
18 , score: 0.05599350167909634
19 , score: 0.14283767281608442
20 , score: 0.15219263814383532
21 , score: 0.13717344847136814
22 , score: 0.04542282698765755
23 , score: 0.13512909416023378
24 , score: 0.07979523747219679
25 , score: 0.15648850800296568
26 , score: 0.1602937328274238
27 , score: 0.056402372541323215
28 , score: 0.07559749662000087
29 , score: 0.07651881896288543
30 , score: 0.15543089537267216
31 , score: 0.03134