In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the project modules imported below are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import config
import util
import mlutils
import numpy as np
import predictions.models as models
from sklearn.pipeline import Pipeline, FeatureUnion
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from brownclustering import brownclusters as bc
import mlutils
import wordcloud as wc
from PIL import Image
import generate_wordclouds
# Call the brownclusters module's load() once, right after the imports.
# NOTE(review): presumably this loads the cluster data that
# bc.tokenize_and_tag relies on below — confirm against brownclustering.
bc.load()

In [None]:
def _tweet_text_pipeline(tokenizer):
    """Build a selector + TF-IDF pipeline over the 'tweets' column using `tokenizer`."""
    return Pipeline(steps=[
        ('selector', mlutils.ColumnSelector('tweets')),
        ('transformer', TfidfVectorizer(min_df=0.01, tokenizer=tokenizer)),
    ])


def _pos_tag_pipeline(column):
    """Build a selector + TF-IDF pipeline over 1- to 3-grams of the POS-tag column `column`."""
    return Pipeline(steps=[
        ('selector', mlutils.POSTagColumnSelector(column)),
        ('transformer', TfidfVectorizer(tokenizer=mlutils.tknzr.tokenize, min_df=0.01, ngram_range=(1, 3))),
    ])


# Tweet text tokenized two ways: alphanumeric tokens, and the brownclusters tokenizer.
bow = _tweet_text_pipeline(mlutils.tokenize_only_alphanumeric_tokens)
clusters = _tweet_text_pipeline(bc.tokenize_and_tag)

# POS-tag n-gram features from the two tag columns.
cmu = _pos_tag_pipeline('cmu_pos_tags')
# NOTE: this variable is named `nltk`; it would shadow the nltk package if that were imported here.
nltk = _pos_tag_pipeline('nltk_pos_tags')

In [None]:
def generate_wordcloud(words, sizes, colors, mask_path='mask_big.png', out_path='ptsd_clusters.png'):
    """Render, display and save a masked word cloud.

    Each word is drawn with the weight given in ``sizes`` and a colour derived
    from ``colors``: the colour values are min-max scaled to 0..255 and mapped
    onto a gradient from blue (lowest value) to red (highest value).

    Args:
        words: Sequence of words to draw.
        sizes: Per-word weights passed to WordCloud, matched to ``words`` by
            position (a label-indexed pandas Series is accepted).
        colors: Per-word numeric values driving the colour, matched to
            ``words`` by position.
        mask_path: Image file used as the cloud mask.
        out_path: File the rendered cloud is written to. The default is kept
            for backward compatibility; pass a distinct path per data set to
            avoid overwriting an earlier cloud.

    Raises:
        ValueError: If no words are given or the three inputs differ in length.
    """
    words = list(words)
    # Plain arrays guarantee positional matching; integer indexing into a
    # label-indexed Series is deprecated in pandas.
    sizes = np.asarray(sizes, dtype=float)
    colors = np.asarray(colors, dtype=float)

    if not words:
        raise ValueError("generate_wordcloud needs at least one word")
    if not (len(words) == len(sizes) == len(colors)):
        raise ValueError("words, sizes and colors must have the same length")

    word_freqs = {word: float(size) for word, size in zip(words, sizes)}
    word_colors = dict(zip(words, colors))

    # Min-max bounds are computed once here instead of on every colour callback.
    # No pre-division by max(colors): it is redundant before min-max scaling and
    # would invert the gradient (or divide by zero) when the maximum is <= 0.
    lowest = colors.min()
    span = colors.max() - lowest

    def color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
        if span == 0:
            # All words share one colour value: use the middle of the gradient.
            v = 127.5
        else:
            v = (word_colors[word] - lowest) / span * 255
        return "rgb(%d,%d,%d)" % (v, 0, 255 - v)

    # Read the mask into an array inside a context manager so the file is closed.
    with Image.open(mask_path) as mask_img:
        mask = np.array(mask_img)

    cloud = wc.WordCloud(mask=mask, prefer_horizontal=1.0, color_func=color_func,
                         background_color='white')
    img = cloud.generate_from_frequencies(word_freqs)
    img.to_image().show()
    img.to_file(out_path)

In [None]:
def compute_stats(pipeline, df):
    """Compute per-feature information gain, Cohen's d and p-values.

    The pipeline is fitted on ``df`` to obtain a TF-IDF matrix, then re-applied
    with ``use_idf`` switched off to obtain a TF matrix over the same
    vocabulary. Information gain is computed from the TF-IDF matrix; Cohen's d
    and the p-values are computed from the TF matrix.

    The vectorizer settings changed for the TF pass are restored before
    returning, so the same pipeline object can be passed again for another
    data set without carrying over ``use_idf=False`` or a fixed vocabulary.

    Args:
        pipeline: Pipeline with a 'transformer' step exposing ``vocabulary_``,
            ``use_idf``, ``vocabulary`` and feature names.
        df: DataFrame accepted by the pipeline, with a 'labels' column that is
            cast to int.

    Returns:
        pandas.DataFrame indexed by feature name with the columns
        'infogain', 'cohensd' and 'pvals'.
    """
    Y = np.array(df['labels'].astype(int))
    X_tfidf = pipeline.fit_transform(df).toarray()

    transformer = pipeline.named_steps['transformer']
    fitted_vocab = transformer.vocabulary_

    # Remember the configured values so the temporary overrides below do not
    # leak into later fits of this (shared) pipeline.
    saved_use_idf = transformer.use_idf
    saved_vocabulary = transformer.vocabulary
    try:
        transformer.use_idf = False
        transformer.vocabulary = fitted_vocab
        # NOTE(review): this relies on the vectorizer honouring a post-fit
        # change of use_idf at transform time — confirm for the installed
        # scikit-learn version.
        X_tf = pipeline.transform(df).toarray()
    finally:
        transformer.use_idf = saved_use_idf
        transformer.vocabulary = saved_vocabulary

    cohensd = mlutils.compute_cohensd(X_tf, Y)
    stats = mlutils.compute_pvals(X_tf, Y)
    pvals = [r.pvalue for r in stats]
    infogain = mlutils.compute_infogain(X_tfidf, Y)

    # get_feature_names() is absent from newer scikit-learn releases; prefer
    # its replacement and fall back for older installations.
    if hasattr(transformer, 'get_feature_names_out'):
        features = transformer.get_feature_names_out()
    else:
        features = transformer.get_feature_names()

    return pd.DataFrame(data=np.array([infogain, cohensd, pvals]).T, columns=['infogain', 'cohensd', 'pvals'], index=features)

In [None]:
# Load the two pickled frames named by config.CTRL_PTSD_HELD_OUT_FILTERED_DF and
# config.CTRL_PTSD_FILTERED_DF and stack them (each frame keeps its own index).
ptsd_frames = [
    util.load_picke_file(pickle_path)
    for pickle_path in (config.CTRL_PTSD_HELD_OUT_FILTERED_DF, config.CTRL_PTSD_FILTERED_DF)
]
ctrl_ptsd = pd.concat(ptsd_frames)

In [None]:
# Load the two pickled frames named by config.CTRL_DEPR_HELD_OUT_FILTERED_DF and
# config.CTRL_DEPR_FILTERED_DF and stack them (each frame keeps its own index).
depr_frames = [
    util.load_picke_file(pickle_path)
    for pickle_path in (config.CTRL_DEPR_HELD_OUT_FILTERED_DF, config.CTRL_DEPR_FILTERED_DF)
]
ctrl_depr = pd.concat(depr_frames)

In [None]:
# Cluster features on ctrl_depr: keep those whose p-value passes the
# Bonferroni-corrected threshold (0.05 divided by the number of features).
clusters_f_df = compute_stats(clusters, ctrl_depr)
depr_clusters_alpha = 0.05 / len(clusters_f_df)
depr_clusters_is_sig = clusters_f_df['pvals'] < depr_clusters_alpha
sig_df = clusters_f_df.loc[depr_clusters_is_sig]
generate_wordclouds.cluster_usage_word_cloud(ctrl_depr, sig_df)

In [None]:
# Bag-of-words features on ctrl_depr: export the Bonferroni-significant subset
# and the full table, both ranked by Cohen's d and then information gain.
bow_f_df = compute_stats(bow, ctrl_depr)
depr_bow_rank_cols = ['cohensd', 'infogain']
depr_bow_alpha = 0.05 / len(bow_f_df)
depr_bow_is_sig = bow_f_df['pvals'] < depr_bow_alpha
sig_bow_df = bow_f_df.loc[depr_bow_is_sig].sort_values(by=depr_bow_rank_cols, ascending=False)
sig_bow_df.to_csv('sig_features/ctrl_depr_bow_sig_features.csv')
depr_bow_ranked = bow_f_df.sort_values(by=depr_bow_rank_cols, ascending=False)
depr_bow_ranked.to_csv('sig_features/ctrl_depr_bow_all_features.csv')

In [None]:
# CMU POS-tag n-gram features on ctrl_depr. The data frame must be ctrl_depr
# (no `ctrl_df` is defined in this notebook), matching the ctrl_depr_* output
# file names below.
cmu_f_df = compute_stats(cmu, ctrl_depr)
# Keep features passing the Bonferroni-corrected threshold, ranked by Cohen's d
# and then information gain.
sig_cmu_df = cmu_f_df.loc[cmu_f_df['pvals'] < 0.05 / len(cmu_f_df)].sort_values(['cohensd', 'infogain'], ascending=False)
sig_cmu_df.to_csv('sig_features/ctrl_depr_cmu_sig_features.csv')
cmu_f_df.to_csv('sig_features/ctrl_depr_cmu_all_features.csv')

In [None]:
# NLTK POS-tag n-gram features on ctrl_depr. The data frame must be ctrl_depr
# (no `ctrl_df` is defined in this notebook), matching the ctrl_depr_* output
# file names below.
nltk_f_df = compute_stats(nltk, ctrl_depr)
# Keep features passing the Bonferroni-corrected threshold, ranked by Cohen's d
# and then information gain.
sig_nltk_df = nltk_f_df.loc[nltk_f_df['pvals'] < 0.05 / len(nltk_f_df)].sort_values(['cohensd', 'infogain'], ascending=False)
sig_nltk_df.to_csv('sig_features/ctrl_depr_nltk_sig_features.csv')
nltk_f_df.to_csv('sig_features/ctrl_depr_nltk_all_features.csv')

In [None]:
# Word cloud of the significant bag-of-words features currently in sig_bow_df:
# word size follows 'infogain', colour follows 'cohensd'.
# NOTE: generate_wordcloud saves to 'ptsd_clusters.png' by default, so this
# image is replaced when the same call runs again at the end of the notebook.
generate_wordcloud(
    words=sig_bow_df.index.values,
    sizes=sig_bow_df['infogain'],
    colors=sig_bow_df['cohensd'],
)

In [None]:
# Cluster features on ctrl_ptsd: keep Bonferroni-significant clusters whose
# Cohen's d is either above 0.4 or negative.
clusters_f_df = compute_stats(clusters, ctrl_ptsd)
ptsd_clusters_alpha = 0.05 / len(clusters_f_df)
ptsd_clusters_pass_p = clusters_f_df['pvals'] < ptsd_clusters_alpha
ptsd_clusters_pass_d = (clusters_f_df['cohensd'] > .4) | (clusters_f_df['cohensd'] < 0)
sig_df = clusters_f_df.loc[ptsd_clusters_pass_p & ptsd_clusters_pass_d]
generate_wordclouds.cluster_usage_word_cloud(ctrl_ptsd, sig_df)

In [None]:
# Bag-of-words features on ctrl_ptsd: export the Bonferroni-significant subset
# and the full table, both ranked by Cohen's d and then information gain.
bow_f_df = compute_stats(bow, ctrl_ptsd)
ptsd_bow_rank_cols = ['cohensd', 'infogain']
ptsd_bow_alpha = 0.05 / len(bow_f_df)
ptsd_bow_is_sig = bow_f_df['pvals'] < ptsd_bow_alpha
sig_bow_df = bow_f_df.loc[ptsd_bow_is_sig].sort_values(by=ptsd_bow_rank_cols, ascending=False)
sig_bow_df.to_csv('sig_features/ctrl_ptsd_bow_sig_features.csv')
ptsd_bow_ranked = bow_f_df.sort_values(by=ptsd_bow_rank_cols, ascending=False)
ptsd_bow_ranked.to_csv('sig_features/ctrl_ptsd_bow_all_features.csv')

In [None]:
# CMU POS-tag n-gram features on ctrl_ptsd: export the Bonferroni-significant
# subset (ranked by Cohen's d, then information gain) and the unsorted full table.
cmu_f_df = compute_stats(cmu, ctrl_ptsd)
ptsd_cmu_alpha = 0.05 / len(cmu_f_df)
ptsd_cmu_is_sig = cmu_f_df['pvals'] < ptsd_cmu_alpha
sig_cmu_df = cmu_f_df.loc[ptsd_cmu_is_sig].sort_values(by=['cohensd', 'infogain'], ascending=False)
sig_cmu_df.to_csv('sig_features/ctrl_ptsd_cmu_sig_features.csv')
cmu_f_df.to_csv('sig_features/ctrl_ptsd_cmu_all_features.csv')

In [None]:
# NLTK POS-tag n-gram features on ctrl_ptsd: export the Bonferroni-significant
# subset (ranked by Cohen's d, then information gain) and the unsorted full table.
nltk_f_df = compute_stats(nltk, ctrl_ptsd)
ptsd_nltk_alpha = 0.05 / len(nltk_f_df)
ptsd_nltk_is_sig = nltk_f_df['pvals'] < ptsd_nltk_alpha
sig_nltk_df = nltk_f_df.loc[ptsd_nltk_is_sig].sort_values(by=['cohensd', 'infogain'], ascending=False)
sig_nltk_df.to_csv('sig_features/ctrl_ptsd_nltk_sig_features.csv')
nltk_f_df.to_csv('sig_features/ctrl_ptsd_nltk_all_features.csv')

In [None]:
# Word cloud of the significant bag-of-words features currently in sig_bow_df
# (at this point, the ctrl_ptsd results): word size follows 'infogain', colour
# follows 'cohensd'.
generate_wordcloud(
    words=sig_bow_df.index.values,
    sizes=sig_bow_df['infogain'],
    colors=sig_bow_df['cohensd'],
)