In [1]:
import re
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from utils.calculate_pmi_features import get_data
from utils.textnormalization import split_on_word, normalize
from utils.happyfuntokenizing import Tokenizer
from utils.nonnegative_matrix_factorization import nmf_inspect, nmf_labels

# Use the ggplot look for every figure in this notebook.
plt.style.use('ggplot')
# NOTE(review): this silences *all* warning categories for the rest of the
# session, including deprecation and pandas chained-assignment warnings.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Saved figures and on-screen figures both use 200 dpi.
mpl.rc('savefig', dpi=200)

# Remaining rc overrides, applied in a single call.
params = {
    'figure.dpi': 200,
    'axes.axisbelow': True,      # draw grid lines/ticks underneath the artists
    'lines.antialiased': True,
}
plt.rcParams.update(params)

In [3]:
def sentences(s_list):
    """Turn a list of raw texts into a list of normalized, space-joined strings.

    The input is passed through ``split_on_word`` and then ``normalize``
    (both from ``utils.textnormalization``); each item of the result is
    joined back together with single spaces.

    Parameters
    ----------
    s_list : list
        Raw texts, in the form ``split_on_word`` accepts
        (presumably a list of str -- confirm against the helper).

    Returns
    -------
    list of str
        One joined string per item produced by ``normalize``.
    """
    tokenized = split_on_word(s_list)
    normalized = normalize(tokenized)

    joined = []
    for tokens in normalized:
        joined.append(' '.join(tokens))
    return joined

In [4]:
# Load the data through the project helper. Later cells read the columns
# essay0..essay9, ethnicity and TotalEssays from this frame.
df = get_data()

In [5]:
# Anything that looks like an HTML/XML tag, e.g. <br /> or <a href="...">.
TAG_RE = re.compile(r'<[^>]+>')
# Any run of whitespace (spaces, tabs, newlines). Raw string: '\s' in a plain
# string literal is an invalid escape sequence.
WHITESPACE_RE = re.compile(r'\s+')


def _clean_essay(text):
    """Replace tags with a space, collapse whitespace runs, and trim the ends."""
    return WHITESPACE_RE.sub(' ', TAG_RE.sub(' ', text)).strip()


essays = ['essay' + str(i) for i in range(10)]
for e in essays:
    # Missing essays become '' so the regex helper always receives a str.
    # Newlines are handled by the whitespace collapse inside _clean_essay, so
    # no separate (whole-cell) '\n' replacement is needed.
    df[e] = df[e].fillna('').apply(_clean_essay)

# Blank out the remaining missing values in every column.
df.fillna('', inplace=True)
# Collapse comma-separated (multiple) ethnicities into a single 'multi' level.
df['ethnicity_'] = df.ethnicity.apply(lambda x: 'multi' if ',' in x else x)
# Whitespace-delimited token count of the TotalEssays text.
# NOTE(review): TotalEssays is not built in this notebook (presumably by
# get_data()), so it may not reflect the tag stripping above -- confirm.
df['token_count'] = df.TotalEssays.str.split().str.len()

In [6]:
# Minimum number of tokens a row must have to be kept for modelling.
MIN_TOKEN_COUNT = 100

# Filter and renumber in one expression. reset_index() without inplace returns
# a new frame, so later column assignments operate on an independent object
# rather than on a filtered selection of `df`.
df_token_threshold = df[df.token_count >= MIN_TOKEN_COUNT].reset_index(drop=True)

In [7]:
# Name of the essay column whose text is vectorized and clustered below.
essay = 'essay4'

In [8]:
# One normalized string per retained row, in row order, for the vectorizer.
sents = sentences(df_token_threshold[essay].tolist())

In [9]:
# Project tokenizer instance whose bound `tokenize` method is handed to sklearn.
essay_tokenizer = Tokenizer()

tfidf = TfidfVectorizer(
    tokenizer=essay_tokenizer.tokenize,
    stop_words='english',
    min_df=0.01,        # drop terms appearing in fewer than 1% of documents
    max_df=0.5,         # drop terms appearing in more than 50% of documents
    sublinear_tf=True,  # use 1 + log(tf) instead of raw term frequency
)

# Learn the vocabulary/IDF weights and build the document-term matrix.
data = tfidf.fit_transform(sents)

## NMF

### Inspect

In [10]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version has it, and keep
# the result a plain list of str as before.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_feature_names = tfidf.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf.get_feature_names()

In [11]:
# Project helper; in the recorded output it prints the top terms of each group
# for several candidate topic counts (3, 5, 7, 9).
nmf_inspect(data, tfidf_feature_names)

3
Group 0:
favorite love movie book italian mexican rock thai hop hip

Group 1:
like i'm read don't love really good watch lot ...

Group 2:
books amp shows ... men dead black big game family


5
Group 0:
like i'm read don't really love watch good lot reading

Group 1:
books shows men amp dead black game big family bad

Group 2:
... .... list know love yes books ask goes say

Group 3:
love italian rock mexican thai hop hip jazz amp indian

Group 4:
favorite movie book time love shows include band foods probably


7
Group 0:
like i'm read don't really lot watch good i've reading

Group 1:
books shows men amp dead black game big family bad

Group 2:
rock italian mexican thai hop indian hip amp jazz chinese

Group 3:
... .... know yes books say good ..... little ask

Group 4:
love kinds new types eat cook live books good enjoy

Group 5:
list long way goes ask i'll favorites .... things far

Group 6:
favorite movie book time shows include band foods probably series


9
Group 0:
like i'm re

### Actual: fit with the chosen number of topics

In [12]:
# Number of NMF topics used for the final labelling (the largest count shown
# in the inspection output).
n_topics = 9

In [13]:
# Project helper; judging by the displayed result it returns an integer array
# with one topic label per row of `data` -- TODO confirm.
# NOTE(review): whether the factorization is seeded is not visible here.
labels = nmf_labels(data, n_topics)

In [14]:
# Quick look at the assigned labels (NumPy elides the middle of long arrays).
labels

array([6, 7, 0, ..., 3, 2, 2])

In [15]:
# Attach the topic label to each retained row. assign() builds a new frame
# and rebinds the name, instead of item-assigning into a frame that was
# produced by filtering `df` (the chained-assignment pattern pandas warns
# about). Re-running the cell simply overwrites the column.
df_token_threshold = df_token_threshold.assign(labels=labels)

In [16]:
print('Average Token Count by Cluster (Topic)')
# Group the token counts by assigned topic label, then average within each.
token_counts_by_label = df_token_threshold.groupby('labels')['token_count']
token_counts_by_label.mean()

Average Token Count by Cluster (Topic)


labels
0    428.523290
1    489.737341
2    365.301708
3    374.756663
4    370.328346
5    347.436265
6    368.591274
7    410.525881
8    365.706466
Name: token_count, dtype: float64

In [17]:
print('Distribution of Samples by Cluster (Topic)')
# Count rows per topic label, ordered by label. A Series is built directly
# from the label array rather than via a one-column DataFrame.
pd.Series(labels).value_counts().sort_index()

Distribution of Samples by Cluster (Topic)


0    13761
1    11810
2     4859
3     3452
4     3609
5     3491
6     5226
7     2299
8     2320
dtype: int64