In [1]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [52]:
# Load the labelled reports, keeping only the text column and its label.
df = pd.read_csv('labelled.csv').loc[:, ['report', 'label']]

print(df.shape)
df.head()

(5824, 2)


Unnamed: 0,report,label
0,Heimlich is a Level 1 sex offender and wouldn'...,0
1,Alexy made headlines for all the wrong reasons...,0
2,"The Nationals have acquired Cole twice, first ...",1
3,Signed for an above-slot $2 million as a Natio...,1
4,"It often takes time for those high-ceilinged, ...",0


In [114]:
# Split the report texts into two corpora by label value (1 vs. 0).
pos_docs = df.loc[df['label'] == 1, 'report']
neg_docs = df.loc[df['label'] == 0, 'report']

In [124]:
# Exploratory lookup: prints the CountVectorizer class documentation.
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(sklearn.base.BaseEstimator, VectorizerMixin)
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetc

In [125]:
from sklearn.pipeline import make_pipeline

# Topic model: n-gram counts (unigrams through trigrams) feeding a
# 10-component LDA, fitted on every report regardless of label.
# NOTE(review): stop-word removal (stop_words='english') is deliberately kept
# disabled here to match the original cell. With stop words retained, the top
# terms printed later in the notebook are mostly function words ('that',
# 'for', 'in the', ...); consider enabling it and re-running.
topic_model = make_pipeline(
    CountVectorizer(
        max_df=0.9,
        min_df=2,
        ngram_range=(1, 3),
    ),
    LatentDirichletAllocation(
        n_components=10,
        random_state=42,
    ),
)
topic_model = topic_model.fit(df['report'])

In [118]:
def corpus2vec(docs, transform=np.mean):
    """Collapse a collection of documents into a single topic vector.

    Parameters
    ----------
    docs : iterable of str
        Documents passed to ``topic_model.transform``.
    transform : callable, default ``np.mean``
        Reducer invoked as ``transform(matrix, axis=0)`` on the per-document
        output of the fitted pipeline.

    Returns
    -------
    Whatever ``transform`` returns; with the default, the column-wise mean of
    the transformed documents.
    """
    doc_topic_matrix = topic_model.transform(docs)
    # axis=0 reduces across documents, leaving one value per column.
    corpus_vector = transform(doc_topic_matrix, axis=0)
    return corpus_vector

def closest_topics(docs, n_topics=5):
    """Print the top terms of the topics that weigh most in ``docs``.

    The corpus is reduced to one topic vector with ``corpus2vec`` and the
    ``n_topics`` largest entries are selected. Topics are printed in
    ascending order of weight, so the strongest topic is printed last.

    Parameters
    ----------
    docs : iterable of str
        Documents to summarise.
    n_topics : int, default 5
        Number of topics to print. Values <= 0 print nothing.

    Returns
    -------
    None
        Output is written to stdout, one list of terms per topic.
    """
    if n_topics <= 0:
        # Without this guard ``[-0:]`` would select *every* topic, and a
        # negative count would slice from the front of the ranking.
        return

    ranked_topic_ids = corpus2vec(docs).argsort()
    for topic_id in ranked_topic_ids[-n_topics:]:
        print(fetch_topic(topic_id))
        
        
def fetch_topic(n, n_words=10):
    """Return the highest-weighted vocabulary terms of topic ``n``.

    Parameters
    ----------
    n : int
        Row index into ``components_`` of the pipeline's LDA step.
    n_words : int, default 10
        Number of terms to return. Values <= 0 yield an empty list.

    Returns
    -------
    list
        Up to ``n_words`` terms in ascending order of weight, so the
        strongest term for the topic is the last element.
    """
    if n_words <= 0:
        # A plain ``[-0:]`` slice would return the whole vocabulary.
        return []

    steps = topic_model.named_steps
    lda = steps['latentdirichletallocation']
    cv = steps['countvectorizer']

    # Newer scikit-learn releases expose ``get_feature_names_out``; older ones
    # only have ``get_feature_names``. Either way, fetch the vocabulary once
    # instead of once per returned term.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    top_ids = lda.components_[n].argsort()[-n_words:]
    return [feature_names[i] for i in top_ids]

In [126]:
# Print the dominant topics for each label group under its own banner.
for banner, label_docs in (
    ('\nPositive\n---\n', pos_docs),
    ('\nNegative\n---\n', neg_docs),
):
    print(banner)
    closest_topics(label_docs)


Positive
---

['mph', 'that', 'fastball', 'him', 'at', 'for', 'in the', 'has', 'as', 'with']
['it', 'but', 'him', 'at', 'for', 'in the', 'has', 'that', 'with', 'as']
['an', 'but', 'that', 'in the', 'him', 'for', 'has', 'with', 'at', 'as']
['fastball', 'at', 'it', 'that', 'him', 'for', 'has', 'as', 'in the', 'with']
['but', 'he has', 'power', 'him', 'in the', 'with', 'as', 'for', 'at', 'has']

Negative
---

['mph', 'that', 'fastball', 'him', 'at', 'for', 'in the', 'has', 'as', 'with']
['it', 'but', 'him', 'at', 'for', 'in the', 'has', 'that', 'with', 'as']
['an', 'but', 'that', 'in the', 'him', 'for', 'has', 'with', 'at', 'as']
['fastball', 'at', 'it', 'that', 'him', 'for', 'has', 'as', 'in the', 'with']
['but', 'he has', 'power', 'him', 'in the', 'with', 'as', 'for', 'at', 'has']
