In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import pickle
from pathlib import Path
import codecs
import os

import re, nltk, spacy, gensim

# Load the small English spaCy pipeline with the parser and NER components disabled;
# process_text below only reads token.pos_ and token.lemma_ from the result.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Set up the project directory variables for global use.
# Every location hangs off one project root so it only has to be changed in one place.
project_dir = Path.home() / "Desktop" / "bah-intermediate" / "CAPSTONE"

data_dir = project_dir / "data"
test_data = project_dir / "test_data"
viz_dir = project_dir / "visualizations"
model_dir = project_dir / "models"

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, instead of being left open for the life of the kernel.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts that were produced by a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts saved under model_dir: the fitted topic model, the document-term
# matrix, the vectorizer, and two data frames describing documents and topics.
# NOTE(review): descriptions are inferred from the file names -- confirm
# against the notebook that wrote these pickles.
model = _load_pickle(model_dir / "optimal_fit_model_5topics_070321212952.pickle")
data_vectorized = _load_pickle(model_dir / "optimal_dtm_5topics_070321212952.pickle")
vectorizer = _load_pickle(model_dir / "optimal_vec_5topics_070321212952.pickle")
df_document_topic = _load_pickle(model_dir / "df_document_topic_070321212946.pickle")
df_topic_keywords = _load_pickle(model_dir / "df_topic_keywords_070321212946.pickle")

In [3]:
def get_text(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are silently dropped (``errors='ignore'``)
    rather than raising a decoding error.
    """
    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        return source.read()

In [4]:
def process_text(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Clean, tokenize and lemmatize a raw document.

    Processing steps, in order:
      1. Remove every whitespace-delimited chunk containing ``@`` (e.g. e-mail
         addresses), together with one trailing whitespace character.
      2. Collapse each run of whitespace into a single space.
      3. Remove apostrophes.
      4. Tokenize with ``gensim.utils.simple_preprocess`` (tokens of 2-15
         characters, accents stripped via ``deacc=True``).
      5. Run the re-joined tokens through the module-level spaCy pipeline
         ``nlp`` and keep the lemma of every token whose part-of-speech tag
         is in ``allowed_postags``.

    Args:
        text: Raw document text.
        allowed_postags: Collection of spaCy ``pos_`` tags to keep. Defaults
            to nouns, adjectives, verbs and adverbs. The default is a tuple so
            it cannot be mutated between calls.

    Returns:
        A one-element list holding the kept lemmas joined by single spaces.
        A kept token whose lemma is the ``'-PRON-'`` placeholder contributes
        an empty string instead of being dropped, so the result can contain
        consecutive spaces.
    """
    without_emails = re.sub(r'\S*@\S*\s?', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_emails)
    without_quotes = re.sub(r"\'", "", single_spaced)

    tokens = gensim.utils.simple_preprocess(without_quotes, min_len=2, max_len=15, deacc=True)

    doc = nlp(' '.join(tokens))

    lemmas = [
        '' if token.lemma_ == '-PRON-' else token.lemma_
        for token in doc
        if token.pos_ in allowed_postags
    ]

    # Wrapped in a list because the result is handed to vectorizer.transform,
    # which is called with this list as its collection of documents.
    return [' '.join(lemmas)]

In [5]:
# Read one sample article from disk and run it through process_text.
sample_article_path = './test_data/test_article10.txt'
text = get_text(sample_article_path)
text2 = process_text(text)

# Last expression in the cell: display the processed document.
text2

['air vice marshal rich maddison be senior raf officer decade fly experience air force be as high tech get be just be refer miniature computer black lime green screen miniscule memory use aa battery power design be psion device avm maddison represent personal aviation history date device be where keep own fly log hail era when computer come own programming language psion invite user tinker limited application take field address book convert resemble pilot logbook pilot record flight column list such detail date aircraft type crew name purpose flight route fly avm maddison be also issue physical logbook psion allow build automatic monthly summary flying hour be use tell how many hour d achieve particular aircraft colleague today raf use own advanced fly program compile datum lack personal bond avm maddison have psion m third physical logbook s get sign year get lot more psion psion image last month rich maddison psion aa battery only need replace couple month so do have worry recharge c

In [6]:
# Transform the processed text with the vectorizer loaded from the pickle above;
# the result is the input to model.transform in the next cell.
text3 = vectorizer.transform(text2)

In [7]:
# Step 4: LDA Transform
# Score the vectorized document against the topics of the loaded model.
topic_probability_scores = model.transform(text3)

# np.argmax without an axis returns a position in the flattened array.
# The displayed scores have a single row, so here that position is the
# index of the highest-scoring topic.
dominant_topic_idx = np.argmax(topic_probability_scores)

# Values from columns 1 through 13 of the row selected in df_topic_keywords.
# NOTE(review): the displayed result ends with the same value as infer_topic,
# so this slice appears to include the last (label) column -- confirm intended.
topic = df_topic_keywords.iloc[dominant_topic_idx, 1:14].values.tolist()

# Step 5: Infer Topic
# The last column of the selected row is used as the topic label.
infer_topic = df_topic_keywords.iloc[dominant_topic_idx, -1]

In [8]:
# Display the per-topic scores returned by model.transform for the sample document.
topic_probability_scores

array([[0.01694044, 0.2369161 , 0.01680271, 0.01728291, 0.71205784]])

In [9]:
# Display the label taken from the last column of the highest-scoring topic's row.
infer_topic

'technology'

In [10]:
# Display the values pulled from df_topic_keywords for the highest-scoring topic.
topic

['mobile phone',
 'chief executive',
 'tell bbc',
 'say would',
 'bbc news',
 'tell bbc news',
 'camera phone',
 'new york',
 'news website',
 'technology']

In [None]:
# Count how many rows of df_document_topic fall under each dominant_topic value.
# NOTE(review): assumes df_document_topic is a DataFrame with a
# 'dominant_topic' column -- confirm against the notebook that pickled it.
df_document_topic["dominant_topic"].value_counts()