# TWEEM.IO SIMILARITY MODEL

## Model: Multinomial Naive Bayes Text Classifier: v1.0

Uses Multinomial Naive Bayes Classifier  
Corpus is full list of tweets from group / category  
Tweets can be downloaded via a different notebook, and persisted via a different notebook  
Tweet library is vectorized via count vectorizer or tf-idf  
Use of NLP (spacy) to remove stop words, lemmatization, etc during training phases  
Data persisted to local FS.  We can use local info  

### Imports

In [3]:
#  Step the working directory up one level (to the project base) so the
#  assembly framework can init properly.
#  NOTE: the move is relative, so re-running this cell climbs one more level each time.
import os
os.chdir(os.pardir)

In [1]:
import twitter
import gzip
import re
import json
import itertools
import functools
# import spacy
# import spacy_readability
import string
import collections
import datetime
import importlib
import pandas as pd
import numpy as np
import scipy
import pickle

In [3]:
#  Scikit-learn imports
import sklearn
from sklearn import tree as skl_tree
from sklearn import metrics as skl_metrics
from sklearn import model_selection as skl_model_selection
from sklearn import linear_model as skl_linear
from sklearn import naive_bayes as skl_naive_bayes
from sklearn import preprocessing as skl_preprocess
from sklearn import feature_extraction as skl_ftex

In [5]:
import assembly
from assembly import config as asmbl_config
from assembly import db as asmbl_db
from assembly import models as asmbl_models

from lib.tweemio import twm
from lib.tweemio import similarity
from lib.tweemio import twitter

### Inputs / Settings

Settings controlling model calibration

In [12]:
#  Calibration settings read by the cells below.
#  group: key used to look up the screen names under app.config['SIMILARITY_COMPARISONS']
group = 'default'  # name of group (default)
#  condense_factor: passed to timeline_text_construct; consecutive tweets are joined in groups of this size
condense_factor = 2   #  Tweets to combine into a single observation for model training purposes
#  tline_recents: each timeline is cut down to its last tline_recents entries before processing
tline_recents = 400  #  How many recent tweets to evaluate in model
#  tline_suffix: forms part of the timeline file name, '<screen_name>-<tline_suffix>.json'
tline_suffix = '20191224'  #  Last time tweet timeline was pulled 
#  NOTE(review): both directories below are absolute paths on one developer's machine
tline_save_location = '/Users/liangjh/Workspace/tweemio-api/data/timelines'  # Timeline persistence location
model_persist_dir = '/Users/liangjh/Workspace/tweemio-api/data/models'  #  Model persistence location

In [6]:
#  Assembly initialization - will use configs defined for env in question
#  ASSEMBLY_ENV is set before init is called; presumably Assembly reads it to
#  select the matching configuration -- TODO confirm against the Assembly docs
os.environ['ASSEMBLY_ENV'] = 'Development'
#  NOTE(review): the meaning of the second argument ({'default': []}) is not visible
#  from this notebook -- verify against the Assembly.init signature
app = assembly.Assembly.init(__name__, {'default': []})

In [9]:
#  Twitter screen names to evaluate / calibrate (depends on the group)
#  Reads the 'screen_names' entry for the selected group from the app config.
#  Later cells iterate over it and use each item in file names, so it is
#  presumably a list of handle strings -- confirm against the config definition
twitter_handles = app.config['SIMILARITY_COMPARISONS'][group]['screen_names']

### Load Saved Tweet Timelines

In [13]:
#  All timeline JSON saved to file system (as of last retrieval)
#  Load / parse for extraction
#  Result: timeline_map maps each screen name to the parsed content of its
#  '<screen_name>-<tline_suffix>.json' file under tline_save_location
timeline_map = {}
for screen_name in twitter_handles:
    fileloc = f'{tline_save_location}/{screen_name}-{tline_suffix}.json'
    #  Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding
    with open(fileloc, 'r', encoding='utf-8') as infile:
        content = json.load(infile)
    timeline_map[screen_name] = content

In [None]:
#  For every screen name: pull the 'full_text' of each timeline entry, flip the
#  stored order, then keep only the last tline_recents texts of the flipped list
timeline_text = {
    screen_name: [tli['full_text'] for tli in tline][::-1][-tline_recents:]
    for screen_name, tline in timeline_map.items()
}

### Construct Tweet Timeline

In [None]:
#  Remove non-parseable patterns ** adjustable **
def timeline_text_construct(timeline_text: list, filter_regex: str='^(http)', condense_factor: int=1) -> list:
    '''
    Cleans a list of tweet texts and merges consecutive tweets into observations.

    Each tweet is split on whitespace and every word for which
    re.search(filter_regex, word) matches is dropped; tweets left without any
    words are discarded.  The surviving tweets are then joined, in order, in
    groups of condense_factor tweets separated by a single space.

    Parameters:
        timeline_text:    tweet texts (strings), in the order they should be grouped
        filter_regex:     regular expression tested against each whitespace-delimited word
        condense_factor:  number of consecutive tweets merged into one returned string

    Returns:
        List of condensed strings.  A trailing group with fewer than
        condense_factor tweets is dropped, and a condense_factor below 1
        yields an empty list.
    '''
    word_filter = re.compile(filter_regex)

    #  Drop filtered words outright (instead of blanking them) so that no
    #  leading / trailing / doubled spaces are left behind in the output
    cleaned = (
        ' '.join(word for word in text.split() if word_filter.search(word) is None)
        for text in timeline_text
    )
    cleaned = [text for text in cleaned if text]

    #  Group by condense factor (i.e. grouping multiple tweets into single tweet)
    #  The strided slices are the 1st, 2nd, ... member of every group; zip stitches
    #  them back together and stops at the shortest slice (partial group dropped)
    return [
        ' '.join(timeline_group) for timeline_group in
        zip(*[cleaned[n::condense_factor] for n in range(condense_factor)])
    ]

In [None]:
#  Combine all timelines into a single dataframe, for feature extraction
#  Word-prefix patterns, compiled once instead of being re-evaluated per word
strip_pattern = re.compile(r'^(@|#|http)')   #  words starting with @, # or http
mention_pattern = re.compile(r'^(@|#)')      #  words starting with @ or #

tl_dfs = []
for screen_name, tline in timeline_text.items():
    print(f'Processing {screen_name}...')

    #  Parse tweet list: full text, nomention and mention-only
    tline_text = timeline_text_construct(tline, '^(http)', condense_factor)
    #  Words are filtered out (not blanked) so the joined text carries no runs of spaces
    tline_text_nomention = [
        ' '.join(word for word in text.split() if strip_pattern.match(word) is None)
        for text in tline_text
    ]
    tline_text_mention = [
        ' '.join(word for word in text.split() if mention_pattern.match(word) is not None)
        for text in tline_text
    ]

    #  One row per condensed observation; the scalar screen name is broadcast to every row
    tl_df = pd.DataFrame({
        'screenname': screen_name,
        'text': tline_text,
        'text_nomention': tline_text_nomention,
        'text_mention': tline_text_mention,
    })
    tl_dfs.append(tl_df)

#  ignore_index gives the combined frame unique row labels; otherwise each
#  per-screen-name frame would contribute its own 0..n-1 labels
feature_df = pd.concat(tl_dfs, ignore_index=True)

### Evaluate Multinomial NB Model

In [None]:
feature_short_df = feature_df[['screenname', 'text', 'text_nomention', 'text_mention']]

#  As we are evaluating the full dataframe, we can calibrate the word vector once
x = feature_short_df.text_nomention
vect = skl_ftex.text.CountVectorizer().fit(x)

#  Test accuracy of classification for each screen name.
#  One binary (one-vs-rest) Multinomial NB model is fit per screen name and
#  collected in trained_models, keyed by screen name.
trained_models = {}
for screen_name in twitter_handles:
    print(f'Evaluating SN: {screen_name}')

    #  For each screen name, it's either in or out (binary)
    y = feature_short_df['screenname'] == screen_name

    #  Hold out part of the observations so accuracy is measured on unseen text.
    #  This also defines x_train / y_train, which the fit below depends on.
    #  random_state is fixed so repeated runs produce the same split.
    x_train, x_test, y_train, y_test = skl_model_selection.train_test_split(x, y, random_state=1)

    #  1-gram count vectorization on x
    x_train_dtm = vect.transform(x_train)
    x_test_dtm = vect.transform(x_test)

    #  Fit model
    #  NOTE(review): the stored model is fit on the training split only --
    #  confirm whether the persisted model should be refit on all observations
    model_nb = skl_naive_bayes.MultinomialNB()
    model_nb.fit(x_train_dtm, y_train)
    trained_models[screen_name] = model_nb

    #  Report held-out accuracy for this screen name
    accuracy = skl_metrics.accuracy_score(y_test, model_nb.predict(x_test_dtm))
    print(f'  Held-out accuracy: {accuracy:.3f}')

### Persistence / Save Trained Models

Save to local file system.  
Another notebook will handle saving to database.

In [None]:
#  Persist the shared vectorizer plus one model file per screen name.
#  Context managers make sure every file is flushed and closed right after
#  its dump, rather than leaving the handle open until garbage collection.
with open(f'{model_persist_dir}/vectorizer.pik', 'wb') as outfile:
    pickle.dump(vect, outfile)
for screen_name, model_nb in trained_models.items():
    with open(f'{model_persist_dir}/model-{screen_name}.pik', 'wb') as outfile:
        pickle.dump(model_nb, outfile)