In [2]:
import numpy as np
import os
import json
import pickle
import itertools
from tqdm import tqdm
import glob
import pandas as pd

C:\Users\habu8\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\habu8\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [3]:
from transformers import AutoTokenizer, DistilBertTokenizer
from transformers import AutoModelForTokenClassification, DistilBertForSequenceClassification
from transformers import pipeline

from sentence_transformers import SentenceTransformer, util

from scipy import spatial
tqdm.pandas()

In [4]:
#Read in files; download from S3 handled via cmd line
def read_files(path):
    files_glob=glob.glob(path)
    file_collection=[]
    for f in files_glob:
        file_collection.append(pd.read_parquet(f))
    file_collection=pd.concat(file_collection,axis=0)
    return file_collection

path=os.getcwd()

news_files=path+'/news/*.parquet'
news=read_files(news_files)
print(news.shape)

social_files=path+'/social/*.parquet'
social=read_files(social_files)
print(social.shape)

blog_files=path+'/blog/*.parquet'
blog=read_files(blog_files)
print(blog.shape)

(132757, 12)
(85665, 12)
(2726, 10)


In [5]:
#Not all records have text in their bodies but their headlines/titles can still hold useful info
news['all_text']=news.apply(lambda x: f"{str(x['headline'])}. {str(x['summary'])}. {str(x['body'])}", axis=1)
social['all_text']=social.apply(lambda x: f"{str(x['title'])}. {str(x['text'])}", axis=1)
blog['all_text']=blog.apply(lambda x: f"{str(x['title'])}. {str(x['body'])}", axis=1)

In [8]:
#get some information on text length
def count_text_elements(df, text_col, tokenizer):
    df['word_count']=df[text_col].progress_map(lambda x: len(x.split(' ')))
    df['sent_count']=df[text_col].progress_map(lambda x: len(x.split('.')))
    df['token_count']=df[text_col].progress_map(lambda x: len(tokenizer(x)['input_ids']))
    return df

mpnet_tokenizer=AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

news=count_text_elements(news, 'all_text', mpnet_tokenizer)
social=count_text_elements(social, 'all_text', mpnet_tokenizer)
blog=count_text_elements(blog, 'all_text', mpnet_tokenizer)

minilm_tokenizer=AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', max_length=384)

100%|██████████| 85665/85665 [00:00<00:00, 245007.39it/s]
100%|██████████| 85665/85665 [00:00<00:00, 723690.41it/s]
  0%|          | 0/85665 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 85665/85665 [00:24<00:00, 3487.46it/s]
100%|██████████| 2726/2726 [00:00<00:00, 11071.86it/s]
100%|██████████| 2726/2726 [00:00<00:00, 53430.62it/s]
100%|██████████| 2726/2726 [00:22<00:00, 122.91it/s]


### NER processing

In [9]:
#Leverage NER to identify records dealing with individual airlines and the industry in general
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy='simple', device=0)

In [10]:
def ner_scan(texts_list, tokenizer, model, aggregation_strategy='simple', chunk_size=100, outfile=None, device=-1):
    ner_results = []
    chunk_size=chunk_size
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy=aggregation_strategy, device=device)
    for chunk in tqdm(range(len(texts_list) // chunk_size + 1), position=0, leave=True):
        texts = texts_list[chunk_size * chunk: min(chunk_size * (chunk+1),len(texts_list))]
        res = ner_pipeline(texts)
        ner_results += res
    if outfile != None:
        with open(outfile,'wb') as f:
            pickle.dump(ner_results, f)
    return ner_results

news_text=news['all_text'].to_list()
news['ner_results']=ner_scan(news_text, ner_tokenizer, ner_model, device=0)
news['entities']=news['ner_results'].map(lambda x: [(y['word'],y['entity_group'],y['score']) for y in x])
news['orgs']=news['entities'].map(lambda x: [y[0] for y in x if y[1]=='ORG'])

social_text=social['all_text'].to_list()
social['ner_results']=ner_scan(social_text, ner_tokenizer, ner_model, device=0)
social['entities']=social['ner_results'].map(lambda x: [(y['word'],y['entity_group'],y['score']) for y in x])
social['orgs']=social['entities'].map(lambda x: [y[0] for y in x if y[1]=='ORG'])

blog_text=blog['all_text'].to_list()
blog['ner_results']=ner_scan(blog_text, ner_tokenizer, ner_model, device=0)
blog['entities']=blog['ner_results'].map(lambda x: [(y['word'],y['entity_group'],y['score']) for y in x])
blog['orgs']=blog['entities'].map(lambda x: [y[0] for y in x if y[1]=='ORG'])

100%|██████████| 857/857 [11:08<00:00,  1.28it/s]
100%|██████████| 28/28 [01:14<00:00,  2.65s/it]


In [23]:
#Embeddings for search 
#--- we are doing this in order to avoid having to compute embeddings for each record, one at a time during the search process 
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

news_orgs=list(itertools.chain.from_iterable(news['orgs'].to_list()))
blog_orgs=list(itertools.chain.from_iterable(blog['orgs'].to_list()))
social_orgs=list(itertools.chain.from_iterable(social['orgs'].to_list()))
orgs=list(set(news_orgs+blog_orgs+social_orgs))
org_embeddings={}
chunk_size=100
for chunk in tqdm(range(len(orgs) // chunk_size + 1), position=0, leave=True):
    texts = orgs[chunk_size * chunk: min(chunk_size * (chunk+1),len(orgs))]
    for org in texts:
        org_embeddings[org]=embedding_model.encode(org)
with open('org_embeddings.pickle','wb') as f:
    pickle.dump(org_embeddings, f)

100%|██████████| 2494/2494 [29:23<00:00,  1.41it/s]


In [12]:
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

us_domestic_airlines=['united airlines',
'southwest airlines',
'american airlines',
'spirit airlines',
'delta air lines',
'frontier airlines',
'alaska airlines',
'delta air lines',
'hawaiian airlines',
'jetblue airways',
'allegiant air',
'envoy air',
'republic airways',
'skywest airlines']
us_domestic_airlines_embeddings=[embedding_model.encode(x) for x in us_domestic_airlines]

with open('org_embeddings.pickle','rb') as f:
    org_embeddings=pickle.load(f)

#Run a comparison between all orgs identified by the NER search and curated list of domestic airline names
#Leverage embedings to avoid pitfalls of substring searches
def search_for_airlines(entities_list,airline_embeddings):
    airline_embeddings=airline_embeddings
    orgs=[x[0] for x in entities_list if x[1]=='ORG']
    max_score=0
    max_score_org=''
    exact_matches=[]
    for org in orgs:
        org_embed=org_embeddings[org]
        for i in range(len(airline_embeddings)):
            score=util.pytorch_cos_sim(airline_embeddings[i], org_embed)[0][0].item()
            if score>0.99:
                exact_matches.append(airline_embeddings[i])
            if score>max_score:
                max_score_org=org
                max_score=score
    return (np.round(max_score,2), max_score_org, exact_matches)

news['airline_search']=news['entities'].progress_map(lambda x: search_for_airlines(x, us_domestic_airlines_embeddings))
news['airline_match_score']=news['airline_search'].map(lambda x: x[0])
news['best_match_org']=news['airline_search'].map(lambda x: x[1])

blog['airline_search']=blog['entities'].progress_map(lambda x: search_for_airlines(x, us_domestic_airlines_embeddings))
blog['airline_match_score']=blog['airline_search'].map(lambda x: x[0])
blog['best_match_org']=blog['airline_search'].map(lambda x: x[1])

social['airline_search']=social['entities'].progress_map(lambda x: search_for_airlines(x, us_domestic_airlines_embeddings))
social['airline_match_score']=social['airline_search'].map(lambda x: x[0])
social['best_match_org']=social['airline_search'].map(lambda x: x[1])

#Make identifier column for each US carrier to assist with later analysis and slicing
for airline in us_domestic_airlines:
   # news[airline]=news['airline_search'].map(lambda x: 1 if airline in x[1].lower() else 0)
    blog[airline]=blog['airline_search'].map(lambda x: 1 if airline in x[1].lower() else 0)
    social[airline]=social['airline_search'].map(lambda x: 1 if airline in x[1].lower() else 0)

100%|██████████| 2726/2726 [00:12<00:00, 224.12it/s]
100%|██████████| 85665/85665 [02:36<00:00, 546.31it/s]


In [28]:
print(news.shape)
print(social.shape)
print(blog.shape)

(132757, 35)
(85665, 35)
(2726, 33)


In [22]:
#data save checkpoint
with open('news.pickle', 'wb') as f:
    pickle.dump(news, f)
with open('blog.pickle', 'wb') as f:
    pickle.dump(blog, f)
with open('social.pickle', 'wb') as f:
    pickle.dump(social,f)

### Embeddings for downstream

In [13]:
#Get a smaller embedding space for quicker compute downstream
minilm_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
mpnet_embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [31]:
#Filter out non-airline related records and sources
exclude_news_source=['MDPI',#peer-reviewd journal
                     'Nature',#peer-reviewd journal
                     'Moviebill',#? movie time listing? but the articles don't seem to line up
                     'legacy.com',#obituaries website
                     'thirstyhorseway.biz',#?
                     'hotnigerianjobs.com',#job board
                     'members.avjobs.com',#job board
                     'eBay'#ebay model airplane listings and such
                    ]
news_airlines=news[news.apply(lambda x: x['language']=='en' and x['airline_match_score']>0.75
                              and x['source'] not in exclude_news_source, axis=1)]
news_airlines['minilm_embeddings']=news_airlines.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
news_airlines['mpnet_embeddings']=news_airlines.progress_apply(lambda x: mpnet_embedding_model.encode(x['all_text']), axis=1)
news_airlines.shape

100%|██████████| 16029/16029 [01:41<00:00, 158.14it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_airlines['minilm_embeddings']=news_airlines.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
100%|██████████| 16029/16029 [03:52<00:00, 68.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_airlines['mpnet_embeddings']=news_airlines.progress_apply(lambda x: mpnet_embedding_model.encode(x['all_text']), axis=1)


(16029, 37)

In [14]:
blog_airlines=blog[blog.apply(lambda x: x['language']=='en' and x['airline_match_score']>0.75, axis=1)]
blog_airlines['mpnet_embeddings']=blog_airlines.progress_apply(lambda x: mpnet_embedding_model.encode(x['all_text']), axis=1)
blog_airlines['minilm_embeddings']=blog_airlines.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
blog_airlines.shape

100%|██████████| 234/234 [00:03<00:00, 67.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  blog_airlines['mpnet_embeddings']=blog_airlines.progress_apply(lambda x: mpnet_embedding_model.encode(x['all_text']), axis=1)
100%|██████████| 234/234 [00:01<00:00, 181.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  blog_airlines['minilm_embeddings']=blog_airlines.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)


(234, 35)

In [15]:
exclude_social=['laxradar - Twitter',#automated twitter account, not valuable for analysis
                'sarpy_spotter',#automated twitter account,
                'centralspotter',#automated twitter account
                'AboveStLouis',#automated twitter account
                'RI_Aircraft',#automated twitter account
                'LHRFlightBot',#automated twitter account
                'skyoverhavant',#automated twitter account
                'whats_above_SE1',#automated twitter account
                'abovestockport'#automated twitter account
                'LAS Runways - YouTube']

social_airlines=social[social.apply(lambda x: x['language']=='en' and x['airline_match_score']>0.75
                                    and x['source'] not in exclude_social, axis=1)]
social_airlines['mpnet_embeddings']=social_airlines.progress_apply(lambda x: mpnet_embedding_model.encode(x['all_text']), axis=1)
social_airlines['minilm_embeddings']=social_airlines.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
social_airlines.shape

100%|██████████| 23314/23314 [03:26<00:00, 112.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_airlines['mpnet_embeddings']=social_airlines.progress_apply(lambda x: mpnet_embedding_model.encode(x['all_text']), axis=1)
100%|██████████| 23314/23314 [01:36<00:00, 242.83it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_airlines['minilm_embeddings']=social_airlines.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)


(23314, 37)

In [16]:
#news['minilm_embeddings']=news.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
social['minilm_embeddings']=social.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
blog['minilm_embeddings']=blog.progress_apply(lambda x: minilm_embedding_model.encode(x['all_text']), axis=1)
data save checkpoint
with open('news.pickle', 'wb') as f:
    pickle.dump(news, f)
with open('blog.pickle', 'wb') as f:
    pickle.dump('blog', f)
with open('social.pickle', 'wb') as f:
    pickle.dump('social.pickle',f)

100%|██████████| 85665/85665 [06:01<00:00, 237.11it/s]
100%|██████████| 2726/2726 [00:38<00:00, 70.65it/s] 


### Sentiment Analysis

In [17]:
sent_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}
sentiment_task = pipeline("sentiment-analysis", model='cardiffnlp/twitter-roberta-base-sentiment-latest', tokenizer=sent_tokenizer,device=0)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
#news_airlines['sentiment']=news_airlines['all_text'].progress_map(lambda x: sentiment_task(x,return_all_scores=True,**tokenizer_kwargs))
social_airlines['sentiment']=social_airlines['all_text'].progress_map(lambda x: sentiment_task(x,return_all_scores=True,**tokenizer_kwargs))
blog_airlines['sentiment']=blog_airlines['all_text'].progress_map(lambda x: sentiment_task(x,return_all_scores=True,**tokenizer_kwargs))

100%|██████████| 23314/23314 [02:17<00:00, 170.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_airlines['sentiment']=social_airlines['all_text'].progress_map(lambda x: sentiment_task(x,return_all_scores=True,**tokenizer_kwargs))
100%|██████████| 234/234 [00:02<00:00, 91.12it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  blog_airlines['sentiment']=blog_airlines['all_text'].progress_map(lambda x: sentiment_task(x,return_all_scores=True,**tokenizer_kwargs))


In [19]:
with open('news.pickle', 'wb') as f:
    pickle.dump(news, f)
with open('blog.pickle', 'wb') as f:
    pickle.dump(blog, f)
with open('social.pickle', 'wb') as f:
    pickle.dump(social,f)