# Assignment #1 - Isaac Braun
Four (4) interesting findings on the [Twitter US Airline Sentiment](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment?resource=download) data using [spaCy](spacy.io).

## Setup

In [33]:
# Import packages and load model
import spacy
from collections import Counter
from spacy.matcher import Matcher
from spacy.tokens import Doc
import pandas as pd

nlp = spacy.load("en_core_web_md")

# Load Data
raw = pd.read_csv('data/airline_tweets_raw.csv')
# Limit tweets to 9000 rows and convert to list
limited = raw.iloc[0:9000].text.tolist()

docs = [nlp(tweet) for tweet in limited]

# #1 - Find Most Common Tokens

In [34]:
# Strip out stop words, punctuation, numbers, and spaces
words = [token.text for doc in docs for token in doc if token.is_stop != True and token.is_punct != True and token.is_digit != True and token.is_space != True]
# words = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.is_digit != True and token.is_space != True]

# Count the most common words
common_words = Counter(words).most_common(20)
common_words

[('@united', 3703),
 ('@SouthwestAir', 2379),
 ('@JetBlue', 2167),
 ('flight', 1830),
 ('Cancelled', 521),
 ('service', 508),
 ('@VirginAmerica', 491),
 ('time', 480),
 ('Flight', 430),
 ('flights', 400),
 ('customer', 388),
 ('help', 368),
 ('amp', 363),
 ('plane', 349),
 ('Thanks', 348),
 ('thanks', 329),
 ('delayed', 307),
 ('hours', 296),
 ('bag', 289),
 ('$', 271)]


# #2 - Find Global Mentions
Find how many tweets have a period ('.') before the first account mention ('@'), with an optional space (' ') in between

In [38]:
global_matcher = Matcher(nlp.vocab)

mention_pattern = [
    [{'TEXT': {'REGEX': '\.'}}, {'TEXT': {'REGEX': '@'}}],
    [{'TEXT': {'REGEX': '\.'}}, {'TEXT': {'REGEX': ' '}}, {'TEXT': {'REGEX': '@'}}]
]

global_matcher.add("GlobalMention", mention_pattern)

matched_docs = []

for doc in docs:
    matches = global_matcher(doc)
    
    # If matches found, append doc to matched_docs with matches
    if matches:
        matched_docs.append({
            'doc': doc,
            'matches': matches
        })

print(f"Number of Tweets with Global Mentions found: {len(matched_docs)}\n")

# Output sample of matched docs
for i in range(0, 10):
    for match_id, start, end in matched_docs[i]['matches']:
        span = matched_docs[i]['doc'][start:end]
        print(span.text)


Number of Tweets with Global Mentions found: 64

. @RenttheRunway
http://t.co/FVUdmh27pF @TheNationalUAE
http://t.co/DCoBoKN7EE @TheNationalUAE
http://t.co/vw4P4T4tLh @TheNationalUAE
. @flysaa
. @imran_r44
. @SouthwestAir
. @seanMFmadden
. @reagan
. @BA_USA


# #3 - Find most similar tweets that describe airline service
Match tweets (docs) that have an adjective before the word 'service' and then compare their similarity

In [80]:
# Find phrases with adjective before the noun "service"
def service_matches():
    matcher = Matcher(nlp.vocab)
    
    service_pattern = [
        [{'POS': 'ADJ'}, {'LOWER': 'service'}]
    ]

    matcher.add("ServicePattern", service_pattern)

    matched_docs = []

    for doc in docs:
        matches = matcher(doc)
        
        # If matches found, append doc to matched_docs with matches
        if matches:
            matched_docs.append(doc)
            
    return matched_docs

# Find most similar docs from matched_docs
def most_similar(docs):
    best_match = {}
    
    def test_similarity(best_match, doc1, doc2):
        similarity = doc1.similarity(doc2)
       
        if similarity > best_match.get('similarity', 0): 
            return {
                'doc1': doc1,
                'doc2': doc2,
                'similarity': similarity
            }
        else:
            return best_match

    doc_len = len(docs)
    
    for idx in range(0, doc_len):
        for jdx in range(0, doc_len):
            # Skip if docs are the same
            if idx != jdx:
                best_match = test_similarity(best_match, docs[idx], docs[jdx])
    
    return best_match

matched_docs = service_matches()
print(f"\nNumber of Tweets with Adjective before 'service' found: {len(matched_docs)}\n\n")

best_match = most_similar(matched_docs)
print(f"Best Matched Docs Similarity: {best_match['similarity']}\n")
print(f"Doc 1: {best_match['doc1']}\n")
print(f"Doc 2: {best_match['doc2']}\n")
    


Number of Tweets with Adjective before 'service' found: 87


Best Matched Docs Similarity: 0.923819155727016

Doc 1: @united WORST SERVICE EVER. Denied access to our flight and then moved flight 6 times. How hard is it to schedule a gate? @Delta next time.

Doc 2: @united flt 1583 EWR to SFO excellent service. Friendly flight attendants. made the 6AM flight a very good start to the day.



# #4 - What adjectives are dependencies of an airline handle?

In [77]:
# Find adjective dependencies for a given query
def top_relations(query):
    relations = Counter()
    
    for doc in docs:
        for token in doc:
            if token.dep_ == 'acomp' and token.head.text == query:
                relations[token.text] += 1
        
    return relations.most_common(20)
    
# Output top dependency relations
for query in ['@VirginAmerica', '@Delta', '@AmericanAir', '@SouthwestAir', '@JetBlue', '@united']:
    print(query, ': ', top_relations(query), '\n')

@VirginAmerica :  [] 

@Delta :  [] 

@AmericanAir :  [] 

@SouthwestAir :  [] 

@JetBlue :  [('awesome', 2), ('great', 1), ('full', 1), ('sway', 1), ('correct', 1), ('sorry', 1), ('ridiculous', 1), ('lucky', 1), ('busy', 1)] 

@united :  [('tired', 1)] 

