# Assignment #1 - Isaac Braun
Four (4) interesting findings on the [Twitter US Airline Sentiment](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment?resource=download) data using [spaCy](spacy.io).

## Setup

In [147]:
# Import packages and load model
import spacy
from collections import Counter
from spacy.matcher import Matcher
import pandas as pd

nlp = spacy.load("en_core_web_sm")

# Load Data
raw = pd.read_csv('data/airline_tweets_raw.csv')
# Limit tweets to 9000 rows and convert to list
limited = raw.iloc[0:9000].text.tolist()

docs = [nlp(tweet) for tweet in limited]

# Concat tweets into one string, limit to 9000 rows (avoiding spacy memory limit)
# tweets = " ".join(raw.iloc[0:9000].text)

# Load into Spacy Pipeline
# doc = nlp(tweets)

# #1 - Find Most Common Tokens

In [145]:
# Strip out stop words, punctuation, numbers, and spaces
words = [token.text for doc in docs for token in doc if token.is_stop != True and token.is_punct != True and token.is_digit != True and token.is_space != True]
# words = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.is_digit != True and token.is_space != True]

# Count the most common words
common_words = Counter(words).most_common(20)
common_words

[('@united', 3703),
 ('@SouthwestAir', 2379),
 ('@JetBlue', 2167),
 ('flight', 1830),
 ('Cancelled', 521),
 ('service', 508),
 ('@VirginAmerica', 491),
 ('time', 480),
 ('Flight', 430),
 ('flights', 400),
 ('customer', 388),
 ('help', 368),
 ('amp', 363),
 ('plane', 349),
 ('Thanks', 348),
 ('thanks', 329),
 ('delayed', 307),
 ('hours', 296),
 ('bag', 289),
 ('$', 271)]


# Find Global Mentions
Find how many tweets have a period ('.') before the first account mention ('@'), with an optional space (' ') in between

In [176]:
matcher = Matcher(nlp.vocab)

mention_pattern = [
    [{'TEXT': {'REGEX': '\.'}}, {'TEXT': {'REGEX': '@'}}],
    [{'TEXT': {'REGEX': '\.'}}, {'TEXT': {'REGEX': ' '}}, {'TEXT': {'REGEX': '@'}}]
]

matcher.add("GlobalMention", mention_pattern)

matched_docs = []

for doc in docs:
    matches = matcher(doc)
    
    # If matches found, append doc to matched_docs with matches
    if matches:
        matched_docs.append({
            'doc': doc,
            'matches': matches
        })

print(f"Number of Tweets with Global Mentions found: {len(matched_docs)}\n")

# Output sample of matched docs
for i in range(0, 10):
    for match_id, start, end in matched_docs[i]['matches']:
        span = matched_docs[i]['doc'][start:end]
        print(span.text)


Number of Tweets with Global Mentions found: 64

. @RenttheRunway
http://t.co/FVUdmh27pF @TheNationalUAE
http://t.co/DCoBoKN7EE @TheNationalUAE
http://t.co/vw4P4T4tLh @TheNationalUAE
. @flysaa
. @imran_r44
. @SouthwestAir
. @seanMFmadden
. @reagan
. @BA_USA


In [138]:
# spacy.displacy.render(nlp(str(words)), style='dep', jupyter=True)

In [177]:
# Render dependency tree for first 10 docs
for i in range(0, 10):
    spacy.displacy.render(matched_docs[i]['doc'], style='dep', jupyter=True)