In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer # Tokenizer that removes punctuation
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Load the tweet dataset from a CSV in the current working directory.
# Later cells read the columns 'text', 'airline_sentiment', 'airline',
# 'negativereason' and 'tweet_id' from this frame.
df = pd.read_csv('cleaned_tweets.csv')

In [3]:
# Shared text-processing resources, built once and reused by preprocess_text.
# NOTE(review): these calls presumably need the NLTK 'stopwords' and 'wordnet'
# corpora to be available locally (e.g. via nltk.download) — confirm setup.
# English stopword list, held as a set for membership tests
stop_words = set(stopwords.words('english'))
# Tokens are runs of word characters (\w+), so punctuation is dropped
tokenizer = RegexpTokenizer(r'\w+')
# Lemmatizer applied to every token that survives stopword removal
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one tweet into a list of lowercase, lemmatized, non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the float NaN that
        ``pd.read_csv`` produces for an empty cell, or ``None``) is treated as
        having no content.

    Returns
    -------
    list of str
        Tokens produced by the module-level ``tokenizer`` from the lowercased
        text, with entries found in ``stop_words`` dropped and the remaining
        ones passed through ``lemmatizer.lemmatize``. An empty list is
        returned for non-string input.
    """
    # Missing values arrive as float NaN / None, which have no .lower();
    # yield no tokens instead of raising AttributeError in the middle of .apply().
    if not isinstance(text, str):
        return []

    # Tokenize the lowercased text
    tokens = tokenizer.tokenize(text.lower())

    # Stopwords are matched against the raw token, i.e. before lemmatization
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

In [4]:
# Tokenize every tweet; each cell of the new column holds that tweet's token list
df['processed_text'] = df['text'].apply(preprocess_text)
# Preview the first rows of the new column as a sanity check
df['processed_text'].head()

0                      [virginamerica, dhepburn, said]
1    [virginamerica, plus, added, commercial, exper...
2    [virginamerica, today, must, mean, need, take,...
3    [virginamerica, really, aggressive, blast, obn...
4             [virginamerica, really, big, bad, thing]
Name: processed_text, dtype: object

In [5]:
# Size of the longest token list across all tweets
max_length = df['processed_text'].map(len).max()
print(max_length)

26


In [6]:
# Summary statistics of the number of tokens per tweet
df['processed_text'].map(len).describe()

count    14485.000000
mean        10.490369
std          4.132354
min          1.000000
25%          7.000000
50%         11.000000
75%         13.000000
max         26.000000
Name: processed_text, dtype: float64

In [7]:
# Tokens that carry no signal for keyword analysis: airline names/handles
# and tweet/link boilerplate.
words_to_remove = {
    'rt', 'url', 'http',
    'virginamerica', 'jetblue', 'usairways', 'united',
    'southwest', 'southwestair', 'americanair', 'delta',
}

def is_valid_word(word):
    """Return True when ``word`` is acceptable as a keyword.

    A word is accepted only if it is a string of more than two characters,
    its lowercase form is not listed in ``words_to_remove``, it consists
    solely of ASCII letters, and no character occurs three or more times
    in a row. Anything else (including non-string values) gives False.
    """
    # Non-strings and very short tokens are rejected outright
    if not isinstance(word, str) or len(word) <= 2:
        return False
    # Custom junk list is matched case-insensitively
    if word.lower() in words_to_remove:
        return False
    # Letters only: digits, underscores and other symbols disqualify the token
    if re.fullmatch(r'[A-Za-z]+', word) is None:
        return False
    # Reject stretched spellings such as "sooo" (same character 3+ times in a row)
    return re.search(r'(.)\1{2,}', word) is None

# One row per (negative tweet, token): keep only the columns needed downstream,
# then unpack each token list into its own row under the name 'keyword'.
word_counts_grouped = (
    df.loc[
        df['airline_sentiment'].eq('negative'),
        ['airline', 'negativereason', 'tweet_id', 'processed_text'],
    ]
    .explode('processed_text')
    .rename(columns={'processed_text': 'keyword'})
)

In [None]:
# Drop missing and invalid keywords, then count how often each keyword occurs
# per airline and negative reason.
# NOTE: this rebinds word_counts_grouped from token rows to aggregated counts,
# so running this cell a second time would count the aggregated rows instead.
word_counts_grouped = (
    word_counts_grouped
    .dropna(subset=['keyword'])
    .pipe(lambda rows: rows[rows['keyword'].apply(is_valid_word)])
    .groupby(['keyword', 'airline', 'negativereason'])
    .size()
    .reset_index(name='count')
)

In [9]:
# Keep only (keyword, airline, negativereason) combinations counted more than once
word_counts_grouped = word_counts_grouped.loc[word_counts_grouped['count'].gt(1)]

In [10]:
# Show the filtered keyword counts as this cell's output
word_counts_grouped

Unnamed: 0,keyword,airline,negativereason,count
3,aadvantage,American,Customer Service Issue,2
4,aadvantage,US Airways,Customer Service Issue,2
16,abc,Delta,Bad Flight,2
31,able,American,Cancelled Flight,3
32,able,American,Customer Service Issue,12
...,...,...,...,...
33986,zero,US Airways,Customer Service Issue,3
33989,zero,United,Bad Flight,2
33990,zero,United,Can't Tell,2
33991,zero,United,Customer Service Issue,4


In [11]:
# Largest count among the remaining keyword groups, as a plain Python int
int(word_counts_grouped['count'].max())

288

In [12]:
# Write the keyword counts to keywords.csv in the working directory;
# index=False leaves the DataFrame's row index out of the file.
word_counts_grouped.to_csv('keywords.csv', index=False)