## Dataset:
https://www.kaggle.com/chrisbellec/airlines-tweets-sentiments/data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the airline tweets from the local CSV copy of the dataset.
# (A comma is already the default field separator for read_csv.)
tweets = pd.read_csv('./twitter-airline-sentiment/Tweets.csv')

In [None]:
# Show the first 10 rows to get a feel for the available columns.
tweets.head(10)

In [None]:
# Boolean masks selecting each sentiment class.
# Exact equality is used rather than str.contains(): contains() performs a
# regex/substring search and propagates NaN for missing values, which yields
# a mask that cannot be used for boolean indexing.
is_positive = tweets['airline_sentiment'] == "positive"
is_negative = tweets['airline_sentiment'] == "negative"
is_neutral = tweets['airline_sentiment'] == "neutral"

### Some statistics about airlines:

In [None]:
# Rows labelled positive, followed by their (rows, columns) count.
positive_tweets = tweets.loc[is_positive]
positive_tweets.shape

In [None]:
# Rows labelled negative, followed by their (rows, columns) count.
negative_tweets = tweets.loc[is_negative]
negative_tweets.shape

In [None]:
# Rows labelled neutral, followed by their (rows, columns) count.
neutral_tweets = tweets.loc[is_neutral]
neutral_tweets.shape

In [None]:
# Keep only the columns needed to rank airlines by negative tweets.
worst_airline = negative_tweets.loc[
    :, ['airline', 'airline_sentiment_confidence', 'negativereason']
]
worst_airline

In [None]:
# Rank airlines by number of negative tweets: count() tallies the non-null
# values of every remaining column per airline, then the table is ordered
# from the largest count to the smallest.
cnt_worst_airline = (
    worst_airline
    .groupby('airline', as_index=False)
    .count()
)
cnt_worst_airline.sort_values(by='negativereason', ascending=False)

In [None]:
# Rank airlines by number of positive tweets, largest count first.
best_airline = positive_tweets.loc[:, ['airline', 'airline_sentiment_confidence']]
cnt_best_airline = (
    best_airline
    .groupby('airline', as_index=False)
    .count()
)
cnt_best_airline.sort_values(by='airline_sentiment_confidence', ascending=False)

In [None]:
# Rank the negative reasons by how often they occur.
motivation = negative_tweets[['airline', 'negativereason']]
cnt_bad_flight_motivation = motivation.groupby('negativereason', as_index=False).count()
# After count(), the 'airline' column holds the number of tweets per reason.
# Sorting on it (instead of on the reason label, which only orders the rows
# alphabetically) produces the actual ranking.
cnt_bad_flight_motivation.sort_values('airline', ascending=False)

## Sentiment classification of tweets

In [None]:
import nltk
# Fetch the NLTK resources needed later: the "punkt" tokenizer data and the
# stop-word lists.
nltk.download("punkt")
nltk.download("stopwords")

import string
# Display the punctuation characters that are later added to the ignore list.
string.punctuation

In [None]:
# Tokens to ignore when building features: the English stop words followed
# by every individual punctuation character.
useless_words = [*nltk.corpus.stopwords.words("english"), *string.punctuation]

In [None]:
def build_bag_of_words_features_filtered(words, stop_words=None):
    """Build a bag-of-words feature dict from a sequence of tokens.

    Args:
        words: Iterable of tokens.
        stop_words: Optional container of tokens to drop. When omitted
            (``None``), the module-level ``useless_words`` collection is used,
            which is the original behaviour.

    Returns:
        A dict mapping every kept token to ``1``. Repeated tokens collapse
        into a single key.
    """
    excluded = useless_words if stop_words is None else stop_words
    return {word: 1 for word in words if word not in excluded}

### Tokenizing:

In [None]:
# Split the text of every negative tweet into a list of tokens.
tokenized_negative_tweets = [
    nltk.word_tokenize(tweet_text) for tweet_text in negative_tweets['text']
]

tokenized_negative_tweets

In [None]:
# Pair each negative tweet's bag of words with the 'neg' label.
negative_features = [
    (build_bag_of_words_features_filtered(tokens), 'neg')
    for tokens in tokenized_negative_tweets
]

In [None]:
# Dump every (features, label) pair for inspection; one entry per negative tweet.
print(negative_features)

In [None]:
# Split the text of every positive tweet into a list of tokens.
tokenized_positive_tweets = [
    nltk.word_tokenize(tweet_text) for tweet_text in positive_tweets['text']
]

tokenized_positive_tweets

In [None]:
# Pair each positive tweet's bag of words with the 'pos' label.
positive_features = [
    (build_bag_of_words_features_filtered(tokens), 'pos')
    for tokens in tokenized_positive_tweets
]

In [None]:
# Neutral features: split the text of every neutral tweet into tokens.
tokenized_neutral_tweets = [
    nltk.word_tokenize(tweet_text) for tweet_text in neutral_tweets['text']
]

tokenized_neutral_tweets


In [None]:
# Pair each neutral tweet's bag of words with the 'neu' label.
neutral_features = [
    (build_bag_of_words_features_filtered(tokens), 'neu')
    for tokens in tokenized_neutral_tweets
]

### Training classifier

In [2]:
from nltk.classify import NaiveBayesClassifier

In [None]:
# Report the class sizes. Bare len() expressions in the middle of a cell are
# evaluated and thrown away, so print them explicitly.
print(len(negative_features), len(positive_features))

# Train on the first `split` examples requested from each class.
split = 2000
training_features = positive_features[:split] + negative_features[:split]
sentiment_classifier = NaiveBayesClassifier.train(training_features)

# Accuracy (in percent) measured on the training examples themselves; this is
# a sanity check, not an estimate of performance on unseen tweets.
nltk.classify.util.accuracy(sentiment_classifier, training_features)*100


In [None]:
# Hold-out evaluation: the positives left over after training plus a run of
# negatives starting at the same offset. The upper bound is taken from the
# number of positive examples instead of the hard-coded 2363 (presumably that
# same count for this dataset), so both slices cover the same index range
# even if the data changes.
positive_features_verify = positive_features[split:]
negative_features_verify = negative_features[split:len(positive_features)]
nltk.classify.util.accuracy(sentiment_classifier, positive_features_verify+negative_features_verify)*100

## Classifying the reason behind negative tweets

In [None]:
# Negative tweets whose stated reason is exactly "Customer Service Issue".
# Equality is used instead of str.contains(): contains() treats the text as a
# regex/substring pattern and returns NaN for missing reasons, which breaks
# boolean indexing and the later `|` / `~` mask arithmetic.
is_costumer_service_issue = negative_tweets['negativereason'] == "Customer Service Issue"
costumer_service_issue = negative_tweets[is_costumer_service_issue]
costumer_service_issue.shape

In [None]:
# Split the text of every customer-service complaint into tokens.
tokenized_costumer_service_issue = [
    nltk.word_tokenize(tweet_text) for tweet_text in costumer_service_issue['text']
]

tokenized_costumer_service_issue

In [None]:

# Pair each customer-service complaint's bag of words with its class label.
costumer_service_issue_features = [
    (build_bag_of_words_features_filtered(tokens), 'service_issue')
    for tokens in tokenized_costumer_service_issue
]


In [None]:
# Negative tweets whose stated reason is exactly "Late Flight".
# Equality avoids the regex/substring matching and NaN propagation of
# str.contains(), so the mask is always a plain boolean Series.
is_late_flight = negative_tweets['negativereason'] == "Late Flight"
late_flight = negative_tweets[is_late_flight]
late_flight.shape

In [None]:
# Split the text of every late-flight complaint into tokens.
tokenized_late_flight = [
    nltk.word_tokenize(tweet_text) for tweet_text in late_flight['text']
]


In [None]:
# Pair each late-flight complaint's bag of words with its class label.
late_flight_features = [
    (build_bag_of_words_features_filtered(tokens), 'late_flight')
    for tokens in tokenized_late_flight
]

In [None]:
# Negative tweets whose stated reason is exactly "Can't Tell".
# Equality avoids the regex/substring matching and NaN propagation of
# str.contains(), so the mask is always a plain boolean Series.
is_cant_tell = negative_tweets['negativereason'] == "Can't Tell"
cant_tell = negative_tweets[is_cant_tell]
cant_tell.shape


In [None]:
# Everything that is neither a customer-service issue nor a late flight is
# grouped into a single "other" class.
test = is_costumer_service_issue | is_late_flight
others = negative_tweets.loc[~test]
others.shape


In [None]:
# Split the text of every remaining negative tweet into tokens.
tokenized_other = [
    nltk.word_tokenize(tweet_text) for tweet_text in others['text']
]


In [None]:
# Pair each remaining negative tweet's bag of words with the 'other' label.
other_features = [
    (build_bag_of_words_features_filtered(tokens), 'other')
    for tokens in tokenized_other
]

In [None]:
# Train the reason classifier on the first `split` examples requested from
# each of the three classes.
split = 1000

bad_cause_classifier = NaiveBayesClassifier.train(
    costumer_service_issue_features[:split]
    + late_flight_features[:split]
    + other_features[:split]
)

In [None]:
# Accuracy (in percent) over the first `split` examples of each reason class.
nltk.classify.util.accuracy(
    bad_cause_classifier,
    costumer_service_issue_features[:split]
    + late_flight_features[:split]
    + other_features[:split],
) * 100

In [None]:
# Verification examples per class: everything from index `split` up to (but
# not including) index 1400, so each class contributes the same index window.
costumer_service_verify = costumer_service_issue_features[split:1400]
late_flight_verify = late_flight_features[split:1400]
others_verify = other_features[split:1400]

In [None]:
# Accuracy (in percent) on the verification examples of the three classes.
nltk.classify.util.accuracy(
    bad_cause_classifier,
    costumer_service_verify + late_flight_verify + others_verify,
) * 100