# Project NLP: Twitter US Airline Sentiment

## Problem Description
A sentiment analysis job about the problems of each major U.S. airline. Twitter data was scraped from February of 2015 and contributors were asked to first classify positive, negative, and neutral tweets, followed by categorizing negative reasons (such as "late flight" or "rude service").

## Importa Packages

In [1]:
import re, string, unicodedata                        
from bs4 import BeautifulSoup                         

import numpy as np                                    
import pandas as pd                                   
import nltk                                           

nltk.download('stopwords')                            
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords                     
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer       

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Import data

In [2]:
data = pd.read_csv("Tweets.csv")

In [3]:
data.shape

(14640, 15)

In [4]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
data = data.loc[:, ['airline_sentiment', 'text']]

In [6]:
pd.set_option('display.max_colwidth', None)
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!
3,negative,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
4,negative,@VirginAmerica and it's a really big bad thing about it


In [7]:
data.groupby("airline_sentiment").agg({'airline_sentiment': 'count'})

Unnamed: 0_level_0,airline_sentiment
airline_sentiment,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


The data is skewed. There are much more negative sentiments compared to positive or neutral sentiments

In [8]:
data.shape      

(14640, 2)

# Preprocessing of Text

In [9]:
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []                        # Create empty list to store pre-processed words.
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)        # Append processed words to new list.
    return new_words


def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []                           # Create empty list to store pre-processed words.
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)              # Append processed words to new list.
    return lemmas

def normalize(words):
    words = remove_stopwords(words)
     words = lemmatize_verbs(words)
    return words

In [10]:
# Iterate the normalize funtion over whole data.
for i, row in data.iterrows():
    words = data.at[i, 'text']
    words = normalize(words)
    data.at[i,'text'] = words

In [11]:
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,"[v, r, g, n, e, r, c, , w, h, , h, e, p, b, u, r, n, ]"
1,positive,"[v, r, g, n, e, r, c, , p, l, u, , u, v, e, , e, , c, e, r, c, l, , , h, e, , e, x, p, e, r, e, n, c, e, , c, k]"
2,neutral,"[v, r, g, n, e, r, c, , , n, , , u, , e, n, , , n, e, e, , , k, e, , n, h, e, r, , r, p]"
3,negative,"[v, r, g, n, e, r, c, , , r, e, l, l, , g, g, r, e, v, e, , , b, l, , b, n, x, u, , e, n, e, r, n, e, n, , n, , u, r, , g, u, e, , f, c, e, , p, , h, e, , h, v, e, , l, l, e, , r, e, c, u, r, e]"
4,negative,"[v, r, g, n, e, r, c, , n, , , , r, e, l, l, , b, g, , b, , h, n, g, , b, u, ]"


### Remove HTML

In [12]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

data['text'] = data['text'].apply(lambda x: strip_html(x))
data.head()

TypeError: expected string or bytes-like object

### Remove Numbers and Special Characters

In [None]:
import re
data['text'] = data['text'].apply(lambda x:  re.sub("[^a-zA-Z]"," ",x ))

In [None]:
data.head()

### Convert to lower case

In [None]:
data['text'] = data['text'].apply(lambda x:x.lower())

In [None]:
data.head()

In [None]:
data['text'] = data.apply(lambda row: nltk.word_tokenize(row['text']), axis=1) 

In [None]:
data.head()

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_list(words):
    new_words = []
    for word in words:
      new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return new_words

In [None]:
def lemmatize(words):
    #words = remove_stopwords(words)
    words = lemmatize_list(words)
    return ' '.join(words)

In [None]:
data['text'] = data.apply(lambda row: lemmatize(row['text']), axis=1)
data.head()

In [None]:
data['text'] = data.apply(lambda row: nltk.word_tokenize(row['text']), axis=1) 
data.head()

In [None]:
stopwords = stopwords.words('english')
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords:
            new_words.append(word)
    return new_words

In [None]:
def stopwords(words):
    #words = remove_stopwords(words)
    words = remove_stopwords(words)
    return ' '.join(words)

In [None]:
data['text'] = data.apply(lambda row: stopwords(row['text']), axis=1)
data.head()