# Twitter US Airline Sentiment Analysis

#### Problem Statement

##### Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets.
##### Our task is to predict the sentiment of the tweet. 
    i.e. - positive, 
         - negative 
         - neutral.

In [10]:
# necessary imports 

from pandas import read_csv
from pandas import Series
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from string import punctuation
from numpy import savetxt


### Data Gathereing and Data Preparation 

In [11]:
# loading the data
train_data = read_csv(r'C:\Users\vamsi katam\Desktop\CN\Datasets\twitter_US _airline_sentiment_analysis\training_twitter_x_y_train.csv')
test_data  = read_csv(r'C:\Users\vamsi katam\Desktop\CN\Datasets\twitter_US _airline_sentiment_analysis\test_twitter_x_test.csv')

In [12]:
# tain and test data shape
print("Train data :",train_data.shape)
print("Test data :",test_data.shape)

Train data : (10980, 12)
Test data : (3660, 11)


In [13]:
# lets have a look into the train_data

train_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


##### since our lables are whether the tweet is positive,negative or neutral.
##### The most needed features in the data are 
  - airline_sentiment
  - text

#### let prepare our train and test data 

In [14]:
train_X = train_data['text']
train_y = train_data['airline_sentiment']

test_X = test_data['text']

In [15]:
# have a look on the data
train_X[1]

'@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!'

##### cleaning the data

In [16]:
# preparing the stop words

stop_words = set(stopwords.words('english'))
# taking the punctuation from the string module
punctuations = list(punctuation)
# adding the punctuation in the end of stop_words
stop_words.update(punctuations)

stop_words

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [17]:
# function to return simple version of the pos_tag which can be used in lemmatization
def get_simple_pos_tag(tag):
    "take the pos_tag which and will return the simplified version of Wordnet format"
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# creating object for WordLemmatizer
lemma = WordNetLemmatizer()

def clean_tweet(tweet):
    # tokenizing the sentence to words
    words = word_tokenize(tweet)
    # output cleaned words
    cleaned_words = []
    
    # iterating over each word
    for word in words:
        # handling stop words
        if word.lower() not in stop_words:
            
            # getting the pos_tag of the word, passing in a list/array to get pos_tag for word rather character
            word_pos = pos_tag([word])
            
            #lemmetizing the word 
            lemmetized_word = lemma.lemmatize(word,get_simple_pos_tag(word_pos[0][1]))
            
            # appending the word to cleaned words
            cleaned_words.append(lemmetized_word.lower())
    
    # joining all the words to create a sentence
    cleaned_tweet = ' '.join(cleaned_words)
    
    # returning the cleaned_tweet
    return cleaned_tweet

In [18]:
# testing the cleaned_tweet function
li = 'Hi there! How are you buddy ?'
clean_tweet(li)

'hi buddy'

##### seems good! , lets clean our text feature

In [20]:
cleaned_X = [clean_tweet(tweet) for tweet in train_X]
cleaned_test_X = [clean_tweet(tweet) for tweet in test_X]

In [93]:
count_vec = CountVectorizer(max_features= 1000)
X_trained_features = count_vec.fit_transform(cleaned_X)
X_test_features = count_vec.transform(cleaned_test_X)

In [94]:
# lets see the top feature names
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '11',
 '12',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '20',
 '200',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '25',
 '2hrs',
 '2nd',
 '30',
 '35',
 '3rd',
 '40',
 '45',
 '50',
 '60',
 '728',
 '75',
 '800',
 '90',
 'aa',
 'able',
 'absolute',
 'absolutely',
 'accept',
 'acceptable',
 'access',
 'accommodate',
 'account',
 'actual',
 'actually',
 'add',
 'address',
 'admiral',
 'advise',
 'advisory',
 'afternoon',
 'agent',
 'ago',
 'air',
 'aircraft',
 'airline',
 'airplane',
 'airport',
 'airway',
 'all',
 'allow',
 'almost',
 'alone',
 'already',
 'also',
 'always',
 'amaze',
 'america',
 'american',
 'americanair',
 'americanairlines',
 'amp',
 'angry',
 'announce',
 'announcement',
 'another',
 'answer',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apologize',
 'apology',
 'app',
 'apparently',
 'appear',
 'apply',
 'appreciate',
 'area',
 'around',
 'arrival',
 'arrive',
 'asap',
 'ask',
 'assign',
 'assist',
 'assistance',
 'a

### Model Creation 

In [96]:
# Lets try the SVC

svc = SVC()
svc.fit(X_trained_features,train_y)

SVC()

In [98]:
svc_pred = svc.predict(X_test_features)

# score on train data, althout train score won't help much for us
svc.score(X_trained_features,train_y)

In [110]:
Series(svc_pred).to_csv('svc_pred.csv',index=False,header=False)

##### we are getting accuracy score of 0.774 with SVM on base features

In [None]:
# Lets tune the SVM model
