# Sentiment Analysis on Twitter US Airline Sentiment Dataset

### Load Data

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import pandas as pd

In [3]:
train = pd.read_csv('twitter_x_y_train.csv')
test = pd.read_csv('twitter_x_test.csv')
print("Tweets in training data:{}".format(train.shape[0]))
print("Tweets in testing data:{}".format(test.shape[0]))
print("Total Tweets --> ",train.shape[0]+test.shape[0])

Tweets in training data:10980
Tweets in testing data:3660
Total Tweets -->  14640


In [4]:
#print(train.info())

In [5]:
#train.head(5)

In [6]:
print(train.airline_sentiment.value_counts())

negative    6851
neutral     2327
positive    1802
Name: airline_sentiment, dtype: int64


In [7]:
train.columns

Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

# Clean data

In [8]:
drop_cols = ['tweet_id', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count','tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone']
train.drop(drop_cols, axis = 1, inplace=True)
test.drop(drop_cols, axis = 1, inplace=True)

In [9]:
train

Unnamed: 0,airline_sentiment,text
0,negative,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,@SouthwestAir seeing your workers time in and ...
2,positive,@united Flew ORD to Miami and back and had gr...
3,negative,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,@united so our flight into ORD was delayed bec...
...,...,...
10975,neutral,@AmericanAir followback
10976,positive,@united thanks for the help. Wish the phone re...
10977,negative,@usairways the. Worst. Ever. #dca #customerser...
10978,negative,@nrhodes85: look! Another apology. DO NOT FLY ...


In [10]:
train_data = (train["text"])
target_data = set(train['airline_sentiment'])
test_data = (test["text"])

In [11]:
tweets_train = []
tweets_test = []
for sentiment in target_data:
    
    #---->Training data (text,sentiment)
    sentiment_rows_train = (train['airline_sentiment'] == sentiment)
    current_train=train[sentiment_rows_train]
    current_train.reset_index(drop=True,inplace=True)
    for tweet in (list(current_train['text'])):
        tweets_train.append(((word_tokenize(tweet)), sentiment))    
    
#---->Testing data (text)
for j in test_data:
    tweets_test.append((word_tokenize(j)))

In [12]:
print(len(tweets_train))
print(len(tweets_test))

10980
3660


In [13]:
#tweets_train[:5]
#tweets_test[:5]

In [14]:
import random
random.shuffle(tweets_train)
#tweets_train[0:5]

### WordNetLemmatizer and Pos Tag

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)

stops.update(punctuations)
#stops

In [18]:
def get_clean_words(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [19]:
tweets_train = [(get_clean_words(tweet),sentiment) for tweet,sentiment in tweets_train]
len(tweets_train)

10980

In [20]:
tweets_test = [get_clean_words(tweet) for tweet in tweets_test]
len(tweets_test)

3660

# Convert data in X(2d array) and Y(Target) format

In [21]:
text_tweet = [" ".join(tweet) for tweet, category in tweets_train]

In [22]:
sentiments = [sentiment for document, sentiment in tweets_train]

In [23]:
text_tweet_test = [" ".join(tweet) for tweet in tweets_test]

# Train_Test Split

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# default 75:25
x_train, x_test, y_train, y_test = train_test_split(text_tweet, sentiments)

# TfidfVectorizer

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
Tfid_Vectorizer=TfidfVectorizer(ngram_range=(1,2),max_features=2000)
x_train_features = Tfid_Vectorizer.fit_transform(x_train)
x_train_features.todense()   

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
#count_vec.get_feature_names()

In [29]:
x_test_features = Tfid_Vectorizer.transform(x_test)

In [30]:
x_test_features

<2745x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 26175 stored elements in Compressed Sparse Row format>

# sklearn SVC Classfiers

In [31]:
from sklearn.svm import SVC

In [32]:
svc = SVC(kernel='rbf',C=100,gamma=0.0001)
svc.fit(x_train_features, y_train)

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [33]:
print("Training accuracy:",svc.score(x_train_features, y_train))

Training accuracy: 0.6329083181542198


In [34]:
print("Training accuracy:",svc.score(x_test_features, y_test))

Training accuracy: 0.6225865209471767


# sklearn Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
clf = RandomForestClassifier(n_estimators=10,min_samples_split=5)

In [37]:
clf.fit(x_train_features,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
print("Training accuracy:",clf.score(x_train_features, y_train))

Training accuracy: 0.9634486945962356


In [39]:
print("Testing accuracy:",clf.score(x_test_features, y_test))

Testing accuracy: 0.7449908925318761


# Predict twitter_x_test (text_tweet_test)

In [40]:
test_tweet = Tfid_Vectorizer.transform(text_tweet_test)
test_pred = clf.predict(test_tweet)

In [41]:
df = pd.DataFrame(test_pred)
#df.to_csv('twitter_randomforest_pred.csv',header=False,index=False)