In [1]:
#Importing required libraries
import pandas as pd
import nltk

In [2]:
#Reading the dataset
text_sentiment_data = pd.read_csv('tweets.csv')

In [3]:
text_sentiment_data.head(10)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
8,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


In [4]:
text_sentiment_data.shape

(14640, 15)

In [5]:
#Filtering out entries with low confidence
text_sentiment_df = text_sentiment_data.drop(text_sentiment_data[text_sentiment_data['airline_sentiment_confidence']<0.5].index, axis= 0)

In [6]:
text_sentiment_df.shape

(14404, 15)

In [7]:
X = text_sentiment_df['text']
Y = text_sentiment_df['airline_sentiment']

In [8]:
# Cleaning our text data:

from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
#Initializing stopwords and punctuation so we can remove them from our text data
words_stop = stopwords.words('english')
punctuations = string.punctuation

In [10]:
#With the help of regular expressions we will consider only the text part and remove all the numbers, special characters from the text data
import re
nltk.download('wordnet')

#First take text data and convert it to lower case and then lemmatize it (reduce it to its lower form, for exapmle: walked -> walk)
cleaned_text_data = []
for i in range(len(X)):
  text = re.sub('[^a-zA-Z]', ' ',X.iloc[i])
  text = text.lower().split()
  text = [lemmatizer.lemmatize(word) for word in text if (word not in words_stop) and (word not in punctuations)]
  text = ' '.join(text)
  cleaned_text_data.append(text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
cleaned_text_data

['virginamerica dhepburn said',
 'virginamerica today must mean need take another trip',
 'virginamerica really aggressive blast obnoxious entertainment guest face amp little recourse',
 'virginamerica really big bad thing',
 'virginamerica seriously would pay flight seat playing really bad thing flying va',
 'virginamerica yes nearly every time fly vx ear worm go away',
 'virginamerica really missed prime opportunity men without hat parody http co mwpg grezp',
 'virginamerica well',
 'virginamerica amazing arrived hour early good',
 'virginamerica know suicide second leading cause death among teen',
 'virginamerica lt pretty graphic much better minimal iconography',
 'virginamerica great deal already thinking nd trip australia amp even gone st trip yet p',
 'virginamerica virginmedia flying fabulous seductive sky u take stress away travel http co ahlxhhkiyn',
 'virginamerica thanks',
 'virginamerica sfo pdx schedule still mia',
 'virginamerica excited first cross country flight lax mc

In [12]:
Y

0         neutral
2         neutral
3        negative
4        negative
5        negative
           ...   
14634    negative
14636    negative
14637     neutral
14638    negative
14639     neutral
Name: airline_sentiment, Length: 14404, dtype: object

In [13]:
# Sentiment labels are converted into numerical representations

sentiments = ['negative' , 'neutral', 'positive']
Y = Y.apply(lambda x: sentiments.index(x))

In [14]:
Y.head()

0    1
2    1
3    0
4    0
5    0
Name: airline_sentiment, dtype: int64

In [15]:
#Vectorizing the text data
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_features = 5000, stop_words = ['virginamerica','united'])
XFit = count_vectorizer.fit_transform(cleaned_text_data).toarray()

In [16]:
XFit.shape

(14404, 5000)

In [17]:
#Creating the model
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
sent_model = MultinomialNB()

In [18]:
#Splitting of training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(XFit,Y, test_size = 0.3)

In [19]:
#Fit the model
sent_model.fit(X_train,Y_train)

In [20]:
#Evaluating the model
y_pred = sent_model.predict(X_test)

In [21]:
#Plotting the confusion matrix
from sklearn.metrics import classification_report

classification = classification_report(Y_test,y_pred)
print(classification)

              precision    recall  f1-score   support

           0       0.82      0.90      0.85      2709
           1       0.61      0.48      0.54       904
           2       0.73      0.65      0.69       709

    accuracy                           0.77      4322
   macro avg       0.72      0.68      0.69      4322
weighted avg       0.76      0.77      0.76      4322



In [22]:
#Saving the model for deployment
import pickle

with open("sent_model.pkl", "wb") as f:
    pickle.dump(sent_model, f)