In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# `sklearn.cross_validation` was removed in scikit-learn 0.20 and
# `sklearn.externals.joblib` in 0.23. Prefer the current locations and fall
# back to the legacy ones only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



# Load dataset

In [2]:
# Read the tweet export into a DataFrame, then preview the first five rows.
dataset = pd.read_csv(filepath_or_buffer='tweets.csv')
dataset.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0.0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0.0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0.0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0.0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0.0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
# List the column labels so we can decide which ones the model needs.
dataset.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [4]:
# Keep only the sentiment label and the tweet text; every other column is
# metadata that the rest of this notebook never reads.
#
# Selecting the columns to keep (instead of dropping the rest in place) makes
# this cell safe to re-run: the original in-place drop raised KeyError on a
# second execution because the columns were already gone, and it also listed
# "tweet_coord" twice. `.copy()` gives an independent frame so the later
# `.loc[...] = ...` label assignments do not write into a slice of the raw data.
dataset = dataset[["airline_sentiment", "text"]].copy()

In [5]:
# Confirm that only the label and text columns remain.
dataset.head(n=5)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


# Convert sentiment into numeric labels

In [6]:
# Inspect the distinct sentiment classes before encoding them.
pd.unique(dataset['airline_sentiment'])

array(['neutral', 'positive', 'negative'], dtype=object)

In [7]:
# Encode the three sentiment classes as integers:
# negative -> 0, neutral -> 1, positive -> 2.
sentiment_codes = (('negative', 0), ('neutral', 1), ('positive', 2))
for sentiment_name, sentiment_code in sentiment_codes:
    is_match = dataset["airline_sentiment"] == sentiment_name
    dataset.loc[is_match, "airline_sentiment"] = sentiment_code

In [8]:
# Check that the sentiment column now holds the integer codes.
dataset.head(n=5)

Unnamed: 0,airline_sentiment,text
0,1,@VirginAmerica What @dhepburn said.
1,2,@VirginAmerica plus you've added commercials t...
2,1,@VirginAmerica I didn't today... Must mean I n...
3,0,@VirginAmerica it's really aggressive to blast...
4,0,@VirginAmerica and it's a really big bad thing...


In [14]:
# Count missing values per column (summing the boolean mask down the rows).
dataset.isnull().sum(axis=0)

airline_sentiment    0
text                 1
dtype: int64

In [16]:
# Remove, in place, every row that has a missing value in any column.
dataset.dropna(axis="index", how="any", inplace=True)

# Split into features (X) and labels (y), then into training and test sets

In [17]:
# Features are the raw tweet text; the target is the encoded sentiment.
X, y = dataset["text"], dataset["airline_sentiment"]

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [19]:
# Peek at a few training tweets (the index shows the shuffled row labels).
x_train.head(n=5)

3522    @united @UnitedFlyerHD @United_Airline N26902 ...
131     @VirginAmerica @FiDiFamilies us too! Terrible ...
3600    @united I have flights that don't appear to ha...
65      @VirginAmerica Flight 0736 DAL to DCA 2/24 2:1...
175     @VirginAmerica I'd love to know what your poli...
Name: text, dtype: object

# TF-IDF

In [20]:
# TF-IDF over unigrams, bigrams and trigrams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    min_df=1,
)

In [21]:
# Fit the vectorizer on the training tweets only and vectorize them.
x_traincv = tfidf.fit_transform(raw_documents=x_train)

In [23]:
# Vectorize the test tweets with the already-fitted vectorizer (no refit).
x_testcv = tfidf.transform(raw_documents=x_test)

In [24]:
# Cast the training labels to integers before fitting.
# NOTE(review): presumably needed because the label column is still object
# dtype after the string-to-int replacement above — confirm.
y_train = y_train.astype(int)

In [25]:
# Cast the test labels to integers, mirroring the training labels.
y_test = y_test.astype(int)

# Model training

In [26]:
# Multinomial Naive Bayes with its default settings, spelled out explicitly.
classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [27]:
# Train the classifier on the TF-IDF features of the training tweets.
classifier.fit(X=x_traincv, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Checking accuracy

In [28]:
# Mean accuracy on the data the model was fitted on versus the held-out split.
train_accuracy = classifier.score(x_traincv, y_train)
test_accuracy = classifier.score(x_testcv, y_test)
print("Training accuracy {}".format(train_accuracy))
print("Testing accuracy {}".format(test_accuracy))

Training accuracy 0.722278395685878
Testing accuracy 0.6563342318059299


# Saving Model

In [29]:
# Persist the trained classifier to the file 'nb' in the working directory.
joblib.dump(value=classifier, filename='nb')

['nb']

In [30]:
# Persist the fitted vectorizer too, so new text can be transformed the same
# way as the training data before it is passed to the saved classifier.
joblib.dump(value=tfidf, filename='tfidf')

['tfidf']