# Using Classification Methods on Twitter Airline Sentiments`

This assignment is about using ML Classification Methods to predict whether a Airline had positive, negative, or neutral sentiments when a person tweeted at an airline. 

In [29]:
import pandas as pd

df = pd.read_csv('Tweets.csv', encoding='utf-8')
df.head(5)

Unnamed: 0,text,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,tweet_coord,tweet_created,tweet_location,user_timezone
0,@VirginAmerica What @dhepburn said.,neutral,1.0,,,Virgin America,,cairdin,,0,,2/24/2015 11:35,,Eastern Time (US & Canada)
1,@VirginAmerica plus you've added commercials t...,positive,0.3486,,0.0,Virgin America,,jnardino,,0,,2/24/2015 11:15,,Pacific Time (US & Canada)
2,@VirginAmerica I didn't today... Must mean I n...,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,,2/24/2015 11:15,Lets Play,Central Time (US & Canada)
3,@VirginAmerica it's really aggressive to blast...,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,,2/24/2015 11:15,,Pacific Time (US & Canada)
4,@VirginAmerica and it's a really big bad thing...,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,,2/24/2015 11:14,,Pacific Time (US & Canada)


In [30]:
df.shape

(14640, 14)

In [78]:
#grouping the airline's sentiment
t_gb = df.groupby(["airline", "airline_sentiment" ]).count()["name"]
t_gb

airline         airline_sentiment
American        negative             1960
                neutral               463
                positive              336
Delta           negative              955
                neutral               723
                positive              544
Southwest       negative             1186
                neutral               664
                positive              570
US Airways      negative             2263
                neutral               381
                positive              269
United          negative             2633
                neutral               697
                positive              492
Virgin America  negative              181
                neutral               171
                positive              152
Name: name, dtype: int64

In [79]:
#resseting indexing and formatting to column view
df_airlineSentiment = t_gb.to_frame().reset_index()
df_airlineSentiment.columns = ["airline", "airline_sentiment", "count"]
dff = df_airlineSentiment
dff

Unnamed: 0,airline,airline_sentiment,count
0,American,negative,1960
1,American,neutral,463
2,American,positive,336
3,Delta,negative,955
4,Delta,neutral,723
5,Delta,positive,544
6,Southwest,negative,1186
7,Southwest,neutral,664
8,Southwest,positive,570
9,US Airways,negative,2263


In [33]:
print("Total Positive Sentiment for United Airline is", t_gb['United','positive'])

Total Positive Sentiment for United Airline is 492


In [20]:
print("Total Negative Sentiment for Delta Airline is", t_gb['Delta','negative'])

Total Negative Sentiment for Delta Airline is 955


In [34]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [35]:
np.set_printoptions(precision=2)

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, 
                         norm='l2', 
                         smooth_idf=True)

In [37]:
df.loc[0, 'text']

'@VirginAmerica What @dhepburn said.'

In [38]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [39]:
preprocessor(df.loc[0, 'text'])

' virginamerica what dhepburn said '

In [40]:
df['text'] = df['text'].apply(preprocessor)

In [41]:
#import Natural Lanuage Toolkit 
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [42]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
X_train = df.iloc[:4000, 0].values
y_train = df.iloc[:4000, 1].values
X_test = df.iloc[4000:8000, 0].values
y_test = df.iloc[4000:8000, 1].values

In [45]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier 

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__stop_words': [stop, None]},
              ]

#lr_tfidf = Pipeline([('vect', tfidf),
#                    ('clf', LogisticRegression(random_state=0))])

lr_tfidf = Pipeline([('vect', tfidf),
                    ('sgd', SGDClassifier(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1)

In [67]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   27.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...lty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x00000198B986A620>, <function tokenizer_porter at 0x00000198B986A8C8>], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yo...', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"], None]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [68]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x00000198B986A620>} 
CV Accuracy: 0.739


In [69]:
clf = gs_lr_tfidf.best_estimator_

In [70]:
clf.predict(X_test)

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [72]:
example = ['I hate this flight!']
clf.predict(example)

array(['negative'], dtype='<U8')

In [73]:
example = ['I love this flight!']
clf.predict(example)

array(['positive'], dtype='<U8')

In [75]:
import pickle
mypickle_path = 'flightPickle.pkl'
flightpickle = open(mypickle_path, 'wb')
pickle.dump(clf, flightpickle)
flightpickle.close()

In [76]:
mypickle_path = 'flightPickle.pkl'
model_unpickle = open(mypickle_path, 'rb')
clf_new = pickle.load(model_unpickle)

In [77]:
example = ['I love this flight!']
clf_new.predict(example)

array(['positive'], dtype='<U8')