# Using Classification Methods on Twitter Airline Sentiments`

This assignment is about using ML Classification Methods to predict whether a Airline had positive, negative, or neutral sentiments when a person tweeted at an airline. 

In [1]:
import pandas as pd

df = pd.read_csv('Tweets.csv', encoding='utf-8')
df.head(5)

Unnamed: 0,text,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,tweet_coord,tweet_created,tweet_location,user_timezone
0,@VirginAmerica What @dhepburn said.,neutral,1.0,,,Virgin America,,cairdin,,0,,2/24/2015 11:35,,Eastern Time (US & Canada)
1,@VirginAmerica plus you've added commercials t...,positive,0.3486,,0.0,Virgin America,,jnardino,,0,,2/24/2015 11:15,,Pacific Time (US & Canada)
2,@VirginAmerica I didn't today... Must mean I n...,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,,2/24/2015 11:15,Lets Play,Central Time (US & Canada)
3,@VirginAmerica it's really aggressive to blast...,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,,2/24/2015 11:15,,Pacific Time (US & Canada)
4,@VirginAmerica and it's a really big bad thing...,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,,2/24/2015 11:14,,Pacific Time (US & Canada)


In [2]:
df.shape

(14640, 14)

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [4]:
np.set_printoptions(precision=2)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, 
                         norm='l2', 
                         smooth_idf=True)

In [6]:
df.loc[0, 'text']

'@VirginAmerica What @dhepburn said.'

In [7]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [8]:
preprocessor(df.loc[0, 'text'])

' virginamerica what dhepburn said '

In [9]:
df['text'] = df['text'].apply(preprocessor)

In [10]:
#import Natural Lanuage Toolkit 
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [11]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
X_train = df.iloc[:2000, 0].values
y_train = df.iloc[:2000, 1].values
X_test = df.iloc[2000:4000, 0].values
y_test = df.iloc[2000:4000, 1].values

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [50]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__stop_words': [stop, None]},
              ]

#lr_tfidf = Pipeline([('vect', tfidf),
#                    ('clf', LogisticRegression(random_state=0))])

lr_tfidf = Pipeline([('vect', tfidf),
                    ('sgd', SGDClassifier(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1)

In [65]:
gs_lr_tfidf.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'float' and 'str'

In [61]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [49]:
clf = gs_lr_tfidf.best_estimator_

In [50]:
clf.predict(X_test)

array(['neutral', 'negative', 'positive', ..., 'neutral', 'negative',
       'negative'], dtype='<U8')

In [51]:
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.820


In [52]:
example = ['I hate this flight!']
clf.predict(example)

array(['negative'], dtype='<U8')

In [53]:
example = ['I love this movie!']
clf.predict(example)

array(['positive'], dtype='<U8')