In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [2]:
tweets=pd.read_csv('Tweets.csv')

In [3]:
tweets.tail()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
14635,569587686496825344,positive,0.3487,,0.0,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0,Customer Service Issue,1.0,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)
14639,569587140490866689,neutral,0.6771,,0.0,American,,daviddtwu,,0,@AmericanAir we have 8 ppl so we need 2 know h...,,2015-02-22 11:58:51 -0800,"dallas, TX",


In [4]:
text_data=tweets.text

In [5]:
text_label=tweets.airline_sentiment

In [6]:
text_data[3224]

'@united Doumented via link. However, now that it has been over four months with no response, what do you suggest? Or shall I not expect one?'

In [7]:
count_vect = CountVectorizer(stop_words='english',ngram_range=(1,2),
                            token_pattern='(?u)\\b\\w\\w+\\b\\#*') # inclues #* pattern for Twitter hashtag

In [8]:
x_train_counts=count_vect.fit_transform(text_data)
x_train_counts.shape

(14640, 100128)

In [9]:
type(x_train_counts)

scipy.sparse.csr.csr_matrix

In [10]:
#take a look at setting of count vectorizer
count_vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b\\#*',
 'tokenizer': None,
 'vocabulary': None}

## TF-IDF to adjust weights of words based on frequency of the given word

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_counts2=tfidf_transformer.fit_transform(x_train_counts)
x_train_counts2.shape

(14640, 100128)

count_vect.vocabulary_

In [12]:
count_vect.vocabulary_.get('map cool')

57089

# Set up model 

In [13]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier


In [51]:
#uncomment the model to try different training methods.
def test_vectorizer(X):
    #model = MultinomialNB() #Multinomial better than Bernoulli
    model = SGDClassifier(loss='log', penalty='l2',alpha=1e-4, n_iter=10, random_state=42,shuffle=True)
   

    y = text_label

    from sklearn.cross_validation import StratifiedKFold
    skf = StratifiedKFold(y, n_folds=5)


    fold=0
    for train_index, test_index in skf:
        fold+=1
        print (fold)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train,y_train)
        ypred=model.predict(X_test)
    return np.mean(y_test==ypred)

In [52]:
test_vectorizer(x_train_counts) #bi-gram with hashtag terms, no TF-IDF

1
2
3
4
5


0.73069036226930961

In [53]:
test_vectorizer(x_train_counts2) #bi-gram with hashtag terms, w/ TF-IDF

1
2
3
4
5


0.67942583732057416

# Add additional features to training data (airline, day of week)

In [54]:
airline=pd.get_dummies(tweets.airline)

In [55]:
airline.head()

Unnamed: 0,American,Delta,Southwest,US Airways,United,Virgin America
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0


In [56]:
#To convert a Series or list-like object of date-like objects e.g. strings, epochs,
#or a mixture, you can use the to_datetime function.
timestamp=pd.to_datetime(tweets.tweet_created)

In [57]:
time=pd.DataFrame({'day': timestamp.dt.dayofweek})
                    
                   

In [58]:
time.head()

Unnamed: 0,day
0,1
1,1
2,1
3,1
4,1


In [59]:
airline_time=pd.concat([airline,time], axis=1)

In [60]:
airline_time.head()

Unnamed: 0,American,Delta,Southwest,US Airways,United,Virgin America,day
0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.0,0.0,0.0,1.0,1
2,0.0,0.0,0.0,0.0,0.0,1.0,1
3,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0.0,0.0,0.0,0.0,0.0,1.0,1


In [61]:
from scipy.sparse import hstack

In [62]:
X_data1=hstack((airline, x_train_counts)) #no TF-IDF
X_data2=hstack((airline, x_train_counts2)) #with TF-IDF

In [63]:
type(X_data2) #this is in coo matrix, convert to csr

scipy.sparse.coo.coo_matrix

Advantages of the CSC format

        efficient arithmetic operations CSC + CSC, CSC * CSC, etc.
        efficient column slicing
        fast matrix vector products (CSR, BSR may be faster)

Disadvantages of the CSC format

        slow row slicing operations (consider CSR)
        changes to the sparsity structure are expensive (consider LIL or DOK)



Advantages of the COO format

        facilitates fast conversion among sparse formats
        permits duplicate entries (see example)
        very fast conversion to and from CSR/CSC formats

Disadvantages of the COO format

        does not directly support:
                arithmetic operations
                slicing

Intended Usage

        COO is a fast format for constructing sparse matrices
        Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector operations
        By default when converting to CSR or CSC format, duplicate (i,j) entries will be summed together. This facilitates efficient construction of finite element matrices and the like. (see example)



In [64]:
from scipy.sparse import csr_matrix
x_train_data1 =csr_matrix(X_data1) #no TF_IDF
x_train_data2 =csr_matrix(X_data2) #with TF-IDF

In [65]:
test_vectorizer(x_train_data1) #includes day of week, SGDclassifier, no TF-IDF

1
2
3
4
5


0.69548872180451127

In [66]:
test_vectorizer(x_train_data2) #similar to above, but with TF-IDF

1
2
3
4
5


0.64661654135338342