# <u> Sentiment Analysis On Twitter Posts <u>

### Preparing The Data

In [17]:
import pandas as pd

In [18]:
data_df = pd.read_csv("sentiment.csv", quotechar='"', encoding= "ISO-8859-1")
data_df.shape

(1600000, 5)

In [19]:
data_df=data_df.drop('Topic',1)
data_df=data_df.drop('TweetId',1)
data_df=data_df.drop('TweetDate',1)
data_df.shape

(1600000, 2)

In [20]:
data_df.columns = ["Sentiment","TweetText"]
positive_sentiment = data_df[data_df['Sentiment']=='positive']
negative_sentiment = data_df[data_df['Sentiment']=='negative']

In [21]:
val = 100000
frames = [positive_sentiment[:val], negative_sentiment[:val]]
data_df = pd.concat(frames)
data_df.shape

(200000, 2)

### <u>Exploring the Data<u>

In [22]:
data_df.head()

Unnamed: 0,Sentiment,TweetText
800000,positive,I LOVE @Health4UandPets u guys r the best!!
800001,positive,im meeting up with one of my besties tonight! ...
800002,positive,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,positive,Being sick can be really cheap when it hurts t...
800004,positive,@LovesBrooklyn2 he has that effect on everyone


In [23]:
data_df.Sentiment.value_counts()

positive    100000
negative    100000
Name: Sentiment, dtype: int64

In [24]:
import numpy as np
print("Average # words per post: ",np.mean([len(s.split(" ")) for s in data_df.TweetText]))

Average # words per post:  14.379525


In [25]:
test_set_length = int(0.3*(len(data_df)))
training_set_length = int((len(data_df)) - test_set_length)
print(test_set_length)
print(training_set_length)
print(training_set_length +test_set_length)

60000
140000
200000


In [26]:
training_set = data_df[0:training_set_length]
test_set = data_df[training_set_length:]
print("training_set shape: ",training_set.shape)
print("test_set shape: ",test_set.shape)

training_set shape:  (140000, 2)
test_set shape:  (60000, 2)


In [27]:
del data_df

### <u> Tokenizing and Stemming <u>

#### Stemming is the process for reducing derived words to their base or root form 
##### Ex: 
 - "fishing", "fished", and "fisher" == fish
 - "argue", "argued", "argues", "arguing" == argu
    

In [28]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [29]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [30]:
import re as regex , nltk
def tokenize(text):
    # remove non letters
    text = regex.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

### <u> Feature Extraction <u>

|       |Love |Guns | Taylor swift| Sick |Lonely | Excited | bed |Thanks |
|-------|:---:|:---:|:-----------:|:----:|:-----:|:-------:|:---:|:-----:|     
|tweet 1| 3   | 0   | 1           |0     |0      | 1       |1    |2      |
|tweet 2| 0   | 1   |0            |2     | 1     |1        |0    |0      |
|tweet 3| 4   | 0   |0            | 1    |  0    | 0       | 0   | 0     |
|tweet 4| 0   | 1   |0            |0     |2      |1        |2    |0      |
|tweet 5|0    |0    |2            |1     |1      |2        |0    |0      |
|tweet 6|1    |0    |0            |0     |0      |0        |0    |0      |

In [31]:
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize, # tokenize is a user defined function declared above
    lowercase = True,
    stop_words = 'english',
    #max_features = 200
)

In [32]:
features_matrix = vectorizer.fit_transform(training_set.TweetText.tolist() + test_set.TweetText.tolist())
features_matrix.shape


(200000, 118734)

#### When we run Feature Extraction on 1.6 million tweets we get  <u> 533,386 features <u> (distinct words)

## <u> Training and Testing <u>

In [33]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(
        features_matrix, 
        training_set.Sentiment.tolist() + test_set.Sentiment.tolist(),
        test_size=0.30, 
        random_state=3)

In [34]:
del training_set
del test_set

### <u> Logistic Regression <u>

In [35]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)

In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
del log_model
del y_pred

             precision    recall  f1-score   support

   negative       0.77      0.75      0.76     29865
   positive       0.76      0.78      0.77     30135

avg / total       0.76      0.76      0.76     60000



### <u>Decision Tree<u>

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

In [29]:
print(classification_report(y_test, clf_predictions))
del clf
del clf_predictions

             precision    recall  f1-score   support

   negative       0.69      0.72      0.70       29865
   positive       0.71      0.68      0.70       30135

avg / total       0.70      0.70      0.70       60000



### <u> Random Forest <u>

In [30]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=40)
clf = clf.fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

In [31]:
print(classification_report(y_test, clf_predictions))
del clf
del clf_predictions

             precision    recall  f1-score   support

   negative       0.73      0.78      0.75       29865
   positive       0.77      0.72      0.74       30135

avg / total       0.75      0.75      0.75       60000



### <u> Bernoulli Naive bayes<u>

In [32]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

In [33]:
print(classification_report(y_test, clf_predictions))
del clf
del clf_predictions

             precision    recall  f1-score   support

   negative       0.74      0.80      0.77       29865
   positive       0.78      0.72      0.75       30135

avg / total       0.76      0.76      0.76       60000



### <u> ADA Boost <u>

In [34]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

In [35]:
print(classification_report(y_test, clf_predictions))
del clf
del clf_predictions

             precision    recall  f1-score   support

   negative       0.77      0.50      0.61       29865
   positive       0.63      0.85      0.73       30135

avg / total       0.70      0.68      0.67       60000



### Results obtained while running the program on 1,600,000

|Classifier | F1score |
|-----------|:-------:|
|Logistic regression| 78% |
|Bernoulli NB | 76% |

## <u> Future Work <u>
 

#### 1.  Experiment with feature extraction
        - try bigrams, trigrams etc.
        - tf (term frequency), and tf-idf (Term Frequency times Inverse Document Frequency)
#### 2.  Try feature Selection (consider only Adjectives, ignore prepositions and other junk words)
#### 3.  A more Thorough Tuning  of the Classifier parameters
#### 4.  Try other classifiers