# Sentiment Analysis With Gradient Boost

## Imports

In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.sentiment.util import mark_negation
from nltk.tokenize.casual import TweetTokenizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/leander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Data

In [2]:
df=pd.read_csv('./assets/datasets/sentiment.csv',encoding='latin-1')
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "TweetText"]
df.columns = DATASET_COLUMNS
df.head()

Unnamed: 0,target,ids,date,flag,user,TweetText
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


## Replace urls,usernames ..

In [6]:
import re

vocab=[]
punctuations=['.','*','!']
for x in df["TweetText"]:
    x=re.sub("http://.* ",'URL ',x)
    x=re.sub("#.* ",'HASHTAG ',x)
    x=re.sub("@.* ",'USER ',x)
    for z in punctuations:
        x=x.replace(z,' ')
    vocab.extend(x.split(' '))

## To be used to classify into 3 classes

In [9]:
pos=[]
neg=[]
neutral=[]
for x in df["target"]:
    if x == 0:
        neg.append(1)
        pos.append(0)
        neutral.append(0)
    if x == 2:
        neg.append(0)
        pos.append(0)
        neutral.append(1)
    if x == 4:
        neg.append(0)
        pos.append(1)
        neutral.append(0)

## Negation Marking and Stopword removal

In [11]:
stopwords=set(nltk.corpus.stopwords.words('english'))
tokenizer=TweetTokenizer(reduce_len=True,strip_handles=True,preserve_case=False)
X=[]
for i in df["TweetText"]:
    X.append(tokenizer.tokenize(i))
for i in range(len(X)):
    X[i]=mark_negation(X[i])

In [12]:
neg_stop=[]
for i in stopwords:
    neg_stop.append(i+"_NEG")
stopwords=set(nltk.corpus.stopwords.words('english'))
stop=list(stopwords)
stop.extend(neg_stop)

In [13]:
for i in range(len(X)):
    X[i]=[t for t in X[i] if t not in stop]

In [14]:
corpus=[]
for i in X:
    s=""
    for j in i:
       s+=(j+" ")
    corpus.append(s)

## Count Vectorizing for bag of Words Features

In [58]:
cv = CountVectorizer(max_features=7000)
count_vector=cv.fit_transform(corpus)

## Using the model

In [59]:
X_train, X_test, y_train, y_test = train_test_split(count_vector, df["target"], test_size=0.5, random_state=42)

In [60]:
xgb_clf = XGBClassifier(silent=False,
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.8,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=2000, 
                      reg_alpha = 0.1,
                      max_depth=4, 
                      gamma=5,
                       )

In [61]:
xgb_clf.fit(X_train, y_train, verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=5,
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=2000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=False, subsample=0.8, verbosity=1)

In [None]:
score = xgb_clf.score(X_train, y_train)
print(score)
score = xgb_clf.score(X_test, y_test)
print(score)

0.7190283987854985


## Target Dataset

In [8]:
df2=pd.read_csv('./assets/datasets/tweet_data.csv')

for x in df2["text"]:
    x=re.sub("http://.* ",'URL ',x)
    x=re.sub("#.* ",'HASHTAG ',x)
    x=re.sub("@.* ",'USER ',x)
    for z in punctuations:
        x=x.replace(z,' ')
    vocab.extend(x.split(' '))

vocab=set(vocab)

corpus=[]
for x in df["TweetText"]:
    x=re.sub("http://.* ",'URL ',x)
    x=re.sub("#.* ",'HASHTAG ',x)
    x=re.sub("@.* ",'USER ',x)
    for z in punctuations:
        x=x.replace(z,' ')
    corpus.append(x)

## Saving Model and Other Objects(For fututre training)

In [None]:
import pickle

outfile = open('./saved_models/XGBoost','wb')
pickle.dump({
            'model': xgb_clf,
            'count_vector': count_vector,
            'stop_words':stop,
            'corpus':corpus
            },outfile)
outfile.close()