# Sentiment Analysis with TextBlob

# Imports

In [None]:
import numpy as np
import pandas as pd
import keras
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from sklearn import metrics
import time



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Read files
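Execute only one of the following dataset sections. Each one assigns the DataFrame `data` that the rest of the notebook uses, so running more than one simply keeps the last one loaded.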

## Apple Sentiment

In [None]:
# Apple tweets: read the CSV as-is into `data`.
# NOTE(review): later cells read `data.text` and `data.sentiment`; this cell
# assumes the CSV already provides both columns -- TODO confirm.
apple_csv_path = "data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv"
data = pd.read_csv(apple_csv_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## US Airline Sentiment

In [None]:
# US Airline tweets: load the CSV and keep only the tweet text and its label.
data = pd.read_csv("data/sentiment/Tweets.csv")

# rename() needs `columns=`: with a bare mapping it relabels the *index*, so the
# column stayed 'airline_sentiment' and the `sentiment` access below raised
# AttributeError. Chaining (instead of inplace=True on a slice) also yields a
# fresh frame, so the assignment below no longer triggers
# SettingWithCopyWarning.
data = data[['text', 'airline_sentiment']].rename(
    columns={'airline_sentiment': 'sentiment'}
)

# Map the string labels onto the -1 / 0 / 1 encoding used by the evaluation.
thisdict = {
    "negative": -1,
    "neutral": 0,
    "positive": 1,
}
# Dict lookup (rather than Series.map) so an unexpected label raises KeyError
# instead of silently becoming NaN.
data['sentiment'] = data['sentiment'].apply(lambda x: thisdict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## T4SA

In [None]:
# T4SA: tweet texts and their NEG / NEU / POS columns come from two files.
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
# Path fixed: the original read ".data/sentiment/..." (stray leading dot),
# unlike every other dataset path in this notebook.
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv", delimiter="\t")

# Index both frames by tweet id so join() matches rows on the index.
tweets = tweets.set_index(tweets.id)
sentiments = sentiments.set_index(sentiments.TWID)

# Rows with any missing value (including tweets without a sentiment row) are
# dropped; the id columns are redundant once they are the index.
data = tweets.join(sentiments).dropna().drop(columns=['id', 'TWID'])

# Label each tweet with the name of its largest NEU / NEG / POS column.
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

# Map the column names onto the -1 / 0 / 1 encoding used by the evaluation.
thisdict = {
    "NEG": -1,
    "NEU": 0,
    "POS": 1,
}

# .copy() detaches the two-column selection from the joined frame so the
# assignment below does not trigger SettingWithCopyWarning.
data = data[['text', 'sentiment']].copy()
data['sentiment'] = data['sentiment'].apply(lambda x: thisdict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


# General Text Cleaning

In [None]:
# Keep a fixed 33% split of `data` for evaluation; the other part is discarded.
_, df_test = train_test_split(data, test_size=0.33, random_state=42)
# Work on an independent copy so the column assignment below does not trigger
# SettingWithCopyWarning.
df_test = df_test.copy()

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
punctuation_table = str.maketrans('', '', string.punctuation)

# Cleaning pipeline, applied in this order:
#   1. lower-case
#   2. drop URLs (anything starting with "http" up to the next whitespace)
#   3. tweet-tokenize (strip_handles / reduce_len enabled) and re-join tokens
#      with single spaces
#   4. delete ASCII punctuation
#   5. replace every digit with a space
#   6. trim leading / trailing whitespace
df_test["text"] = (
    df_test["text"]
    .str.lower()
    .apply(lambda x: re.sub(r'http\S+', '', x))
    .apply(lambda x: ' '.join(tokenizer.tokenize(x)))
    .map(lambda x: x.translate(punctuation_table))
    # regex=True is required: since pandas 2.0 the default is a literal match,
    # which would look for the text "[0-9]" and leave all digits in place.
    .str.replace("[0-9]", " ", regex=True)
    .str.strip(string.whitespace)
)

# Renumber rows 0..n-1; the shuffled original index is dropped.
df_test = df_test.reset_index(drop=True)


In [None]:
# Preview the first five cleaned rows.
df_test.head(n=5)

Unnamed: 0,text,sentiment
0,fundraiser at the greene turtle football famil...,0
1,fantastic shot of two of our top talents in ac...,1
2,all eyes are on elclasico but heres all the fo...,0
3,happy th birthday grandpa missing you more...,1
4,china july industrial profits rise buoyed by ...,0


# Calculation

In [None]:
# Insert column with newly calculated sentiments
#
# Re-run safety: the original passed allow_duplicates=True, so running this
# cell twice added a second "blobpolarity" column; `df_test.blobpolarity` then
# returns a DataFrame and the evaluation cell breaks. Drop any previous result
# first so the cell is idempotent.
if "blobpolarity" in df_test.columns:
    df_test = df_test.drop(columns="blobpolarity")

start = time.time()
# TextBlob's polarity is a float (documented range [-1.0, 1.0] -- confirm for
# the installed version). Rounding to the nearest integer turns it into the
# -1 / 0 / 1 labels; Python rounds halves to even, so |polarity| <= 0.5 -> 0.
blob_labels = df_test["text"].map(
    lambda x: int(round(TextBlob(x).sentiment.polarity))
)
df_test.insert(2, "blobpolarity", blob_labels)
# Elapsed wall-clock time, in minutes.
print("Time elapsed: ", (time.time() - start) / 60)


Time elapsed:  3.111138407389323


In [None]:
# Preview the first five rows, now including the "blobpolarity" column.
df_test.head(n=5)

0
0
Sentiment(polarity=0.10833333333333334, subjectivity=0.3962962962962963)


Unnamed: 0,Polarity,tweet,blobpolarity
0,0,awww thats a bummer you shoulda got david car...,0
1,0,is upset that he cant update his facebook by t...,0
2,0,i dived many times for the ball managed to sav...,0
3,0,my whole body feels itchy and like its on fire,0
4,0,no its not behaving at all im mad why am i her...,-1


In [None]:
# Distinct gold labels present in the evaluation split.
gold_labels = df_test.sentiment
gold_labels.unique()

array([ 0,  1, -1])

## Evaluation

In [None]:
# Compare TextBlob's rounded polarity with the gold labels.
y_true = df_test.sentiment
y_pred = df_test.blobpolarity

# average=None yields one F1 score per class instead of a single aggregate.
per_class_f1 = metrics.f1_score(y_true, y_pred, average=None)
print(per_class_f1)

overall_accuracy = metrics.accuracy_score(y_true, y_pred)
print(overall_accuracy)

[0.1846541  0.75758194 0.54274858]
0.661133168629586


In [None]:
# Number of evaluation rows per gold label.
label_counts = df_test.sentiment.value_counts()
label_counts

 0    273
-1    222
 1     43
Name: sentiment, dtype: int64