# Text Classification & Sentiment Analysis: Twitter

### Loading Libraries

In [2]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Path
from pathlib import Path

# TextBlob
from textblob import TextBlob

# Scikit-Learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Plain white background for every seaborn figure in the notebook
sns.set_style(style='white')

# Suppress every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

### Twitter Sentiment

#### Downloading Data: Reading & Preprocessing `Train/Test` Data

In [4]:
# Directory the loader functions read the CSV files from and write their
# parquet caches to
data_path = Path('..', 'data', 'sentiment140')

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of a separate exists() test
data_path.mkdir(parents=True, exist_ok=True)

In [5]:
# Column labels applied to the header-less CSV files, in file order
names = [
    'polarity',
    'id',
    'date',
    'query',
    'user',
    'text',
]

In [6]:
def load_train_data():
    """Load the training tweets, building a parquet cache on first use.

    When ``train.parquet`` is missing under ``data_path``, ``train.csv`` from
    the same directory is parsed, the ``id`` and ``query`` columns are
    dropped, duplicate ``(polarity, text)`` rows are removed, tweets longer
    than 140 characters are discarded and ``polarity`` is recoded to 0/1
    (1 for any raw value above zero). The result is written to
    ``train.parquet`` and returned. When the parquet file already exists it
    is read and returned as-is.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``polarity``, ``date``, ``user`` and ``text``.
    """
    parquet_file = data_path / 'train.parquet'
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)

    raw = pd.read_csv(data_path / 'train.csv',
                      low_memory=False,
                      encoding='latin1',
                      header=None,
                      names=names,
                      parse_dates=['date'])
    df = (raw
          .drop(['id', 'query'], axis=1)
          .drop_duplicates(subset=['polarity', 'text'])
          # .loc + .assign build a new frame, so no column is written onto a
          # filtered slice (which would trigger SettingWithCopyWarning)
          .loc[lambda x: x.text.str.len() <= 140]
          .assign(polarity=lambda x: (x.polarity > 0).astype(int)))
    df.to_parquet(parquet_file)
    return df

In [8]:
train = load_train_data()

# `null_counts` was removed from DataFrame.info in pandas 2.0;
# `show_counts` is the replacement and prints the same non-null counts
train.info(show_counts=True)

In [9]:
def load_test_data():
    """Load the test tweets, building a parquet cache on first use.

    When ``test.parquet`` is missing under ``data_path``, ``test.csv`` from
    the same directory is parsed, the ``id`` and ``query`` columns are
    dropped, duplicate ``(polarity, text)`` rows are removed, and only tweets
    of at most 140 characters whose ``polarity`` is 0 or 4 are kept. The
    result is written to ``test.parquet`` and returned. When the parquet file
    already exists it is read and returned as-is.

    ``polarity`` is left in its raw 0/4 coding here.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``polarity``, ``date``, ``user`` and ``text``.
    """
    parquet_file = data_path / 'test.parquet'
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)

    # Resolve the CSV against data_path, the same directory the parquet cache
    # lives in, instead of a separate hard-coded relative path
    df = (pd.read_csv(data_path / 'test.csv',
                      low_memory=False,
                      encoding='latin1',
                      header=None,
                      names=names,
                      parse_dates=['date'])
          .drop(['id', 'query'], axis=1)
          .drop_duplicates(subset=['polarity', 'text']))
    df = df[(df.text.str.len() <= 140) &
            (df.polarity.isin([0, 4]))]
    df.to_parquet(parquet_file)
    return df

In [10]:
test = load_test_data()

# `null_counts` was removed from DataFrame.info in pandas 2.0;
# `show_counts` is the replacement and prints the same non-null counts
test.info(show_counts=True)

### Exploring Data

In [11]:
# Preview the first rows of the training frame
train.head()

In [12]:
# Recode polarity to 0/1 on a new frame instead of assigning onto the frame
# returned by the loader, which can be a filtered slice
# (SettingWithCopyWarning); re-running the cell gives the same result
train = train.assign(polarity=(train.polarity > 0).astype(int))

# Number of tweets per class
train.polarity.value_counts()

In [13]:
# Recode polarity to 0/1 on a new frame instead of assigning onto the frame
# returned by the loader, which can be a filtered slice
# (SettingWithCopyWarning); re-running the cell gives the same result
test = test.assign(polarity=(test.polarity > 0).astype(int))

# Number of tweets per class
test.polarity.value_counts()

In [14]:
# sns.distplot is deprecated and scheduled for removal; histplot draws the
# same count histogram (no KDE) and accepts an explicit axes
fig, ax = plt.subplots()
sns.histplot(train.text.str.len(), ax=ax)
ax.set(title='Tweet Length Distribution',
       xlabel='Characters per tweet',
       ylabel='Number of tweets')

sns.despine();
plt.show()

In [15]:
# Summary statistics of the parsed tweet timestamps
train.date.describe()

In [16]:
# Number of distinct users in the training set
train.user.nunique()

In [17]:
# Ten users with the most tweets in the training set
train.user.value_counts().head(10)

### Creating Text Vectorizer

In [19]:
# Token-count features: ignore terms that appear in fewer than 0.1% or more
# than 80% of the training tweets, and drop the built-in English stop words
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=.8,
    min_df=.001,
)

# Learn the vocabulary and build the training document-term matrix in one step
train_dtm = vectorizer.fit_transform(train['text'])

In [20]:
# Display the matrix repr of the training document-term matrix
train_dtm

In [21]:
# Encode the test tweets with the vocabulary fitted on the training set
test_dtm = vectorizer.transform(test['text'])

### Training Naïve Bayes Classifier

In [22]:
# Placing Model
nb = MultinomialNB()

# Fitting Model
nb.fit(train_dtm, train.polarity)

#### Predicting Test Polarity

In [23]:
# Predicted class label for every test tweet
predicted_polarity = nb.predict(test_dtm)

#### Evaluating Results

In [24]:
# Share of test tweets whose predicted label matches the true polarity
accuracy_score(test.polarity, predicted_polarity)

### TextBlob for Sentiment Analysis

In [25]:
# Pick one training tweet by its index label and show its text
sample_positive = train.loc[256332, 'text']
print(sample_positive)

# TextBlob polarity score for that tweet
parsed_positive = TextBlob(sample_positive)
parsed_positive.sentiment.polarity

In [26]:
# Pick another training tweet by its index label and show its text
sample_negative = train.loc[636079, 'text']
print(sample_negative)

# TextBlob polarity score for that tweet
parsed_negative = TextBlob(sample_negative)
parsed_negative.sentiment.polarity

In [27]:
def estimate_polarity(text):
    """Return the TextBlob polarity score for ``text``.

    Parameters
    ----------
    text : str
        Raw text to score.

    Returns
    -------
    float
        The ``polarity`` value TextBlob reports for ``text``.
    """
    blob = TextBlob(text)
    return blob.polarity

In [28]:
# Score ten random training tweets with TextBlob and list them from most
# negative to most positive; the fixed random_state makes the same ten tweets
# appear on every run
(train[['text']]
 .sample(10, random_state=42)
 .assign(sentiment=lambda x: x.text.apply(estimate_polarity))
 .sort_values('sentiment'))

### Comparing with TextBlob Polarity Score

In [29]:
# TextBlob polarity score for every test tweet, stored as a new column
test['sentiment'] = test['text'].apply(estimate_polarity)

In [30]:
# Accuracy when a strictly positive TextBlob score counts as class 1
accuracy_score(
    test['polarity'],
    (test['sentiment'] > 0).astype(int),
)

#### ROC AUC Scores

In [31]:
# ROC AUC of the raw TextBlob scores against the true polarity
roc_auc_score(
    y_true=test['polarity'],
    y_score=test['sentiment'],
)

In [32]:
# ROC AUC of Naive Bayes, scored with the second predict_proba column
roc_auc_score(
    y_true=test['polarity'],
    y_score=nb.predict_proba(test_dtm)[:, 1],
)

In [33]:
# TextBlob ROC curve, kept as true-positive rate indexed by false-positive rate
fpr_tb, tpr_tb, _ = roc_curve(
    y_true=test['polarity'],
    y_score=test['sentiment'],
)
roc_tb = pd.Series(data=tpr_tb, index=fpr_tb)

# Naive Bayes ROC curve in the same layout, scored with the second
# predict_proba column
fpr_nb, tpr_nb, _ = roc_curve(
    y_true=test['polarity'],
    y_score=nb.predict_proba(test_dtm)[:, 1],
)
roc_nb = pd.Series(data=tpr_nb, index=fpr_nb)

In [34]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 6))

# Left panel: TextBlob score distribution for each true polarity class
sns.boxplot(x='polarity', y='sentiment', data=test, ax=axes[0])
axes[0].set_title('TextBlob Sentiment Scores')

# Right panel: both ROC curves on the same axes
roc_nb.plot(ax=axes[1], label='Naive Bayes', legend=True, lw=1, title='ROC Curves')
roc_tb.plot(ax=axes[1], label='TextBlob', legend=True, lw=1)
# Each series is the true-positive rate indexed by the false-positive rate;
# label the axes so the panel can be read on its own
axes[1].set_xlabel('False Positive Rate')
axes[1].set_ylabel('True Positive Rate')

sns.despine()
fig.tight_layout();
plt.show()