In [None]:
import csv
from datetime import datetime, timedelta
from calendar import monthrange
from os.path import exists
import pandas as pd
import numpy as np
import re
import logging

import logging
logging.basicConfig(level=logging.DEBUG)

In [None]:
def generate_baseline_dates(year, seed):
    """Draw one random calendar day per month of ``year``.

    Args:
        year: Calendar year the dates are drawn from.
        seed: Seed for ``np.random.default_rng``; the same seed yields the
            same twelve dates.

    Returns:
        A list of twelve ``'YYYY-MM-DD'`` strings, January first.
    """
    rng = np.random.default_rng(seed)

    dates = []
    for month in range(1, 13):
        last_day = monthrange(year, month)[1]
        # endpoint=True makes the last day of the month a possible draw.
        day = rng.integers(1, last_day, size=1, endpoint=True)[0]
        dates.append(f'{year}-{month:02}-{day:02}')

    return dates

# Baseline (non-holiday) comparison dates: one day per month of 2018.
# Generated with generate_baseline_dates(2018, 12345)
# NOTE(review): presumably kept as literals so the set of scraped dates stays
# fixed regardless of the NumPy version in use - confirm before regenerating.
baseline_dates = [
    '2018-01-22',
    '2018-02-07',
    '2018-03-25',
    '2018-04-10',
    '2018-05-07',
    '2018-06-24',
    '2018-07-20',
    '2018-08-21',
    '2018-09-30',
    '2018-10-13',
    '2018-11-26',
    '2018-12-11'
]

In [None]:
# Maps each keyword label (used in the data file names) to the phrase that is
# searched for and matched: 'family' -> 'my family'.
keywords = {keyword: f'my {keyword}' for keyword in ['family']}

# Dates per holiday; 'Baseline' holds the non-holiday comparison dates.
holidays = {
    "Thanksgiving": ['2018-11-22', '2018-11-23'],
    "Christmas": ['2018-12-24', '2018-12-25', '2018-12-26'],
    "New Year's": ['2018-12-31', '2019-01-01'],
    "Baseline": baseline_dates,
}

# Scraping

In [None]:
!pip install snscrape

In [None]:
from snscrape.modules.twitter import TwitterSearchScraper

In [None]:
def format(tweet):
    """Flatten a tweet into the four fields stored per TSV row.

    Note that this name shadows the built-in ``format`` inside the notebook.

    Args:
        tweet: Object exposing ``id``, ``date`` (with ``strftime``),
            ``user.id`` and ``content``.

    Returns:
        ``[tweet id, 'YYYY-MM-DD HH:MM' timestamp, user id, text]`` where real
        newlines in the text are replaced by the two characters ``\\n`` so the
        tweet stays on one line.
    """
    timestamp = tweet.date.strftime('%Y-%m-%d %H:%M')
    single_line_text = tweet.content.replace('\n', '\\n')
    return [tweet.id, timestamp, tweet.user.id, single_line_text]

In [None]:
def scrape(path, keywords, date_start, date_end, limit = -1):
    """Scrape English tweets matching any of ``keywords`` into a TSV file.

    Rows are produced by ``format(tweet)`` and written tab-separated. The
    output is first written to ``<path>.part`` and only moved to ``path`` once
    the scrape finished, so an interrupted run never leaves a truncated file
    at ``path`` that a caller checking for existing files would take for a
    complete one.

    Args:
        path: Destination TSV file.
        keywords: Iterable of search phrases, combined with ``OR``.
        date_start: Date/datetime used for the ``since:`` operator.
        date_end: Date/datetime used for the ``until:`` operator.
        limit: Maximum number of tweets to write; ``-1`` means no limit.
    """
    import os  # local import: the notebook only imports ``exists`` from os.path

    query = '({terms}) lang:en until:{until} since:{since}'.format(
        terms=' OR '.join(keywords),
        until=date_end.strftime('%Y-%m-%d'),
        since=date_start.strftime('%Y-%m-%d'),
    )

    partial_path = f'{path}.part'

    with open(partial_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        for count, tweet in enumerate(TwitterSearchScraper(query).get_items(), start=1):
            if limit != -1 and count > limit:
                break

            writer.writerow(format(tweet))

            # Progress is rewritten in place every 100 tweets.
            if count % 100 == 0:
                print('\r', '{0} tweets scraped'.format(count), end='')

        print('')

    # Publish the finished file under its final name in one step.
    os.replace(partial_path, path)

In [None]:
# Scrape one TSV file per (keyword label, date); dates whose file already
# exists are skipped so the cell can be re-run to resume.
for label, keyword in keywords.items():
    for dates in holidays.values():
        for date in dates:
            # Use a dedicated name here: assigning to `datetime` would replace
            # the imported class with an instance for the rest of the session.
            day_start = datetime.strptime(date, '%Y-%m-%d')

            file = f'data/tweets/{label}.tweets.{date}.tsv'
            if exists(file):
                continue

            print('Starting: {0}'.format(date))

            # Search window is [day_start, day_start + 1 day).
            scrape(
                file,
                [keyword],
                day_start,
                day_start + timedelta(1)
            )

            print('Done')

# Filtering and inspection

In [None]:
def load(include_labels=False):
    """Read every scraped tweet file into one DataFrame.

    Iterates over all keyword labels in ``keywords`` and all dates in
    ``holidays`` and reads ``data/tweets/<key>.tweets.<date>.tsv``.

    Args:
        include_labels: When true, also read
            ``data/tweets/<key>.labels.<date>.tsv`` and join its ``label``
            column onto the tweets by id.

    Returns:
        A DataFrame indexed by tweet ``id`` with columns ``date_time``,
        ``user_id``, ``text`` (newlines restored), ``date``, ``holiday`` and,
        if requested, ``label``.
    """
    def read_indexed(path, columns):
        # All data files are header-less TSVs keyed by tweet id.
        return pd.read_csv(path, names=columns, sep='\t').set_index('id')

    frames = []

    for key in keywords:
        for holiday, dates in holidays.items():
            for date in dates:
                day = read_indexed(
                    f'data/tweets/{key}.tweets.{date}.tsv',
                    ['id', 'date_time', 'user_id', 'text'],
                )
                # Undo the newline escaping applied when the file was written.
                day['text'] = day['text'].apply(lambda text: text.replace('\\n', '\n'))
                # 'YYYY-MM-DD HH:MM' -> 'YYYY-MM-DD'
                day['date'] = day['date_time'].apply(lambda stamp: stamp.split(' ')[0])
                day['holiday'] = holiday

                if include_labels:
                    predicted = read_indexed(
                        f'data/tweets/{key}.labels.{date}.tsv',
                        ['id', 'label'],
                    )
                    day = day.join(predicted)

                frames.append(day)

    return pd.concat(frames)

In [None]:
def filter(data, day_threshold=2, duplicate_n_day_threshold=2):
    """Drop tweets that should not count towards the analysis.

    Note that this name shadows the built-in ``filter`` inside the notebook.

    The steps run in this order, each logging how many rows it removed:

    1. tweets whose text does not contain any phrase from ``keywords``;
    2. tweets where the phrase only occurs inside double quotes;
    3. repeated identical texts by the same user on the same date
       (the earliest is kept);
    4. every tweet of a user who posted one identical text on more than
       ``duplicate_n_day_threshold`` dates;
    5. everything beyond a user's first ``day_threshold`` tweets per date.

    Args:
        data: DataFrame with at least ``text``, ``date``, ``date_time`` and
            ``user_id`` columns (as returned by ``load``).
        day_threshold: Maximum tweets kept per user and date.
        duplicate_n_day_threshold: Maximum number of dates one user may post
            the same text on before all of that user's tweets are dropped.

    Returns:
        The filtered DataFrame.
    """
    # One alternative per search phrase. The words of a phrase may be separated
    # by any run of non-letters and each word may carry a leading '#'. Words
    # are escaped so regex metacharacters in a keyword are matched literally.
    keyword_pattern = '(?:' + '|'.join(
        '[^a-z]*'.join(f'#?{re.escape(word)}' for word in phrase.split(' '))
        for phrase in keywords.values()
    ) + ')'
    quotes = '"“”'

    # Filter out tweets not containing any keywords
    remove = ~data['text'].str.contains(keyword_pattern, case=False)
    data = data[~remove]
    logging.info(f'Filtering {remove.sum()} tweets not containing keywords')

    # Filter out tweets where the keywords only occur within quotes,
    # i.e. keep a tweet only if some keyword occurrence is preceded by an
    # even (or zero) number of quote characters
    remove = ~data['text'].str.contains(f'^(?:(?:[^{quotes}]*[{quotes}]){{2}})*[^{quotes}]*{keyword_pattern}', case=False)
    data = data[~remove]
    logging.info(f'Filtering {remove.sum()} tweets with keywords within quotes')

    # Filter out tweets that are duplicate on a day for a user
    before = data.shape[0]
    data = data.sort_values('date_time').groupby(['date', 'user_id', 'text']).head(1)
    after = data.shape[0]
    logging.info(f'Filtering {before - after} tweets duplicate for a day')

    # Filter out all tweets of users who posted the same text on more than
    # duplicate_n_day_threshold dates (same-day repeats were removed above,
    # so the group size equals the number of dates)
    repeat_posters = set(
        data
            .groupby(['user_id', 'text'])
            .filter(lambda group: group['date'].count() > duplicate_n_day_threshold)['user_id']
    )
    remove = data['user_id'].isin(repeat_posters)
    data = data[~remove]
    logging.info(f'Filtering {remove.sum()} tweets from a user posting duplicated tweets over the duplicate_n_day_threshold')

    # Filter out tweets that are not among a user's first day_threshold tweets of a day
    before = data.shape[0]
    data = data.sort_values('date_time').groupby(['date', 'user_id']).head(day_threshold)
    after = data.shape[0]
    logging.info(f'Filtering {before - after} tweets over the day_threshold')

    return data

In [None]:
# Export a reproducible random sample of the unfiltered tweets for manual
# inspection.
# NOTE(review): 1000 rows are sampled but the file name says "100" - confirm
# which of the two is intended.
data = load()
(
    data
    .sample(n=1000, random_state=12345)
    .to_csv('data/sample.inspection.100.tsv', sep='\t', header=False, index=False)
)

# Classification

In [None]:
%load_ext autoreload
%autoreload 1
%aimport classify_sentiment
from classify_sentiment import SentimentClassification

In [None]:
# Create the sentiment classifier wrapper from the local classify_sentiment
# module and load a previously saved pipeline into it.
classification = SentimentClassification(
    temp_dir='data/temp',
)
# NOTE(review): the file name suggests load_pipeline unpickles this file;
# if so, only load model files from a trusted source - confirm.
classification.load_pipeline('models/classifier.pickle')

In [None]:
# Tweets to classify: everything on disk, run through the notebook's own
# filter() (not the built-in). Rebinds `data`, which held the unfiltered frame.
data = filter(load())

In [None]:
# Predict a sentiment label for the filtered tweets of every date and store
# them as data/tweets/<key>.labels.<date>.tsv; dates that already have a label
# file are left untouched.
# NOTE(review): `data` has no keyword column, so with more than one keyword a
# date's chunk would contain the tweets of all keywords - confirm if extended.
for key in keywords:
    for dates in holidays.values():
        for date in dates:
            labels_path = f'data/tweets/{key}.labels.{date}.tsv'
            if not exists(labels_path):
                on_date = data['date'] == date
                chunk = data.loc[on_date].reset_index()[['id', 'text']].copy()

                logging.info(f'Classifying: {key}/{date} ({chunk.shape[0]} tweets)')

                chunk['label'] = classification.predict(chunk['text'])
                chunk[['id', 'label']].to_csv(labels_path, sep='\t', header=False, index=False)

# Presentation

In [None]:
!pip install matplotlib
!pip install seaborn
!pip install tikzplotlib

In [None]:
# Reload the tweets with their predicted labels joined in, then apply the
# notebook's filter() again. Rebinds `data`.
data = filter(load(include_labels=True))

In [None]:
# Numeric class id written by the classifier -> sentiment name.
label_map = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}

# One record per (keyword, date) with the number of tweets per sentiment.
# Records are collected in a list and turned into a frame once, instead of
# growing a DataFrame with concat inside the loop.
records = []

for key in keywords:
    for holiday, dates in holidays.items():
        for date in dates:
            labels_path = f'data/tweets/{key}.labels.{date}.tsv'
            labels = pd.read_csv(labels_path, names=['id', 'label'], sep='\t')['label']

            # reindex guarantees an entry (0) for a class that does not occur
            # on this date, where a plain lookup would raise KeyError.
            counts = labels.value_counts().reindex(list(label_map), fill_value=0)

            records.append({
                'date': date,
                'holiday': holiday,
                **{f'{label}_count': int(counts.loc[k]) for k, label in label_map.items()}
            })

results = pd.DataFrame(records).set_index('date')

# Per-date results. This is an alias of `results`, not a copy: the columns
# added below appear on `results` as well.
results_dates = results
results_dates['n_tweets'] = sum([results_dates[f'{label}_count'] for label in label_map.values()])
for label in label_map.values():
    # Share of each sentiment among the date's tweets, in percent.
    results_dates[label] = (results_dates[f'{label}_count'] / results_dates['n_tweets'] * 100).round(2)

# Per-holiday results: summed counts, min/max of the per-date percentages and
# sum/min/max/mean of the per-date tweet totals. sort=False keeps the holidays
# in their order of first appearance.
results_holidays = results_dates
results_holidays_groups = results_holidays.reset_index().groupby(['holiday'], sort=False)
results_holidays = results_holidays_groups.agg({
    **{f'{label}_count': 'sum' for label in label_map.values()},
    **{f'{label}': ['min', 'max'] for label in label_map.values()},
    'n_tweets': ['sum', 'min', 'max', 'mean']
})
# Flatten the two-level (column, aggregation) header into '<column>_<aggregation>'.
results_holidays.columns = results_holidays.columns.get_level_values(0) + '_' + results_holidays.columns.get_level_values(1)
results_holidays = results_holidays.rename(columns={
    **{f'{label}_count_sum': f'{label}_count' for label in label_map.values()},
    'n_tweets_sum': 'n_tweets',
    'n_tweets_mean': 'n_tweets_avg',
})

for label in label_map.values():
    # Percentage over all tweets of the holiday (not the mean of per-date percentages).
    results_holidays[label] = (results_holidays[f'{label}_count'] / results_holidays['n_tweets'] * 100).round(2)

results_holidays['n_dates'] = results_holidays_groups.size()

results_holidays

In [None]:
import seaborn as sns
import matplotlib as mpl
# Select the pgf backend; this is done before pyplot is imported below.
mpl.use('pgf')
import matplotlib.pyplot as plt
# The root logger is configured at DEBUG level at the top of the notebook;
# raise matplotlib's own logger to WARNING so it does not log below that.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

## Figure

In [None]:
# Two horizontal bar charts sharing the y axis (one row per holiday):
# left the sentiment percentages, right the tweets per day.
sns.set()
fig, ax = plt.subplots(ncols=2, sharey=True)

# One y position per row of results_holidays; the sentiment bars of a holiday
# share a band of 0.75 in total.
x = np.arange(results_holidays.shape[0])
width = 0.75/len(label_map)

# Sentiment distribution
for i, (key, label) in enumerate(reversed(label_map.items())):
    ax[0].barh(
        # Offset each sentiment's bar so the group is centred on x.
        x + (i - (len(label_map) -1) / 2) * width,
        results_holidays[label],
        height=width,
        label=label,
        # Error bars reach from the holiday-level percentage down to the
        # lowest and up to the highest per-date percentage.
        xerr=np.stack((
            results_holidays[label] - results_holidays[f'{label}_min'], 
            results_holidays[f'{label}_max'] - results_holidays[label]
        )),
        capsize=2
    )

ax[0].set_title('Sentiment distribution')
# '%' is escaped because the pgf backend selected earlier hands text to LaTeX.
ax[0].set_xlabel('\\% of tweets')
ax[0].set_ylabel('')
# NOTE(review): the tick labels come from `holidays`, the bar order from
# results_holidays; this relies on both being in the same order - confirm.
ax[0].yaxis.set_ticks(x, holidays.keys())
ax[0].tick_params(axis='both', which='both', length=0)
ax[0].legend()
# Hide the first major x tick.
# NOTE(review): presumably to avoid a clash where the two panels touch - confirm.
ax[0].xaxis.get_major_ticks()[0].set_visible(False)
# Reversed limits: the x axis runs from 100 on the left to 0 on the right.
ax[0].set_xlim(100, 0)
# Flip the y axis so the first holiday is at the top.
ax[0].invert_yaxis()

# Total number
ax[1].barh(
    x,
    results_holidays['n_tweets_avg'],
    height=.4, color='r',
    # Error bars reach from the average down to the smallest and up to the
    # largest per-date tweet count.
    xerr=np.stack((
        results_holidays['n_tweets_avg'] - results_holidays[f'n_tweets_min'], 
        results_holidays[f'n_tweets_max'] - results_holidays['n_tweets_avg']
    )),
    capsize=2
)

ax[1].set_title('Tweets per day')
ax[1].set_xlabel('\\# of tweets')
ax[1].set_ylabel('')
ax[1].tick_params(axis='both', which='both', length=0)

fig.tight_layout()
# Remove the horizontal gap between the two panels.
plt.subplots_adjust(wspace=0)
plt.savefig('data/results.pgf', format='pgf')
# plt.show()

## Results aggregated per holiday

In [None]:
# LaTeX table with one row per holiday: for every sentiment the overall
# percentage plus the per-date minimum and maximum, followed by the total,
# average, minimum and maximum number of tweets.
lines = [
    '\\begin{tabular}{r|ccc|ccc|ccc|cccc}',
    '\\toprule',
    # Group header: three columns per sentiment, four for the tweet counts.
    ' & '.join(
        ['']
        + [f'\\multicolumn{{3}}{{|c}}{{\\% {label}}}' for label in label_map.values()]
        + ['\\multicolumn{4}{|c}{\\# tweets}']
    ) + '\\\\',
    # Column header.
    ' & '.join(
        ['Holiday']
        + ['of total', 'min.', 'max.'] * 3
        + ['total', 'avg.', 'min.', 'max.']
    ) + '\\\\',
    '\\midrule',
]

for holiday in holidays.keys():
    cells = [holiday]
    cells += [
        '{0:.2f}'.format(results_holidays.loc[holiday][f'{label}{suffix}'])
        for label in label_map.values()
        for suffix in ['', '_min', '_max']
    ]
    cells += [
        '{0:.0f}'.format(results_holidays.loc[holiday][f'n_tweets{suffix}'])
        for suffix in ['', '_avg', '_min', '_max']
    ]
    lines.append('\t' + ' & '.join(cells) + '\\\\')

lines += ['\\bottomrule', '\\end{tabular}']

table = '\n'.join(lines)

print(table)

## Results aggregated per holiday (vertical)

In [None]:
# Transposed LaTeX table: one column per holiday, one row per statistic,
# grouped under a heading per sentiment plus one for the tweet counts.
lines = []

lines.append(f'\\begin{{tabular}}{{r|{"c" * len(holidays)}}}')
lines.append('\\toprule')

# Column header: holiday names rotated by 45 degrees.
lines.append(' & '.join([
    '',
    *[
        f'\\makebox[1em][l]{{\\rotatebox{{45}}{{{holiday}}}}}'
        for holiday in holidays.keys()
    ],
]) + '\\\\')

lines.append('\\midrule')

# group heading -> {row label: (results_holidays column, value formatter)}
grouped_rows = {
    **{
        label.capitalize(): {
            '\\% of total': (label, '{0:.2f}'.format),
            'Min. \\%': (f'{label}_min', '{0:.2f}'.format),
            'Max. \\%': (f'{label}_max', '{0:.2f}'.format),
        }
        for label in ['positive', 'neutral', 'negative']
    },
    '\\# tweets': {
        'Total': ('n_tweets', '{0:,.0f}'.format),
        'Avg.': ('n_tweets_avg', '{0:,.0f}'.format),
        'Min.': ('n_tweets_min', '{0:,.0f}'.format),
        'Max.': ('n_tweets_max', '{0:,.0f}'.format),
    }
}

for i, (header, rows) in enumerate(grouped_rows.items()):
    if i != 0:
        lines.append('\\midrule')

    lines.append(f'\\multicolumn{{{len(holidays) + 1}}}{{c}}{{{header}}}\\\\')
    lines.append('\\midrule')

    # The formatter must not be called `format`: a loop variable of that name
    # at cell level would overwrite the notebook's format(tweet) function that
    # scrape() relies on.
    for row_label, (column, formatter) in rows.items():
        lines.append('\t' + ' & '.join([
            row_label,
            *[
                formatter(results_holidays.loc[holiday][column])
                for holiday in holidays.keys()
            ]
        ]) + '\\\\')

lines.append('\\bottomrule')
lines.append('\\end{tabular}')

table = '\n'.join(lines)

print(table)

## Results expanded per date

In [None]:
# LaTeX table with one row per date, grouped under a heading per holiday:
# the three sentiment percentages and the number of tweets.
lines = [
    '\\begin{tabular}{r|cccc}',
    '\\toprule',
    ' & '.join(['Date', '\\% neg.', '\\% neu.', '\\% pos.', '\\# tweets']) + '\\\\',
    '\\midrule',
]

for i, (holiday, dates) in enumerate(holidays.items()):
    # Separate consecutive holiday sections with a rule.
    if i > 0:
        lines.append('\\midrule')

    lines += [f'\\multicolumn{{5}}{{c}}{{{holiday}}}\\\\', '\\midrule']

    for date in dates:
        cells = [date]
        cells += [
            '{0:.2f}'.format(results_dates.loc[date][sentiment])
            for sentiment in ['negative', 'neutral', 'positive']
        ]
        cells.append('{}'.format(results_dates.loc[date]['n_tweets']))
        lines.append('\t' + ' & '.join(cells) + '\\\\')

lines += ['\\bottomrule', '\\end{tabular}']

table = '\n'.join(lines)

print(table)

# Validation

In [None]:
from classify_sentiment import evaluate

## Sample validation tweets

In [None]:
# Export a reproducible sample of 100 labeled, filtered tweets for manual
# validation. The '###' placeholder marks where the manual label is filled in;
# the tweet id is written as the first column (the frame's index).
data = filter(load(include_labels=True)).assign(manual_label='###')
(
    data
    .sample(n=100, random_state=123456)
    .loc[:, ['label', 'manual_label', 'text']]
    .to_csv('data/sample.validation.100.tsv', sep='\t', header=False)
)

## Load and evaluate manually labeled validation tweets

In [None]:
# Read the manually labeled validation sample and compare the manual labels
# with the classifier's predictions. Rebinds `results`.
results = pd.read_csv(
    'data/sample.validation.labeled.100.tsv',
    sep='\t',
    names=['id', 'predicted', 'manual', 'text'],
    dtype={'predicted': np.int32, 'manual': np.int32},
)[['predicted', 'manual']]
print(evaluate(list(results['manual']), list(results['predicted'])))

## Confusion matrix

In [None]:
# LaTeX confusion matrix: rows are the classifier's labels, columns the manual
# labels, both listed in reverse label_map order.
lines = [
    '\\begin{tabular}{l|ccc}',
    '\\toprule',
    ' & '.join(['', '\\multicolumn{3}{c}{Manual}']) + '\\\\',
    ' & '.join(
        ['Classifier']
        + [label_manual.capitalize() for label_manual in reversed(label_map.values())]
    ) + '\\\\',
    '\\midrule',
]

for key_classifier, label_classifier in reversed(label_map.items()):
    # Number of validation tweets with this predicted label, per manual label.
    cells = [
        str(((results['predicted'] == key_classifier) & (results['manual'] == key_manual)).sum())
        for key_manual in reversed(label_map.keys())
    ]
    lines.append('\t' + ' & '.join([label_classifier.capitalize(), *cells]) + '\\\\')

lines += ['\\bottomrule', '\\end{tabular}']

table = '\n'.join(lines)

print(table)