# Sentiment140 preprocessing and feature extraction

In [8]:
import dask
import dask.dataframe
import dask.diagnostics
import dask.multiprocessing
import langid
import os
import pandas as pd
from urllib.request import urlretrieve
import spacy
import ssl
import sys
from zipfile import ZipFile
from tqdm import tqdm

# Working directory for everything this notebook downloads or writes.
LOCAL_PATH = '../data/sentiment140'
# Create it up front; exist_ok makes re-running the cell harmless.
os.makedirs(name=LOCAL_PATH, exist_ok=True)

# Location where the downloaded zip archive is stored.
PATH_TO_ARCHIVE = os.path.join(LOCAL_PATH, 'archive.zip')

## Download archive

In [2]:
# Download the archive to PATH_TO_ARCHIVE, showing progress in bytes.
with tqdm(unit='B', unit_scale=True) as progress:
    def report(count, blockSize, totalSize):
        """urlretrieve reporthook: move the bar to the number of bytes received.

        count     -- number of blocks transferred so far (0 on the first call)
        blockSize -- block size in bytes
        totalSize -- size of the file in bytes, or a non-positive value when
                     the server does not report it
        """
        received = count * blockSize
        if totalSize > 0:
            progress.total = totalSize
            # The last block is usually short; never report more than the file size.
            received = min(received, totalSize)
        # Advance by the delta so the initial count == 0 call adds nothing.
        progress.update(received - progress.n)

    # NOTE(review): certificate verification is switched off for this download
    # (kept from the original cell). That is insecure; drop the override if the
    # environment has a working CA bundle. The previous context factory is
    # restored afterwards so the rest of the process keeps verifying HTTPS.
    previous_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        urlretrieve('http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip',
                    PATH_TO_ARCHIVE, reporthook=report)
    finally:
        ssl._create_default_https_context = previous_https_context

81379328it [00:38, 2089823.28it/s]                              


## Extract and tidy data

In [3]:
# Load the training set straight from the archive without extracting it to disk.
with ZipFile(PATH_TO_ARCHIVE) as zip_file:
    # Close the member stream deterministically rather than leaving it to the GC.
    with zip_file.open('training.1600000.processed.noemoticon.csv') as csv_file:
        df = pd.read_csv(
            csv_file,
            encoding='latin1',
            # Column names are supplied here, so pandas treats every row as data.
            names=['target', 'id', 'date', 'flag', 'user', 'tweet'],
            # Only these columns are used below; skip parsing the rest.
            usecols=['target', 'user', 'tweet'],
        )

# Translate the numeric target into a readable label.
# Any target value missing from this mapping becomes NaN.
df['sentiment'] = df['target'].map({0: 'negative',
                                    2: 'neutral',
                                    4: 'positive'})

# Keep only the columns the rest of the notebook works with.
df = df[['sentiment', 'user', 'tweet']]

## Filter non-English tweets

In [4]:
# Detect the language of every tweet in parallel and keep only English ones.
dd = dask.dataframe.from_pandas(df, npartitions=100)
with dask.diagnostics.ProgressBar():
    # Map over the tweet column element-wise instead of applying a function to
    # every row; the resulting labels and index are the same.
    # `scheduler='processes'` replaces the removed `get=dask.multiprocessing.get`
    # keyword and selects the same multiprocessing scheduler.
    df['langid'] = dd['tweet'].map_partitions(
        lambda tweets: tweets.map(lambda text: langid.classify(text)[0])) \
        .compute(scheduler='processes')

# The original called `.reindex()` with no arguments, which does not renumber
# rows. An explicit copy keeps the index unchanged and gives an independent
# frame, so the next cell can add a column without a SettingWithCopyWarning.
df = df[df['langid'] == 'en'].copy()

[########################################] | 100% Completed | 15min 41.5s


## Process with spaCy

In [5]:
# Name of the spaCy pipeline package used for tokenisation and tagging.
SPACY_MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

In [10]:
# Stream the tweets through spaCy with the parser and NER components disabled,
# and store each tweet as a list of (lemma, part-of-speech) pairs.
docs = nlp.pipe(df['tweet'], disable=['parser', 'ner'])
df['tokens'] = [
    [(token.lemma_, token.pos_) for token in doc]
    for doc in tqdm(docs, total=len(df))
]

In [20]:
# Write the filtered, tokenised tweets into the notebook's data directory.
tokens_csv_path = os.path.join(LOCAL_PATH, 'tokens.csv')
df.to_csv(tokens_csv_path)