In [1]:
from __future__ import division
from __future__ import print_function
import csv
from datetime import datetime
import os
import re

import pandas
from sklearn.feature_extraction.text import CountVectorizer

# Open csv

In [2]:
# Load the popular tweets (pipe-delimited csv) into a Pandas DataFrame

# TODO: Parse dates correctly; this is close but not working
# NOTE(review): two likely reasons, to confirm against the pandas docs:
#   * `parse_dates=True` asks pandas to parse only the index, so the
#     `_tweet_datetime` column is never passed to `date_parser`;
#   * the format contains the literal text '+z', which cannot match '+0000'.
def date_parser(x):
    """Intended to parse a timestamp like 'Mon Feb 15 20:44:33 +0000 2016'."""
    return pandas.datetime.strptime(x, '%a %b %d %H:%M:%S +z %Y')

popular_df = pandas.read_csv(
    'tweets/tweets_popular.csv',
    delimiter='|',
    error_bad_lines=False,  # rows that cannot be parsed are dropped ...
    warn_bad_lines=False,   # ... without printing a warning for each one
    parse_dates=True,
    date_parser=date_parser,
)

In [3]:
# Let's inspect our data: the (rows, columns) shape and the column labels
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

Shape: (4679, 3)
Columns: Index([u'_text', u'_rt_count', u'_tweet_datetime'], dtype='object')


In [4]:
# Look at columns: first four values of the tweet text column
print('Text column')
print(popular_df['_text'][:4])  # the underscores will be apparent later (presumably to keep these names apart from the bag-of-words term columns -- confirm)

Text column
0    @CringeLMAO: Easy there m8 https://t.co/dnF3Wq...
1    @AustinMahone: Just posted a photo https://t.c...
2    @Ashton5SOS: Some days I drink way to much cof...
3    @lailamuhammad: When you nail that #Beyonc   m...
Name: _text, dtype: object


In [5]:
# Look at columns: first four values of the retweet count column
print('Retweet count')
print(popular_df['_rt_count'][:4])

Retweet count
0     2084
1     1059
2    24121
3      801
Name: _rt_count, dtype: int64


In [6]:
# Look at columns: first four values of the tweet timestamp column
print('Date-time')
print(popular_df['_tweet_datetime'][:4])

Date-time
0    Mon Feb 15 20:44:33 +0000 2016
1    Mon Feb 15 20:44:33 +0000 2016
2    Mon Feb 15 20:44:33 +0000 2016
3    Mon Feb 15 20:44:33 +0000 2016
Name: _tweet_datetime, dtype: object


In [7]:
# Let's look at the parsed date-time
# TODO: Try to parse this right
# Printing the type of the first value shows whether the column was converted
# to a datetime type or left as plain text.
dt = popular_df['_tweet_datetime'][0]
print(type(dt))

<type 'str'>


In [8]:
# let's do this again, but with a function
# The exploratory frame is no longer needed; the same file is reloaded through csv_to_df() below.
del popular_df  # rm large object from memory

In [9]:
def csv_to_df(csv_file):
    """Read a pipe-delimited tweet csv and return it as a Pandas DataFrame.

    Rows that pandas cannot parse are skipped without a warning. The
    module-level `date_parser` callable is handed to the reader, so it must
    be defined before this function is called.
    """
    reader_options = {
        'delimiter': '|',
        'error_bad_lines': False,
        'warn_bad_lines': False,
        'parse_dates': True,
        'date_parser': date_parser,
    }
    return pandas.read_csv(csv_file, **reader_options)

In [10]:
# Reload the popular tweets; `dataframe_popular` is the frame the rest of the notebook builds on
dataframe_popular = csv_to_df('tweets/tweets_popular.csv')

# Data cleanup

## Remove duplicate rows

In [11]:
# rows, columns
print('Shape before:', dataframe_popular.shape)

# drop_duplicates() returns a new frame rather than changing the original, so
# the result has to be stored under the name the following cells use.
# The index is reset so row labels stay 0..n-1 after rows are removed; the
# bag-of-words merge further down lines rows up by index label.
dataframe_popular = dataframe_popular.drop_duplicates().reset_index(drop=True)
print('Shape after:', dataframe_popular.shape)

Shape before: (4679, 3)
Shape after: (4679, 3)


## Other cleanup?

In [12]:
## Todo

# case, space, some punctuation, etc

# Feature extraction

## Word tokenization

Show plain function, maybe NLTK too

In [13]:
# A basic tokenizer

def tokenize_words(input_string):
    """Take a string, return a list of strings broken on whitespace,
    but do not break @mentions and URLs.

    Commas, exclamation marks and double quotes always act as separators.
    A period or colon acts as a separator only when it is followed by
    whitespace or ends the string, so the '.' and ':' inside a URL are kept.
    Tokens are split on any run of whitespace (spaces, tabs, newlines).

    Args:
        input_string: the text of one tweet.

    Returns:
        List of non-empty token strings, in order of appearance.
    """
    # Pass 1: punctuation that never belongs inside a token we want to keep
    cleaned = re.sub(r'[,!"]', ' ', input_string)

    # Pass 2: sentence-style '.' and ':' only. The lookahead requires
    # whitespace or end-of-string next, which leaves 'https://t.co/x' intact.
    # Run after pass 1 so that e.g. '."' has already become '. '.
    cleaned = re.sub(r'[.:](?=\s|$)', ' ', cleaned)

    # split() without an argument splits on any whitespace and drops empties
    return cleaned.split()

In [14]:
# See @users and http: not split
# The bare last expression makes the notebook display the resulting token list.
a_tweet = """@CuteEmergency: "I'm okay!" https://t.co/TWMwjG03Fd"""
tokenize_words(a_tweet)

['@CuteEmergency', "I'm", 'okay', 'https://t.co/TWMwjG03Fd']

## Counting text

* count chars
* count words
* links
* count links
* #hashtags
* count #hashtags
* @mentions
* count @mentions

In [15]:
def get_urls(input_tokens):
    """Return the tokens that look like URLs.

    A token is kept when it begins with the text `http`, which covers both
    `http://` and `https://` links. The order of the input is preserved.
    """
    return [token for token in input_tokens if token.startswith('http')]

In [16]:
def get_hashtags(input_tokens):
    """Return the tokens that are hashtags.

    A token is kept when it begins with `#`. The order of the input is
    preserved.
    """
    return [token for token in input_tokens if token.startswith('#')]

In [17]:
def get_mentions(input_tokens):
    """Return the tokens that are @mentions.

    A token is kept when it begins with `@`. The order of the input is
    preserved.
    """
    return [token for token in input_tokens if token.startswith('@')]

In [18]:
def add_features_to_df(dataframe):
    """Take DataFrame of tweets, extract some specific
    features and add to returned DataFrame.

    For every row the '_text' value is tokenized with tokenize_words() and
    these columns are added:

    * '_char_count'     -- length of the raw text
    * '_word_count'     -- number of tokens
    * '_urls'           -- list of tokens from get_urls()
    * '_url_count'      -- number of entries in '_urls' for that row
    * '_hashtags'       -- list of tokens from get_hashtags()
    * '_hashtag_count'  -- number of entries in '_hashtags' for that row
    * '_mentions'       -- list of tokens from get_mentions()
    * '_mentions_count' -- number of entries in '_mentions' for that row

    Args:
        dataframe: DataFrame with a '_text' column of strings.

    Returns:
        The same DataFrame object that was passed in, with the new columns
        added to it (the input is modified, not copied).
    """
    # One list per new column; each gets exactly one entry per row so that
    # every count belongs to its own tweet.
    char_count = []
    word_count = []
    urls = []
    url_count = []
    hashtags = []
    hashtag_count = []
    mentions = []
    mentions_count = []

    for text in dataframe['_text']:

        # Text and tokens
        tokens = tokenize_words(text)
        char_count.append(len(text))
        word_count.append(len(tokens))

        # URLs
        url_list = get_urls(tokens)
        urls.append(url_list)
        url_count.append(len(url_list))

        # Hashtags
        hashtag_list = get_hashtags(tokens)
        hashtags.append(hashtag_list)
        hashtag_count.append(len(hashtag_list))

        # Mentions
        mentions_list = get_mentions(tokens)
        mentions.append(mentions_list)
        mentions_count.append(len(mentions_list))

    # Column order matches the order the feature table has always been written in
    dataframe['_char_count'] = char_count
    dataframe['_word_count'] = word_count
    dataframe['_urls'] = urls
    dataframe['_url_count'] = url_count
    dataframe['_hashtags'] = hashtags
    dataframe['_hashtag_count'] = hashtag_count
    dataframe['_mentions'] = mentions
    dataframe['_mentions_count'] = mentions_count

    return dataframe

In [19]:
# Add the count/list feature columns to the popular tweets
dataframe_popular = add_features_to_df(dataframe_popular)

In [20]:
# Confirm the new feature columns are present
print('Shape:', dataframe_popular.shape)
print('Columns:', dataframe_popular.columns)
# NOTE(review): this prints the same column index a second time, without the label
print(dataframe_popular.columns)

Shape: (4679, 11)
Columns: Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_char_count',
       u'_word_count', u'_urls', u'_url_count', u'_hashtags',
       u'_hashtag_count', u'_mentions', u'_mentions_count'],
      dtype='object')
Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_char_count',
       u'_word_count', u'_urls', u'_url_count', u'_hashtags',
       u'_hashtag_count', u'_mentions', u'_mentions_count'],
      dtype='object')


## TODO: Extract from datetime

Our times are not very diverse, so they will not be useful for the feature table in this data set.

## TODO: Named entity recognition (NER)

Maybe show the NLTK code, but don't run it; it is too slow.

In [21]:
# Make sure 'feature_tables' present
features_dir = 'feature_tables'
if not os.path.isdir(features_dir):
    os.mkdir(features_dir)

# Write feature table to disk
# Build the path from `features_dir` so the directory name is defined in one place
popular_basics_path = os.path.join(features_dir, 'popular_basics.csv')
dataframe_popular.to_csv(popular_basics_path, sep='|', encoding='utf-8')

# Bag of words

Helpful links:
* <http://scikit-learn.org/stable/modules/feature_extraction.html#the-bag-of-words-representation>
* <https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words>

In [22]:
def make_merge_bow(dataframe, save_path):
    """Take a dataframe, extract '_text' and make a Bag of Words.
    Write BoW features to their own file, then merge with input
    and return new dataframe.

    Args:
        dataframe: DataFrame with a '_text' column, one document per row.
        save_path: where the Bag of Words table alone is written
            (pipe-delimited, utf-8).

    Returns:
        A new DataFrame holding the input columns followed by one count
        column per vocabulary term.

    TODO: Revisit options for CountVectorizer() (lowercase, tokenizer, min freq)
    """
    # Get list of strings, for input into vectorizer
    text_list = dataframe['_text'].tolist()

    # Setup Vectorizer
    # Note that min_df is confusing; see http://stackoverflow.com/a/27697863
    # min_df + an integer: if word found in less than n docs, then ignore
    vectorizer = CountVectorizer(min_df=2)
    term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document

    # Newer scikit-learn exposes the vocabulary through get_feature_names_out();
    # older releases only have get_feature_names(). Use whichever exists.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # Put BoW vectors into a new df
    # Row i of the matrix belongs to row i of `dataframe`, so give the BoW
    # frame the same index labels; the rows then line up even when the
    # input index is not a plain 0..n-1 range.
    dataframe_bow = pandas.DataFrame(term_document_matrix.toarray(),
                                     columns=get_names(),
                                     index=dataframe.index)

    # Write BoW to disk
    # Just the Bag of Words, in case we want to use it by itself later
    dataframe_bow.to_csv(save_path, sep='|', encoding='utf-8')

    # Merge BoW df with the original feature table df
    # Both frames share one index, so a column-wise concat keeps the original
    # row labels without needing the `join_axes` argument.
    dataframe = pandas.concat([dataframe, dataframe_bow], axis=1)

    return dataframe

In [23]:
# Build the Bag of Words for the popular tweets, save it on its own, and merge it into the feature table
dataframe_popular = make_merge_bow(dataframe_popular, 'feature_tables/popular_bow.csv')

# See the many new columns!
print(dataframe_popular.shape)  # (rows, columns)

(4679, 3871)


# Topic modeling

Think about how to put into feature table

# Write entire DataFrame to csv

The next notebook will pick up from here

In [24]:
# Full table (basic features plus Bag of Words columns), pipe-delimited and utf-8 encoded
dataframe_popular.to_csv('feature_tables/popular_all.csv', sep='|', encoding='utf-8')

# Do everything again for the unpopular tweets

In [25]:
# Ignore this code unless you run into MemoryError while running the unpopular tweets
# Release the old df for gc
# The imports stay in this cell so that psutil is only needed when this optional cell is run.
import gc
import psutil

# Memory used by this kernel process (rss, as reported by psutil) before the frame is dropped
proc = psutil.Process(os.getpid())
gc.collect()
mem0 = proc.memory_info().rss

del dataframe_popular

# Collect again so the second reading is taken after the frame has been dropped
gc.collect()
mem1 = proc.memory_info().rss

# `from __future__ import division` at the top makes these true (float) divisions
print('Before del and gc:', mem0/1000000, '(MB)')
print('After del and gc: ', mem1/1000000, '(MB)')
print('Difference:       ', (mem0 - mem1)/1000000, '(MB)')

Before del and gc: 244.195328 (MB)
After del and gc:  99.418112 (MB)
Difference:        144.777216 (MB)


In [26]:
def make_all_features_for_unpopular_tweets():
    """Do all the steps above to make the various feature tables.

    Reads 'tweets/tweets_not_popular.csv' and writes three pipe-delimited,
    utf-8 files into 'feature_tables':

    * 'not_popular_basics.csv' -- tweets plus the count/list features
    * 'not_popular_bow.csv'    -- Bag of Words only (written by make_merge_bow)
    * 'not_popular_all.csv'    -- basics and Bag of Words merged

    Prints the shape of the merged table. Returns nothing.
    """
    # The function can be run on its own, so make sure the output directory exists
    if not os.path.isdir('feature_tables'):
        os.mkdir('feature_tables')

    dataframe_not_popular = csv_to_df('tweets/tweets_not_popular.csv')

    # Add the features before writing the "basics" table, so this file has the
    # same columns as the one written for the popular tweets.
    dataframe_not_popular = add_features_to_df(dataframe_not_popular)
    dataframe_not_popular.to_csv('feature_tables/not_popular_basics.csv', sep='|', encoding='utf-8')

    dataframe_not_popular = make_merge_bow(dataframe_not_popular, 'feature_tables/not_popular_bow.csv')

    print('Total (rows, columns):', dataframe_not_popular.shape)  # (rows, columns)

    dataframe_not_popular.to_csv('feature_tables/not_popular_all.csv', sep='|', encoding='utf-8')

In [27]:
# Run the whole pipeline for the unpopular tweets; the frame stays local to the function
make_all_features_for_unpopular_tweets()

Total (rows, columns): (18618, 9303)
