In [1]:
import csv
from datetime import datetime
import re

import pandas

# Open csv

In [2]:
# Open with csv module, iterate row-by-row
with open('tweets/tweets_popular.csv', 'rb') as file_open:
    popular_csv = csv.reader(file_open, delimiter='|')
    for row in popular_csv:
        pass

In [3]:
# Open with Pandas, load into DataFrame

# TODO: Parse dates correctly; this is close but not working
date_parser = lambda x: pandas.datetime.strptime(x, '%a %b %d %H:%M:%S +z %Y')  # Mon Feb 15 20:44:33 +0000 2016

popular_df = pandas.read_csv('tweets/tweets_popular.csv', 
                             delimiter='|', 
                             error_bad_lines=False, 
                             warn_bad_lines=False, 
                             parse_dates=True,
                             date_parser=date_parser
                            )

In [4]:
# Let's inspect our data
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

('Shape:', (3028, 3))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime'], dtype='object'))


In [5]:
# Look at columns
print('Text column')
print(popular_df['text'][:4])

Text column
0    @CringeLMAO: Easy there m8 https://t.co/dnF3Wq...
1    @AustinMahone: Just posted a photo https://t.c...
2    @Ashton5SOS: Some days I drink way to much cof...
3    @lailamuhammad: When you nail that #Beyonc   m...
Name: text, dtype: object


In [6]:
# Look at columns
print('Retweet count')
print(popular_df['rt_count'][:4])

Retweet count
0     2084
1     1059
2    24121
3      801
Name: rt_count, dtype: int64


In [7]:
# Look at columns
print('Date-time')
print(popular_df['tweet_datetime'][:4])

Date-time
0    Mon Feb 15 20:44:33 +0000 2016
1    Mon Feb 15 20:44:33 +0000 2016
2    Mon Feb 15 20:44:33 +0000 2016
3    Mon Feb 15 20:44:33 +0000 2016
Name: tweet_datetime, dtype: object


In [8]:
# Let's look at the parsed date-time
# TODO: Try to parse this right
dt = popular_df['tweet_datetime'][0]
print(type(dt))

<type 'str'>


In [9]:
# Do the same for unpopular data
not_popular_df = pandas.read_csv('tweets/tweets_not_popular.csv', 
                                 delimiter='|', 
                                 error_bad_lines=False, 
                                 warn_bad_lines=False,
                                 parse_dates=True,
                                 date_parser=date_parser)

# Let's inspect our data
print('Shape:', not_popular_df.shape)
print('Columns:', not_popular_df.columns)

('Shape:', (11328, 3))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime'], dtype='object'))


# Data cleanup

## Remove duplicate rows

In [10]:
print('Shape before', popular_df.shape)
popular_df = popular_df.drop_duplicates()
print('Shape after', popular_df.shape)

('Shape before', (3028, 3))
('Shape after', (2919, 3))


In [11]:
print('Shape before', not_popular_df.shape)
popular_df = not_popular_df.drop_duplicates()
print('Shape after', not_popular_df.shape)

('Shape before', (11328, 3))
('Shape after', (11328, 3))


## Other cleanup?

# Feature extraction

## Word tokenization

Show plain function, maybe NLTK too

In [12]:
# A basic tokenizer

def tokenize_words(input_string):
    """Take a string, return a list of 
    strings broken on whitespace, but do 
    not break @mentions and URLs.
    """
    punctuation = [',', '!', '"', '. ', ': ']
    for char in punctuation:
        input_string = input_string.replace(char, ' ')
    
    return [w for w in input_string.split(' ') if w]  # rm empty strings

In [13]:
a_tweet = "@taylorcaniff: Never mind I'm snowed in again I can't quit laughing"
tokenize_words(a_tweet)

['@taylorcaniff',
 'Never',
 'mind',
 "I'm",
 'snowed',
 'in',
 'again',
 'I',
 "can't",
 'quit',
 'laughing']

In [14]:
# See @users and http: not split
another_tweet = """@CuteEmergency: "I'm okay!" https://t.co/TWMwjG03Fd"""
tokenize_words(another_tweet)

['@CuteEmergency', "I'm", 'okay', 'https://t.co/TWMwjG03Fd']

In [15]:
# Using the Python re library 
def tokenize_words_regex(input_string):
    """Tokenize input string with re library,
    return list of strings."""
    tokenization_regex = re.compile(r"[\w']+|[.,!?;]")
    return tokenization_regex.findall(input_string)

In [16]:
tokenize_words_regex(another_tweet)

['CuteEmergency', "I'm", 'okay', '!', 'https', 't', '.', 'co', 'TWMwjG03Fd']

In [17]:
# NLTK has one, too but still breaks up what we need,
# we'll skip for this exercise
from nltk.tokenize.punkt import PunktLanguageVars

nltk_tokenizer = PunktLanguageVars()
nltk_tokenizer.word_tokenize(another_tweet)

['@',
 'CuteEmergency',
 ':',
 '"',
 'I',
 "'m",
 'okay',
 '!',
 '"',
 'https',
 ':',
 '//t.co/TWMwjG03Fd']

## Pulling hashtags

In [18]:
# Add new column
# TODO do with .loc, not copy

tokens = []  # list of strings

for i, row in popular_df.iterrows():
    tokens.append(tokenize_words(row['text'])) 

popular_df['tokens'] = tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

('Shape:', (11113, 4))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime', u'tokens'], dtype='object'))


In [20]:
popular_df.columns

Index([u'text', u'rt_count', u'tweet_datetime', u'tokens'], dtype='object')

## Named entity recognition (NER)

Show NLTK code, not for feature table.

## Counting text

* count chars
* count words
* links
* count links
* #hashtags
* count #hashtags
* @mentions
* count @mentions

In [21]:
def get_urls(input_tokens):
    """Check incoming list of strings, check if token
    starts with `http(s)://`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('http')]`
    """
    urls = []
    for word in input_tokens:
        if word.startswith('http'):
            urls.append(word)
    return urls

In [22]:
def get_hashtags(input_tokens):
    """Check incoming list of strings, check if token
    starts with `#`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('#')]`
    """
    hashtags = []
    for word in input_tokens:
        if word.startswith('#'):
            hashtags.append(word)
    return hashtags

In [23]:
def get_mentions(input_tokens):
    """Check incoming list of strings, check if token
    starts with `@`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('@')]`
    """
    mentions = []
    for word in input_tokens:
        if word.startswith('@'):
            mentions.append(word)
    return mentions

In [24]:
# Add new column
# TODO do with .loc, not copy

char_count = []
word_count = []
urls = []
hashtags = []
mentions = []

for i, row in popular_df.iterrows():
    # Text and tokens
    char_count.append(len(row['text']))
    word_count.append(len(row['tokens']))
    
    # URLs
    url_list = get_urls(row['tokens'])
    urls.append(url_list)
    url_count = len(url_list)
    
    # Hashtags
    hashtag_list = get_hashtags(row['tokens'])
    hashtags.append(hashtag_list)
    hashtag_count = len(hashtag_list)
    
    # Mentions
    mentions_list = get_mentions(row['tokens'])
    mentions.append(mentions_list)
    mentions_count = len(mentions_list)


popular_df['char_count'] = char_count
popular_df['word_count'] = word_count
popular_df['urls'] = urls
popular_df['url_count'] = url_count
popular_df['hashtags'] = hashtags
popular_df['hashtag_count'] = hashtag_count
popular_df['mentions'] = hashtags
popular_df['mentions_count'] = mentions_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [25]:
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)
print(popular_df.columns)

('Shape:', (11113, 12))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime', u'tokens', u'char_count',
       u'word_count', u'urls', u'url_count', u'hashtags', u'hashtag_count',
       u'mentions', u'mentions_count'],
      dtype='object'))
Index([u'text', u'rt_count', u'tweet_datetime', u'tokens', u'char_count',
       u'word_count', u'urls', u'url_count', u'hashtags', u'hashtag_count',
       u'mentions', u'mentions_count'],
      dtype='object')


In [26]:
#for i, row in popular_df.iterrows():
#    print(row)
#    input()

## Extracting from datetime?

Problem here is our times are not very diverse.

# Bag of words?

This would be useful for their speech classifying exercise

<http://scikit-learn.org/stable/modules/feature_extraction.html#the-bag-of-words-representation>

# Topic modeling

Think about how to put into feature table

# Write DataFrame to csv

The next notebook will pick up from here

In [27]:
popular_df.to_csv('feature_tables/popular_tweet_features.csv', sep='|', encoding='utf-8')

In [28]:
not_popular_df.to_csv('feature_tables/not_popular_tweet_features.csv', sep='|', encoding='utf-8')