In [1]:
import csv
from datetime import datetime
import re

import pandas

# Open csv

In [2]:
# Open with csv module, iterate row-by-row
with open('tweets/tweets_popular.csv', 'rb') as file_open:
    popular_csv = csv.reader(file_open, delimiter='|')
    for row in popular_csv:
        pass

In [3]:
# Open with Pandas, load into DataFrame

# TODO: Parse dates correctly; this is close but not working
date_parser = lambda x: pandas.datetime.strptime(x, '%a %b %d %H:%M:%S +z %Y')  # Mon Feb 15 20:44:33 +0000 2016

popular_df = pandas.read_csv('tweets/tweets_popular.csv', 
                             delimiter='|', 
                             error_bad_lines=False, 
                             warn_bad_lines=False, 
                             parse_dates=True,
                             date_parser=date_parser
                            )

In [4]:
# Let's inspect our data
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

('Shape:', (4571, 3))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime'], dtype='object'))


In [5]:
# Look at columns
print('Text column')
print(popular_df['text'][:4])

Text column
0    @CringeLMAO: Easy there m8 https://t.co/dnF3Wq...
1    @AustinMahone: Just posted a photo https://t.c...
2    @Ashton5SOS: Some days I drink way to much cof...
3    @lailamuhammad: When you nail that #Beyonc   m...
Name: text, dtype: object


In [6]:
# Look at columns
print('Retweet count')
print(popular_df['rt_count'][:4])

Retweet count
0     2084
1     1059
2    24121
3      801
Name: rt_count, dtype: int64


In [7]:
# Look at columns
print('Date-time')
print(popular_df['tweet_datetime'][:4])

Date-time
0    Mon Feb 15 20:44:33 +0000 2016
1    Mon Feb 15 20:44:33 +0000 2016
2    Mon Feb 15 20:44:33 +0000 2016
3    Mon Feb 15 20:44:33 +0000 2016
Name: tweet_datetime, dtype: object


In [8]:
# Let's look at the parsed date-time
# TODO: Try to parse this right
dt = popular_df['tweet_datetime'][0]
print(type(dt))

<type 'str'>


In [9]:
# Do the same for unpopular data
not_popular_df = pandas.read_csv('tweets/tweets_not_popular.csv', 
                                 delimiter='|', 
                                 error_bad_lines=False, 
                                 warn_bad_lines=False,
                                 parse_dates=True,
                                 date_parser=date_parser)

# Let's inspect our data
print('Shape:', not_popular_df.shape)
print('Columns:', not_popular_df.columns)

('Shape:', (18093, 3))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime'], dtype='object'))


# Data cleanup

## Remove duplicate rows

In [10]:
print('Shape before', popular_df.shape)
popular_df = popular_df.drop_duplicates()
print('Shape after', popular_df.shape)

('Shape before', (4571, 3))
('Shape after', (4400, 3))


In [11]:
print('Shape before', not_popular_df.shape)
popular_df = not_popular_df.drop_duplicates()
print('Shape after', not_popular_df.shape)

('Shape before', (18093, 3))
('Shape after', (18093, 3))


## Other cleanup?

# Feature extraction

## Word tokenization

Show plain function, maybe NLTK too

In [12]:
# A basic tokenizer

def tokenize_words(input_string):
    """Take a string, return a list of 
    strings broken on whitespace, but do 
    not break @mentions and URLs.
    """
    punctuation = [',', '!', '"', '. ', ': ']
    for char in punctuation:
        input_string = input_string.replace(char, ' ')
    
    return [w for w in input_string.split(' ') if w]  # rm empty strings

In [13]:
a_tweet = "@taylorcaniff: Never mind I'm snowed in again I can't quit laughing"
tokenize_words(a_tweet)

['@taylorcaniff',
 'Never',
 'mind',
 "I'm",
 'snowed',
 'in',
 'again',
 'I',
 "can't",
 'quit',
 'laughing']

In [14]:
# See @users and http: not split
another_tweet = """@CuteEmergency: "I'm okay!" https://t.co/TWMwjG03Fd"""
tokenize_words(another_tweet)

['@CuteEmergency', "I'm", 'okay', 'https://t.co/TWMwjG03Fd']

In [15]:
# Using the Python re library 
def tokenize_words_regex(input_string):
    """Tokenize input string with re library,
    return list of strings."""
    tokenization_regex = re.compile(r"[\w']+|[.,!?;]")
    return tokenization_regex.findall(input_string)

In [16]:
tokenize_words_regex(another_tweet)

['CuteEmergency', "I'm", 'okay', '!', 'https', 't', '.', 'co', 'TWMwjG03Fd']

In [17]:
# NLTK has one, too but still breaks up what we need,
# we'll skip for this exercise
from nltk.tokenize.punkt import PunktLanguageVars

nltk_tokenizer = PunktLanguageVars()
nltk_tokenizer.word_tokenize(another_tweet)

['@',
 'CuteEmergency',
 ':',
 '"',
 'I',
 "'m",
 'okay',
 '!',
 '"',
 'https',
 ':',
 '//t.co/TWMwjG03Fd']

In [None]:
## Todo

case, space, some punctuation, etc

## Add column to datafram

In [18]:
# Add new column
# TODO do with .loc, not copy

tokens = []  # list of strings

for i, row in popular_df.iterrows():
    tokens.append(tokenize_words(row['text'])) 

popular_df['tokens'] = tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

('Shape:', (17682, 4))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime', u'tokens'], dtype='object'))


In [20]:
popular_df.columns

Index([u'text', u'rt_count', u'tweet_datetime', u'tokens'], dtype='object')

In [21]:
# Repeat for not_popular

tokens = []  # list of strings

for i, row in not_popular_df.iterrows():
    tokens.append(tokenize_words(row['text'])) 

not_popular_df['tokens'] = tokens

## Named entity recognition (NER)

Show NLTK code, not for feature table.

## Counting text

* count chars
* count words
* links
* count links
* #hashtags
* count #hashtags
* @mentions
* count @mentions

In [22]:
def get_urls(input_tokens):
    """Check incoming list of strings, check if token
    starts with `http(s)://`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('http')]`
    """
    urls = []
    for word in input_tokens:
        if word.startswith('http'):
            urls.append(word)
    return urls

In [23]:
def get_hashtags(input_tokens):
    """Check incoming list of strings, check if token
    starts with `#`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('#')]`
    """
    hashtags = []
    for word in input_tokens:
        if word.startswith('#'):
            hashtags.append(word)
    return hashtags

In [24]:
def get_mentions(input_tokens):
    """Check incoming list of strings, check if token
    starts with `@`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('@')]`
    """
    mentions = []
    for word in input_tokens:
        if word.startswith('@'):
            mentions.append(word)
    return mentions

In [25]:
# Add new column
# TODO do with .loc, not copy

char_count = []
word_count = []
urls = []
hashtags = []
mentions = []

for i, row in popular_df.iterrows():
    # Text and tokens
    char_count.append(len(row['text']))
    word_count.append(len(row['tokens']))
    
    # URLs
    url_list = get_urls(row['tokens'])
    urls.append(url_list)
    url_count = len(url_list)
    
    # Hashtags
    hashtag_list = get_hashtags(row['tokens'])
    hashtags.append(hashtag_list)
    hashtag_count = len(hashtag_list)
    
    # Mentions
    mentions_list = get_mentions(row['tokens'])
    mentions.append(mentions_list)
    mentions_count = len(mentions_list)


popular_df['char_count'] = char_count
popular_df['word_count'] = word_count
popular_df['urls'] = urls
popular_df['url_count'] = url_count
popular_df['hashtags'] = hashtags
popular_df['hashtag_count'] = hashtag_count
popular_df['mentions'] = hashtags
popular_df['mentions_count'] = mentions_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [26]:
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)
print(popular_df.columns)

('Shape:', (17682, 12))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime', u'tokens', u'char_count',
       u'word_count', u'urls', u'url_count', u'hashtags', u'hashtag_count',
       u'mentions', u'mentions_count'],
      dtype='object'))
Index([u'text', u'rt_count', u'tweet_datetime', u'tokens', u'char_count',
       u'word_count', u'urls', u'url_count', u'hashtags', u'hashtag_count',
       u'mentions', u'mentions_count'],
      dtype='object')


In [27]:
# Add new column
# TODO do with .loc, not copy

char_count = []
word_count = []
urls = []
hashtags = []
mentions = []

for i, row in not_popular_df.iterrows():
    # Text and tokens
    char_count.append(len(row['text']))
    word_count.append(len(row['tokens']))
    
    # URLs
    url_list = get_urls(row['tokens'])
    urls.append(url_list)
    url_count = len(url_list)
    
    # Hashtags
    hashtag_list = get_hashtags(row['tokens'])
    hashtags.append(hashtag_list)
    hashtag_count = len(hashtag_list)
    
    # Mentions
    mentions_list = get_mentions(row['tokens'])
    mentions.append(mentions_list)
    mentions_count = len(mentions_list)


not_popular_df['char_count'] = char_count
not_popular_df['word_count'] = word_count
not_popular_df['urls'] = urls
not_popular_df['url_count'] = url_count
not_popular_df['hashtags'] = hashtags
not_popular_df['hashtag_count'] = hashtag_count
not_popular_df['mentions'] = hashtags
not_popular_df['mentions_count'] = mentions_count

In [28]:
print('Shape:', not_popular_df.shape)
print('Columns:', not_popular_df.columns)
print(not_popular_df.columns)

('Shape:', (18093, 12))
('Columns:', Index([u'text', u'rt_count', u'tweet_datetime', u'tokens', u'char_count',
       u'word_count', u'urls', u'url_count', u'hashtags', u'hashtag_count',
       u'mentions', u'mentions_count'],
      dtype='object'))
Index([u'text', u'rt_count', u'tweet_datetime', u'tokens', u'char_count',
       u'word_count', u'urls', u'url_count', u'hashtags', u'hashtag_count',
       u'mentions', u'mentions_count'],
      dtype='object')


## TODO: Extract from datetime

Our times are not very diverse, so maybe not useful to the model

# Bag of words

This would be useful for their speech classifying exercise

<http://scikit-learn.org/stable/modules/feature_extraction.html#the-bag-of-words-representation>

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
popular_text_list = popular_df['text'].tolist()

In [31]:
popular_vectorizer = CountVectorizer(min_df=1)
popular_X = popular_vectorizer.fit_transform(popular_text_list)  # input is a list of strings, 1 per document
popular_analyze = popular_vectorizer.build_analyzer()

In [32]:
popular_vectorizer.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'00005000',
 u'004917641659029',
 u'00655',
 u'00am',
 u'00dd7yaxc5',
 u'00jywytzde',
 u'00m',
 u'00pm',
 u'00s',
 u'00te',
 u'00xsjjbhzb',
 u'01',
 u'011st1cbzg',
 u'01pm',
 u'01uelpmonr',
 u'01zapmvxzj',
 u'02',
 u'0209ahn',
 u'021xb0unig',
 u'02am',
 u'03',
 u'0300',
 u'034',
 u'038fyag08c',
 u'03oqllx4uy',
 u'03pm',
 u'03xijsddlm',
 u'04',
 u'04c1h7zobn',
 u'04k474lpa2',
 u'04pm',
 u'04pq2bbcww',
 u'05',
 u'0531elizabeth',
 u'058_powermoves',
 u'06',
 u'06dqbtkidm',
 u'06pbnrtycd',
 u'06s3lwobxt',
 u'06vdp3g3i7',
 u'07',
 u'0720shan',
 u'0731please',
 u'079',
 u'07dbh0lqxe',
 u'07lkntjyso',
 u'08',
 u'0811',
 u'082',
 u'0843',
 u'0892',
 u'08bydynpky',
 u'08h2',
 u'08vzuesa84',
 u'09',
 u'094',
 u'0992',
 u'09hhmblpyt',
 u'0agt0fiyz8',
 u'0am1a4gzxw',
 u'0aoqfaidom',
 u'0b0ljbc38b',
 u'0bca5peaza',
 u'0cl8pcjhiq',
 u'0cljhyamde',
 u'0e3ulqxfpp',
 u'0e4ozqvwql',
 u'0efai3ls1m',
 u'0euznfnidi',
 u'0ezyrtf5pt',
 u'0f0t3ixymb',
 u'0fbmzat00t',
 u'0fdij3ssy0

In [33]:
len(popular_vectorizer.get_feature_names())

39880

In [34]:
# then, figure out how to add each of these sparse vectors to the dataframe

In [35]:
print(popular_X)

  (0, 32399)	1
  (0, 32612)	1
  (0, 25111)	1
  (0, 25247)	1
  (0, 29129)	1
  (1, 12089)	1
  (1, 29274)	1
  (1, 28429)	1
  (1, 33195)	1
  (1, 35351)	1
  (1, 8485)	1
  (1, 17723)	1
  (1, 37587)	1
  (1, 3378)	1
  (1, 13717)	1
  (1, 3671)	1
  (1, 27405)	1
  (1, 34562)	1
  (1, 39323)	1
  (1, 29056)	1
  (1, 26617)	1
  (1, 37686)	1
  (1, 25703)	1
  (1, 16605)	1
  (2, 18275)	1
  :	:
  (17679, 20622)	1
  (17679, 9285)	1
  (17679, 14643)	1
  (17679, 21989)	1
  (17679, 13377)	1
  (17680, 34562)	1
  (17680, 16605)	1
  (17680, 8326)	1
  (17680, 25928)	1
  (17680, 26604)	1
  (17680, 37020)	1
  (17680, 5082)	1
  (17680, 20617)	1
  (17680, 29603)	1
  (17680, 38417)	1
  (17680, 32132)	1
  (17680, 28658)	1
  (17680, 28670)	1
  (17680, 39296)	1
  (17680, 25702)	1
  (17681, 17834)	1
  (17681, 37687)	1
  (17681, 22834)	1
  (17681, 38244)	1
  (17681, 7003)	1


In [36]:
popular_bow_array = popular_vectorizer.fit_transform(popular_text_list).toarray()  # is this what we want?

In [37]:
popular_vectorizer.transform(['and an example tweet']).toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
popular_bow_array.shape  # first value is the # of rows (ie, tweets) in the DataFrame; second is # of vocab words

(17682L, 39880L)

In [39]:
popular_bow_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
# How to add this vector as a vector?
# popular_vectorizer.transform(['and an example tweet']).toarray()

#! Read this: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

#bow_vector = popular_vectorizer.transform([row['text']]).toarray()
#bow_vectors.append(bow_vector)
#popular_df['bow_vector'] = bow_vector

In [41]:
#! an alternative is to ex

# Topic modeling

Think about how to put into feature table

# Write DataFrame to csv

The next notebook will pick up from here

In [42]:
popular_df.to_csv('feature_tables/popular_tweet_features.csv', sep='|', encoding='utf-8')

In [43]:
not_popular_df.to_csv('feature_tables/not_popular_tweet_features.csv', sep='|', encoding='utf-8')