In [1]:
import csv
from datetime import datetime
import re

import pandas
from sklearn.feature_extraction.text import CountVectorizer

# Open csv

In [2]:
# Open with csv module, iterate row-by-row
with open('tweets/tweets_popular.csv', 'rb') as file_open:
    popular_csv = csv.reader(file_open, delimiter='|')
    for row in popular_csv:
        pass

In [3]:
# Open with Pandas, load into DataFrame

# TODO: Parse dates correctly; this is close but not working
date_parser = lambda x: pandas.datetime.strptime(x, '%a %b %d %H:%M:%S +z %Y')  # Mon Feb 15 20:44:33 +0000 2016

popular_df = pandas.read_csv('tweets/tweets_popular.csv', 
                             delimiter='|', 
                             error_bad_lines=False, 
                             warn_bad_lines=False, 
                             parse_dates=True,
                             date_parser=date_parser
                            )

In [4]:
# Let's inspect our data
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

('Shape:', (4679, 3))
('Columns:', Index([u'_text', u'_rt_count', u'_tweet_datetime'], dtype='object'))


In [5]:
# Look at columns
print('Text column')
print(popular_df['_text'][:4])  # the underscores will be apparent later

Text column
0    @CringeLMAO: Easy there m8 https://t.co/dnF3Wq...
1    @AustinMahone: Just posted a photo https://t.c...
2    @Ashton5SOS: Some days I drink way to much cof...
3    @lailamuhammad: When you nail that #Beyonc   m...
Name: _text, dtype: object


In [6]:
# Look at columns
print('Retweet count')
print(popular_df['_rt_count'][:4])

Retweet count
0     2084
1     1059
2    24121
3      801
Name: _rt_count, dtype: int64


In [7]:
# Look at columns
print('Date-time')
print(popular_df['_tweet_datetime'][:4])

Date-time
0    Mon Feb 15 20:44:33 +0000 2016
1    Mon Feb 15 20:44:33 +0000 2016
2    Mon Feb 15 20:44:33 +0000 2016
3    Mon Feb 15 20:44:33 +0000 2016
Name: _tweet_datetime, dtype: object


In [8]:
# Let's look at the parsed date-time
# TODO: Try to parse this right
dt = popular_df['_tweet_datetime'][0]
print(type(dt))

<type 'str'>


In [9]:
# Do the same for unpopular data
not_popular_df = pandas.read_csv('tweets/tweets_not_popular.csv', 
                                 delimiter='|', 
                                 error_bad_lines=False, 
                                 warn_bad_lines=False,
                                 parse_dates=True,
                                 date_parser=date_parser)

# Let's inspect our data
print('Shape:', not_popular_df.shape)
print('Columns:', not_popular_df.columns)

('Shape:', (18618, 3))
('Columns:', Index([u'_text', u'_rt_count', u'_tweet_datetime'], dtype='object'))


# Data cleanup

## Remove duplicate rows

In [10]:
print('Shape before', popular_df.shape)
popular_df = popular_df.drop_duplicates()
print('Shape after', popular_df.shape)

('Shape before', (4679, 3))
('Shape after', (4506, 3))


In [11]:
print('Shape before', not_popular_df.shape)
not_popular_df = not_popular_df.drop_duplicates()
print('Shape after', not_popular_df.shape)

('Shape before', (18618, 3))
('Shape after', (18200, 3))


## Other cleanup?

# Feature extraction

## Word tokenization

Show plain function, maybe NLTK too

In [12]:
# A basic tokenizer

def tokenize_words(input_string):
    """Take a string, return a list of 
    strings broken on whitespace, but do 
    not break @mentions and URLs.
    """
    punctuation = [',', '!', '"', '. ', ': ']
    for char in punctuation:
        input_string = input_string.replace(char, ' ')
    
    return [w for w in input_string.split(' ') if w]  # rm empty strings

In [13]:
a_tweet = "@taylorcaniff: Never mind I'm snowed in again I can't quit laughing"
tokenize_words(a_tweet)

['@taylorcaniff',
 'Never',
 'mind',
 "I'm",
 'snowed',
 'in',
 'again',
 'I',
 "can't",
 'quit',
 'laughing']

In [14]:
# See @users and http: not split
another_tweet = """@CuteEmergency: "I'm okay!" https://t.co/TWMwjG03Fd"""
tokenize_words(another_tweet)

['@CuteEmergency', "I'm", 'okay', 'https://t.co/TWMwjG03Fd']

In [15]:
# Using the Python re library 
def tokenize_words_regex(input_string):
    """Tokenize input string with re library,
    return list of strings."""
    tokenization_regex = re.compile(r"[\w']+|[.,!?;]")
    return tokenization_regex.findall(input_string)

In [16]:
tokenize_words_regex(another_tweet)

['CuteEmergency', "I'm", 'okay', '!', 'https', 't', '.', 'co', 'TWMwjG03Fd']

In [17]:
# NLTK has one, too but still breaks up what we need,
# we'll skip for this exercise
from nltk.tokenize.punkt import PunktLanguageVars

nltk_tokenizer = PunktLanguageVars()
nltk_tokenizer.word_tokenize(another_tweet)

['@',
 'CuteEmergency',
 ':',
 '"',
 'I',
 "'m",
 'okay',
 '!',
 '"',
 'https',
 ':',
 '//t.co/TWMwjG03Fd']

In [18]:
## Todo

# case, space, some punctuation, etc

## Add column to datafram

In [19]:
# Add new column
# TODO do with .loc, not copy
# OR do like in kaggle docs: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
# ... look for 'num_reviews = train["review"].size'

tokens = []  # list of strings

for i, row in popular_df.iterrows():
    tokens.append(tokenize_words(row['_text'])) 

popular_df['_tokens'] = tokens

In [20]:
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)

('Shape:', (4506, 4))
('Columns:', Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_tokens'], dtype='object'))


In [21]:
popular_df.columns

Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_tokens'], dtype='object')

In [22]:
# Repeat for not_popular

tokens = []  # list of strings

for i, row in not_popular_df.iterrows():
    tokens.append(tokenize_words(row['_text'])) 

not_popular_df['_tokens'] = tokens

## Named entity recognition (NER)

Show NLTK code, not for feature table.

## Counting text

* count chars
* count words
* links
* count links
* #hashtags
* count #hashtags
* @mentions
* count @mentions

In [23]:
def get_urls(input_tokens):
    """Check incoming list of strings, check if token
    starts with `http(s)://`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('http')]`
    """
    urls = []
    for word in input_tokens:
        if word.startswith('http'):
            urls.append(word)
    return urls

In [24]:
def get_hashtags(input_tokens):
    """Check incoming list of strings, check if token
    starts with `#`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('#')]`
    """
    hashtags = []
    for word in input_tokens:
        if word.startswith('#'):
            hashtags.append(word)
    return hashtags

In [25]:
def get_mentions(input_tokens):
    """Check incoming list of strings, check if token
    starts with `@`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('@')]`
    """
    mentions = []
    for word in input_tokens:
        if word.startswith('@'):
            mentions.append(word)
    return mentions

In [26]:
# Add new column
# TODO do with .loc, not copy

char_count = []
word_count = []
urls = []
hashtags = []
mentions = []

for i, row in popular_df.iterrows():
    # Text and tokens
    char_count.append(len(row['_text']))
    word_count.append(len(row['_tokens']))
    
    # URLs
    url_list = get_urls(row['_tokens'])
    urls.append(url_list)
    url_count = len(url_list)
    
    # Hashtags
    hashtag_list = get_hashtags(row['_tokens'])
    hashtags.append(hashtag_list)
    hashtag_count = len(hashtag_list)
    
    # Mentions
    mentions_list = get_mentions(row['_tokens'])
    mentions.append(mentions_list)
    mentions_count = len(mentions_list)


popular_df['_char_count'] = char_count
popular_df['_word_count'] = word_count
popular_df['_urls'] = urls
popular_df['_url_count'] = url_count
popular_df['_hashtags'] = hashtags
popular_df['_hashtag_count'] = hashtag_count
popular_df['_mentions'] = hashtags
popular_df['_mentions_count'] = mentions_count

In [27]:
print('Shape:', popular_df.shape)
print('Columns:', popular_df.columns)
print(popular_df.columns)

('Shape:', (4506, 12))
('Columns:', Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_tokens', u'_char_count',
       u'_word_count', u'_urls', u'_url_count', u'_hashtags',
       u'_hashtag_count', u'_mentions', u'_mentions_count'],
      dtype='object'))
Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_tokens', u'_char_count',
       u'_word_count', u'_urls', u'_url_count', u'_hashtags',
       u'_hashtag_count', u'_mentions', u'_mentions_count'],
      dtype='object')


In [28]:
# Add new column
# TODO do with .loc, not copy

char_count = []
word_count = []
urls = []
hashtags = []
mentions = []

for i, row in not_popular_df.iterrows():
    # Text and tokens
    char_count.append(len(row['_text']))
    word_count.append(len(row['_tokens']))
    
    # URLs
    url_list = get_urls(row['_tokens'])
    urls.append(url_list)
    url_count = len(url_list)
    
    # Hashtags
    hashtag_list = get_hashtags(row['_tokens'])
    hashtags.append(hashtag_list)
    hashtag_count = len(hashtag_list)
    
    # Mentions
    mentions_list = get_mentions(row['_tokens'])
    mentions.append(mentions_list)
    mentions_count = len(mentions_list)


not_popular_df['_char_count'] = char_count
not_popular_df['_word_count'] = word_count
not_popular_df['_urls'] = urls
not_popular_df['_url_count'] = url_count
not_popular_df['_hashtags'] = hashtags
not_popular_df['_hashtag_count'] = hashtag_count
not_popular_df['_mentions'] = hashtags
not_popular_df['_mentions_count'] = mentions_count

In [29]:
print('Shape:', not_popular_df.shape)
print('Columns:', not_popular_df.columns)
print(not_popular_df.columns)

('Shape:', (18200, 12))
('Columns:', Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_tokens', u'_char_count',
       u'_word_count', u'_urls', u'_url_count', u'_hashtags',
       u'_hashtag_count', u'_mentions', u'_mentions_count'],
      dtype='object'))
Index([u'_text', u'_rt_count', u'_tweet_datetime', u'_tokens', u'_char_count',
       u'_word_count', u'_urls', u'_url_count', u'_hashtags',
       u'_hashtag_count', u'_mentions', u'_mentions_count'],
      dtype='object')


## TODO: Extract from datetime

Our times are not very diverse, so maybe not useful to the model

# Bag of words

This would be useful for their speech classifying exercise

<http://scikit-learn.org/stable/modules/feature_extraction.html#the-bag-of-words-representation>

In [30]:
# get list of strings, for input into vectorizer
popular_text_list = popular_df['_text'].tolist()
print(popular_text_list[:5])

['@CringeLMAO: Easy there m8 https://t.co/dnF3Wqdt1C', '@AustinMahone: Just posted a photo https://t.co/hXFg6TyuzE', "@Ashton5SOS: Some days I drink way to much coffee and fill your Twitter feeds with stupid replies and pointless videos, I ain't sorry ok", '@lailamuhammad: When you nail that #Beyonc   move #slay Directed by @MarqCotton @BuckeyeBond83 The reason behind it Tues at 11 @WTKR3 htt   ', '@BDBANDS: MOOD         https://t.co/NMlFBJZtic']


In [31]:
vectorizer_popular = CountVectorizer(min_df=1)  # TODO: Revisit the options for this, lowercase and tokenizer
term_doc_matrix_popular = vectorizer_popular.fit_transform(popular_text_list)  # input is a list of strings, 1 per document
#analyze_popular = vectorizer_popular.build_analyzer()  # Not needed, useful for future unseen input strings

In [32]:
print(vectorizer_popular.get_feature_names()[1000:1010])  # these are all the unique word tokens in all the tweets
print(term_doc_matrix_popular.toarray()[1000:1010])

[u'attracted', u'attractive', u'atu3ogi58j', u'au3c0efdt6', u'aubcynn626', u'aucqtimdxo', u'audacity', u'audience', u'audiomack', u'aunnazu0uv']
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [33]:
len(vectorizer_popular.get_feature_names())  # total unique words in all tweets

9201

In [34]:
# Put BoW vectors into a new df
df_bow_popular = pandas.DataFrame(term_doc_matrix_popular.toarray(), columns=vectorizer_popular.get_feature_names())
print(df_bow_popular)

      000  00am  017wk6iawj  01f0x5b7y9  02  03yr4oazqx  04  05  067xnxlmwc  \
0       0     0           0           0   0           0   0   0           0   
1       0     0           0           0   0           0   0   0           0   
2       0     0           0           0   0           0   0   0           0   
3       0     0           0           0   0           0   0   0           0   
4       0     0           0           0   0           0   0   0           0   
5       0     0           0           0   0           0   0   0           0   
6       0     0           0           0   0           0   0   0           0   
7       0     0           0           0   0           0   0   0           0   
8       0     0           0           0   0           0   0   0           0   
9       0     0           0           0   0           0   0   0           0   
10      0     0           0           0   0           0   0   0           0   
11      0     0           0           0   0         

In [35]:
# Merge BoW df with the original feature table df
# Important: Make sure the concat() function uses the original id index of the first, text datafram
df_popular_features_bow = pandas.concat([popular_df, df_bow_popular], axis=1, join_axes=[popular_df.index])

In [36]:
print(df_popular_features_bow.shape)  # (rows, columns)

(4506, 9213)


In [37]:
df_popular_features_bow

Unnamed: 0,_text,_rt_count,_tweet_datetime,_tokens,_char_count,_word_count,_urls,_url_count,_hashtags,_hashtag_count,...,zvylyv0nmv,zwmjvgsrpv,zwxizpvdu2,zxchburns,zxr8djdkbj,zxs67qkyyo,zyfny,zyw07b8q2s,zzeaagygjc,zzv975jgqa
0,@CringeLMAO: Easy there m8 https://t.co/dnF3Wq...,2084,Mon Feb 15 20:44:33 +0000 2016,"[@CringeLMAO, Easy, there, m8, https://t.co/dn...",50,5,[https://t.co/dnF3Wqdt1C],1,[],0,...,0,0,0,0,0,0,0,0,0,0
1,@AustinMahone: Just posted a photo https://t.c...,1059,Mon Feb 15 20:44:33 +0000 2016,"[@AustinMahone, Just, posted, a, photo, https:...",58,6,[https://t.co/hXFg6TyuzE],1,[],0,...,0,0,0,0,0,0,0,0,0,0
2,@Ashton5SOS: Some days I drink way to much cof...,24121,Mon Feb 15 20:44:33 +0000 2016,"[@Ashton5SOS, Some, days, I, drink, way, to, m...",136,24,[],1,[],0,...,0,0,0,0,0,0,0,0,0,0
3,@lailamuhammad: When you nail that #Beyonc m...,801,Mon Feb 15 20:44:33 +0000 2016,"[@lailamuhammad, When, you, nail, that, #Beyon...",140,21,[],1,"[#Beyonc, #slay]",0,...,0,0,0,0,0,0,0,0,0,0
4,@BDBANDS: MOOD https://t.co/NMlFBJZtic,1856,Mon Feb 15 20:44:33 +0000 2016,"[@BDBANDS, MOOD, https://t.co/NMlFBJZtic]",46,3,[https://t.co/NMlFBJZtic],1,[],0,...,0,0,0,0,0,0,0,0,0,0
5,@TheGRAMMYs: Congrats Best Pop Vocal Album @ta...,3747,Mon Feb 15 20:44:33 +0000 2016,"[@TheGRAMMYs, Congrats, Best, Pop, Vocal, Albu...",99,11,[https://t.co/6gqbPR2JmW],1,[#GRAMMYs],0,...,0,0,0,0,0,0,0,0,0,0
6,@taylorcaniff: Never mind I'm snowed in again ...,1961,Mon Feb 15 20:44:33 +0000 2016,"[@taylorcaniff, Never, mind, I'm, snowed, in, ...",67,11,[],1,[],0,...,0,0,0,0,0,0,0,0,0,0
7,@Ashton5SOS: But this is the obvious reason I ...,21948,Mon Feb 15 20:44:33 +0000 2016,"[@Ashton5SOS, But, this, is, the, obvious, rea...",102,19,[],1,[],0,...,0,0,0,0,0,0,0,0,0,0
9,@FemaleTexts: February 15th?? You mean annoy s...,2426,Mon Feb 15 20:44:33 +0000 2016,"[@FemaleTexts, February, 15th??, You, mean, an...",83,9,[https://t.co/k06HAUNeoM],1,[],0,...,0,0,0,0,0,0,0,0,0,0
10,@AustinMahone: Maybe I'll make a country song ...,1826,Mon Feb 15 20:44:33 +0000 2016,"[@AustinMahone, Maybe, I'll, make, a, country,...",51,8,[],1,[],0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Repeat above with not popular tweets

# get list of strings, for input into vectorizer
not_popular_text_list = not_popular_df['_text'].tolist()
vectorizer_not_popular = CountVectorizer(min_df=1, max_df=2)  # TODO: Revisit the options for this, lowercase and tokenizer
term_doc_matrix_not_popular = vectorizer_not_popular.fit_transform(not_popular_text_list)  # input is a list of strings, 1 per document
#analyze_not_popular = vectorizer_not_popular.build_analyzer()  # Not needed, useful for future unseen input strings
print(len(vectorizer_not_popular.get_feature_names()))  # total unique words in all tweets

34903


In [39]:
# Put BoW vectors into a new df
df_bow_not_popular = pandas.DataFrame(term_doc_matrix_not_popular.toarray(), columns=vectorizer_not_popular.get_feature_names())
df_not_popular_features_bow = pandas.concat([not_popular_df, df_bow_not_popular], axis=1, join_axes=[not_popular_df.index])
print(df_not_popular_features_bow.shape)  # (rows, columns)

MemoryError: 

# Topic modeling

Think about how to put into feature table

# Write DataFrame to csv

The next notebook will pick up from here

In [None]:
df_popular_features_bow.to_csv('feature_tables/popular_tweet_features.csv', sep='|', encoding='utf-8')

In [None]:
df_not_popular_features_bow.to_csv('feature_tables/not_popular_tweet_features.csv', sep='|', encoding='utf-8')