# CS579: Lecture 13  

**Demographic Inference II**

*[Dr. Aron Culotta](http://cs.iit.edu/~culotta)*  
*[Illinois Institute of Technology](http://iit.edu)*

## Gender Classification

Let's build a classifier to predict whether a Twitter user is male/female.

We'll collect "labeled" training data using Census name list.

**1.) Collect Census names. **

In [2]:
# Fetch male/female names from Census.

import requests

def get_census_names():
    """ Fetch a list of common male/female names from the census.
    For ambiguous names, we select the more frequent gender."""
    males = requests.get('http://www2.census.gov/topics/genealogy/1990surnames/dist.male.first').text.split('\n')
    females = requests.get('http://www2.census.gov/topics/genealogy/1990surnames/dist.female.first').text.split('\n')
    males_pct = dict([(m.split()[0].lower(), float(m.split()[1]))
                  for m in males if m])
    females_pct = dict([(f.split()[0].lower(), float(f.split()[1]))
                    for f in females if f])
    male_names = set([m for m in males_pct if m not in females_pct or
                  males_pct[m] > females_pct[m]])
    female_names = set([f for f in females_pct if f not in males_pct or
                  females_pct[f] > males_pct[f]])    
    return male_names, female_names

male_names, female_names = get_census_names()
print('found %d female and %d male names' % (len(male_names), len(female_names)))
print 'male name sample:', list(male_names)[:5]
print 'female name sample:', list(female_names)[:5]

found 1146 female and 4014 male names
male name sample: [u'trenton', u'darrin', u'emile', u'jason', u'ron']
female name sample: [u'fawn', u'kymberly', u'augustina', u'evalyn', u'chieko']


**2.) Sample 5K tweets with names on the Census list. **

In [4]:
# Construct TwitterAPI object.

import ConfigParser
from TwitterAPI import TwitterAPI

def get_twitter(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    twitter = TwitterAPI(
                   config.get('twitter', 'consumer_key'),
                   config.get('twitter', 'consumer_secret'),
                   config.get('twitter', 'access_token'),
                   config.get('twitter', 'access_token_secret'))
    return twitter

twitter = get_twitter('twitter.cfg')

In [5]:
# Sample U.S. tweets with names from Census. 
import sys

def get_first_name(tweet):
    if 'user' in tweet and 'name' in tweet['user']:
        parts = tweet['user']['name'].split()
        if len(parts) > 0:
            return parts[0].lower()

def sample_tweets(twitter, limit, male_names, female_names):
    tweets = []
    while True:
        try:
            # Restrict to U.S.
            for response in twitter.request('statuses/filter',
                        {'locations':'-124.637,24.548,-66.993,48.9974'}):
                if 'user' in response:
                    name = get_first_name(response)
                    if name in male_names or name in female_names:
                        tweets.append(response)
                        if len(tweets) % 100 == 0:
                            print 'found %d tweets' % len(tweets)
                        if len(tweets) >= limit:
                            return tweets
        except:
            print "Unexpected error:", sys.exc_info()[0]
    return tweets
        
tweets = sample_tweets(twitter, 5000, male_names, female_names)

found 100 tweets
found 200 tweets
found 300 tweets
found 400 tweets
found 500 tweets
found 600 tweets
found 700 tweets
found 800 tweets
found 900 tweets
found 1000 tweets
found 1100 tweets
found 1200 tweets
found 1300 tweets
found 1400 tweets
found 1500 tweets
found 1600 tweets
found 1700 tweets
found 1800 tweets
found 1900 tweets
found 2000 tweets
found 2100 tweets
found 2200 tweets
found 2300 tweets
found 2400 tweets
found 2500 tweets
found 2600 tweets
found 2700 tweets
found 2800 tweets
found 2900 tweets
found 3000 tweets
found 3100 tweets
found 3200 tweets
found 3300 tweets
found 3400 tweets
found 3500 tweets
found 3600 tweets
found 3700 tweets
found 3800 tweets
found 3900 tweets
found 4000 tweets
found 4100 tweets
found 4200 tweets
found 4300 tweets
found 4400 tweets
found 4500 tweets
found 4600 tweets
found 4700 tweets
found 4800 tweets
found 4900 tweets
found 5000 tweets


In [138]:
from collections import Counter

print 'sampled %d tweets' % len(tweets)
print 'top names:', Counter(get_first_name(t) for t in tweets).most_common(10)

sampled 5000 tweets
top names: [(u'john', 62), (u'emily', 55), (u'berenice', 47), (u'michael', 46), (u'david', 42), (u'nick', 42), (u'joe', 40), (u'chris', 40), (u'mike', 38), (u'jason', 32)]


In [139]:
# Save these tweets.
import pickle
pickle.dump(tweets, open('tweets.pkl', 'wb'))

**3.) Tokenize tweets. **

In [140]:
test_tweet = tweets[200]
print('test tweet:\n\tscreen_name=%s\n\tname=%s\n\tdescr=%s\n\ttext=%s' %
      (test_tweet['user']['screen_name'],
       test_tweet['user']['name'],
       test_tweet['user']['description'],
       test_tweet['text']))

test tweet:
	screen_name=caron_shelly
	name=Shelly Caron
	descr=I have 3 Wonderful children and a Wonderful Husband. I am a Home and School member at my childrens school and love being busy!
	text=My son #lovesschool this year #teachersmatter #bonomagic @BonaventurePS @TVDSB  https://t.co/p0HwNheLJr


In [141]:
import re

def tokenize(string, lowercase, keep_punctuation, prefix,
             collapse_urls, collapse_mentions):
    """ Split a tweet into tokens."""
    if not string:
        return []
    if lowercase:
        string = string.lower()
    tokens = []
    if collapse_urls:
        string = re.sub('http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub('@\S+', 'THIS_IS_A_MENTION', string)
    if keep_punctuation:
        tokens = string.split()
    else:
        tokens = re.sub('\W+', ' ', string).split()
    if prefix:
        tokens = ['%s%s' % (prefix, t) for t in tokens]
    return tokens

In [142]:
tokenize(test_tweet['user']['description'], lowercase=True,
         keep_punctuation=True, prefix='d=',
         collapse_urls=True, collapse_mentions=True)

[u'd=i',
 u'd=have',
 u'd=3',
 u'd=wonderful',
 u'd=children',
 u'd=and',
 u'd=a',
 u'd=wonderful',
 u'd=husband.',
 u'd=i',
 u'd=am',
 u'd=a',
 u'd=home',
 u'd=and',
 u'd=school',
 u'd=member',
 u'd=at',
 u'd=my',
 u'd=childrens',
 u'd=school',
 u'd=and',
 u'd=love',
 u'd=being',
 u'd=busy!']

In [146]:
tokenize(test_tweet['text'], lowercase=True, keep_punctuation=True,
         prefix='t=',
         collapse_urls=True, collapse_mentions=False)

[u't=my',
 u't=son',
 u't=#lovesschool',
 u't=this',
 u't=year',
 u't=#teachersmatter',
 u't=#bonomagic',
 u't=@bonaventureps',
 u't=@tvdsb',
 u't=THIS_IS_A_URL']

In [147]:
def tweet2tokens(tweet, use_descr=True, lowercase=True,
                 keep_punctuation=True, descr_prefix='d=',
                 collapse_urls=True, collapse_mentions=True):
    """ Convert a tweet into a list of tokens, from the tweet text and optionally the
    user description. """
    tokens = tokenize(tweet['text'], lowercase, keep_punctuation, None,
                       collapse_urls, collapse_mentions)
    if use_descr:
        tokens.extend(tokenize(tweet['user']['description'], lowercase,
                               keep_punctuation, descr_prefix,
                               collapse_urls, collapse_mentions))
    return tokens

In [148]:
# for enumerating all possible arguments of tweet2tokens
# https://docs.python.org/2/library/itertools.html#itertools.product
from itertools import product

use_descr_opts = [True, False]
lowercase_opts = [True, False]
keep_punctuation_opts = [True, False]
descr_prefix_opts = ['d=', '']
url_opts = [True, False]
mention_opts = [True, False]

argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']
option_iter = product(use_descr_opts, lowercase_opts,
                       keep_punctuation_opts,
                       descr_prefix_opts, url_opts,
                       mention_opts)
for options in option_iter:
    print '  '.join('%s=%s' % (name, opt) for name, opt in zip(argnames, options))
    print
    print '  '.join(tweet2tokens(test_tweet, *options)), '\n----\n'

use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=True

my  son  #lovesschool  this  year  #teachersmatter  #bonomagic  THIS_IS_A_MENTION  THIS_IS_A_MENTION  THIS_IS_A_URL  d=i  d=have  d=3  d=wonderful  d=children  d=and  d=a  d=wonderful  d=husband.  d=i  d=am  d=a  d=home  d=and  d=school  d=member  d=at  d=my  d=childrens  d=school  d=and  d=love  d=being  d=busy! 
----

use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=False

my  son  #lovesschool  this  year  #teachersmatter  #bonomagic  @bonaventureps  @tvdsb  THIS_IS_A_URL  d=i  d=have  d=3  d=wonderful  d=children  d=and  d=a  d=wonderful  d=husband.  d=i  d=am  d=a  d=home  d=and  d=school  d=member  d=at  d=my  d=childrens  d=school  d=and  d=love  d=being  d=busy! 
----

use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=True

my  son  #lovesschool  this  year  #teachersmatter  #bonomagic  THIS_IS_A_MENTION  THIS_IS_A_MENTION  https://t.co/p0hwnheljr  d=i  d=have  d=3

In [149]:
# Let's tokenize all tweets.
tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,
                            keep_punctuation=False, descr_prefix='d=',
                            collapse_urls=True, collapse_mentions=True)
              for t in tweets]

In [150]:
print tokens_list[200]

[u'my', u'son', u'lovesschool', u'this', u'year', u'teachersmatter', u'bonomagic', u'THIS_IS_A_MENTION', u'THIS_IS_A_MENTION', u'THIS_IS_A_URL', u'd=i', u'd=have', u'd=3', u'd=wonderful', u'd=children', u'd=and', u'd=a', u'd=wonderful', u'd=husband', u'd=i', u'd=am', u'd=a', u'd=home', u'd=and', u'd=school', u'd=member', u'd=at', u'd=my', u'd=childrens', u'd=school', u'd=and', u'd=love', u'd=being', u'd=busy']


In [151]:
# Store these in a sparse matrix.

#1) Create a vocabulary (dict from term->index)

# https://docs.python.org/2/library/collections.html#collections.defaultdict
from collections import defaultdict

def make_vocabulary(tokens_list):
    vocabulary = defaultdict(lambda: len(vocabulary))  # If term not present, assign next int.
    for tokens in tokens_list:
        for token in tokens:
            vocabulary[token]  # looking up a key; defaultdict takes care of assigning it a value.
    print '%d unique terms in vocabulary' % len(vocabulary)
    return vocabulary

In [152]:
vocabulary = make_vocabulary(tokens_list)

18813 unique terms in vocabulary


In [153]:
# term->index
vocabulary.items()[:10]

[(u'raining', 13284),
 (u'd=tremaine', 17369),
 (u'gorams', 18777),
 (u'd=kiss', 9732),
 (u'd=tattoo', 15718),
 (u'foul', 8693),
 (u'd=fair', 12982),
 (u'd=racer', 10865),
 (u'betch', 17951),
 (u'woods', 13929)]

In [154]:
# How big is vocabulary if we keep punctuation?
tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,
                            keep_punctuation=True, descr_prefix='d=',
                            collapse_urls=True, collapse_mentions=True)
              for t in tweets]

vocabulary = make_vocabulary(tokens_list)

26723 unique terms in vocabulary


In [155]:
# How big is vocabulary if we keep punctuation and urls?
tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,
                            keep_punctuation=True, descr_prefix='d=',
                            collapse_urls=False, collapse_mentions=True)
              for t in tweets]

vocabulary = make_vocabulary(tokens_list)

28450 unique terms in vocabulary


In [156]:
# How big is vocabulary if we keep punctuation and urls and mentions?
tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,
                            keep_punctuation=True, descr_prefix='d=',
                            collapse_urls=False, collapse_mentions=False)
              for t in tweets]

vocabulary = make_vocabulary(tokens_list)

31831 unique terms in vocabulary


## Feature Vector Matrix

Create a matrix $X$ where $X[i,j]$ is the frequency of term $j$ in tweet $i$.

$$
X = \begin{pmatrix}
~ & \hbox{term}_1 & \hbox{term}_2 & \hbox{term}_3 & \hbox{term}_4 \\
\hbox{tweet}_1 & 1  &  0  &  0 & 0 \\
\hbox{tweet}_2 & 0  &  0  &  0 & 2 \\
\hbox{tweet}_3 & 1  &  1  &  0 & 0 \\
\end{pmatrix}
$$



## Sparse Matrices

$$
X = \begin{pmatrix}
~ & \hbox{term}_1 & \hbox{term}_2 & \hbox{term}_3 & \hbox{term}_4 \\
\hbox{tweet}_1 & 1  &  0  &  0 & 0 \\
\hbox{tweet}_2 & 0  &  0  &  0 & 2 \\
\hbox{tweet}_3 & 1  &  1  &  0 & 0 \\
\end{pmatrix}
$$

$X$ is mostly $0$ for text problems.

## List of List (LIL) Matrix

Store a linked list of (index, value) pairs for each row.

$$
X = \begin{pmatrix}
\hbox{tweet}_1 & (0, 1)\\
\hbox{tweet}_2 & (3,2)\\
\hbox{tweet}_3 & (0,1), (1,1)\\
\end{pmatrix}
$$

**Advantage:** Fast to construct: append to list in constant time.

**Disadvantage:** Slow random access for matrix-vector product.

E.g., $\hat{z} = X\cdot \hat{\beta}$ to classify tweets using a learned weight vector $\beta$

$\hat{z}[i] = \sum_j X[i,j] * \beta[j]$

## Compressed Sparse Row (CSR) Matrix


$$
X = \begin{pmatrix}
~ & \hbox{term}_1 & \hbox{term}_2 & \hbox{term}_3 & \hbox{term}_4 \\
\hbox{tweet}_1 & 1  &  0  &  0 & 0 \\
\hbox{tweet}_2 & 0  &  0  &  0 & 2 \\
\hbox{tweet}_3 & 1  &  1  &  0 & 0 \\
\end{pmatrix}
$$

CSR Matrix is an object with three attributes: 
- **val:** $\{1,2,1,1\}$  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *list of all non-zero values*  
- **col_ind:** $\{0,3,0,1\}$ &nbsp; *column index for each non-zero value* (e.g., first non-zero value (1) is in column 0) 
- **row_ptr:** $\{0,1,2\}$ &nbsp;&nbsp;&nbsp; *index into **col_ind** where each row starts* (e.g., tweet3, term1 corresponds to col_ind[2])

Allows efficient row access (good for us, since each row is a tweet)

In [59]:
# Convert features to a sparse matrix X.
# X[i,j] is the frequency of term j in tweet i
# 
from scipy.sparse import lil_matrix

def make_feature_matrix(tokens_list, vocabulary):
    X = lil_matrix((len(tweets), len(vocabulary)))
    for i, tokens in enumerate(tokens_list):
        for token in tokens:
            j = vocabulary[token]
            X[i,j] += 1
    return X.tocsr()  # convert to CSR for more efficient random access.

In [157]:
X = make_feature_matrix(tokens_list, vocabulary)
print 'shape of X:', X.shape

shape of X: (5000, 31831)


In [158]:
help(X)

Help on csr_matrix in module scipy.sparse.csr object:

class csr_matrix(scipy.sparse.compressed._cs_matrix, scipy.sparse.sputils.IndexMixin)
 |  Compressed Sparse Row matrix
 |  
 |  This can be instantiated in several ways:
 |      csr_matrix(D)
 |          with a dense matrix or rank-2 ndarray D
 |  
 |      csr_matrix(S)
 |          with another sparse matrix S (equivalent to S.tocsr())
 |  
 |      csr_matrix((M, N), [dtype])
 |          to construct an empty matrix with shape (M, N)
 |          dtype is optional, defaulting to dtype='d'.
 |  
 |      csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
 |          where ``data``, ``row_ind`` and ``col_ind`` satisfy the
 |          relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
 |  
 |      csr_matrix((data, indices, indptr), [shape=(M, N)])
 |          is the standard CSR representation where the column indices for
 |          row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
 |          corresponding value

In [159]:
# How is tweet 200 stored?
X[200]

<1x31831 sparse matrix of type '<type 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [160]:
help(X[200].nonzero)

Help on method nonzero in module scipy.sparse.base:

nonzero(self) method of scipy.sparse.csr.csr_matrix instance
    nonzero indices
    
    Returns a tuple of arrays (row,col) containing the indices
    of the non-zero elements of the matrix.
    
    Examples
    --------
    >>> from scipy.sparse import csr_matrix
    >>> A = csr_matrix([[1,2,0],[0,0,3],[4,0,5]])
    >>> A.nonzero()
    (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2]))



In [161]:
# non-zero indices of terms used in tweet 200.
X[200].nonzero()[1]

array([  44,   65,   87,  171,  178,  219,  271,  375,  424,  746,  807,
       1313, 1517, 2068, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670,
       2671, 2672, 2673, 2674, 2675, 2676], dtype=int32)

In [162]:
# term counts for tweet 200.
X[200].data

array([ 2.,  3.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  2.,  1.,
        1.,  1.])

In [163]:
# What word does each term index correspond to?
# Convert term->index dict into index->term dict
index2term = dict((i, t) for t, i in vocabulary.items())
print index2term[44]
print X[200, 44]
# So, the term "a" (index 44) appears in the user's description 2 times.

d=a
2.0


In [164]:
# d=and appears three times.
print index2term[65]
print X[200, 65]

d=and
3.0


How do CSR matrices access row values?

Recall:

CSR Matrix is an object with three attributes: 
- **val:** $\{1,2,1,1\}$  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *list of all non-zero values*  
- **col_ind:** $\{0,3,0,1\}$ &nbsp; *column index for each non-zero value* (e.g., first non-zero value (1) is in column 0) 
- **row_ptr:** $\{0,1,2\}$ &nbsp;&nbsp;&nbsp; *index into **col_ind** where each row starts* (e.g., tweet3, term1 corresponds to col_ind[2])

In [165]:
# Recall: numpy array slices.
import numpy as np
a = np.array([0, 100, 200, 300, 400, 500])
a[2:4]  # get elements at positions 2,3

array([200, 300])

In [166]:
print 'tweet 200 starts at col_ind=', X.indptr[200]
print 'tweet 201 starts at col_ind=', X.indptr[201]
print 'so, the columns that are non-zero for tweet 200 are:'
print X.indices[X.indptr[200]:X.indptr[201]]
print 'and the data associated with those cells are:'
print X.data[X.indptr[200]:X.indptr[201]]

tweet 200 starts at col_ind= 4100
tweet 201 starts at col_ind= 4128
so, the columns that are non-zero for tweet 200 are:
[  44   65   87  171  178  219  271  375  424  746  807 1313 1517 2068 2663
 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676]
and the data associated with those cells are:
[ 2.  3.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  2.  1.  1.  1.  2.  1.  1.  1.]


In [167]:
print 'tweet 0:\n', X[0], '\n'
print 'tweet 1:\n', X[1], '\n'
print 'tweet 2:\n', X[2]

tweet 0:
  (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	2.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (0, 11)	1.0
  (0, 12)	1.0
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	1.0
  (0, 18)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 21)	1.0
  (0, 22)	1.0
  (0, 23)	1.0
  (0, 24)	1.0
  (0, 25)	1.0
  (0, 26)	1.0 

tweet 1:
  (0, 13)	1.0
  (0, 27)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 30)	1.0
  (0, 31)	1.0
  (0, 32)	1.0
  (0, 33)	1.0
  (0, 34)	1.0
  (0, 35)	1.0
  (0, 36)	1.0
  (0, 37)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 40)	1.0
  (0, 41)	1.0
  (0, 42)	1.0
  (0, 43)	1.0
  (0, 44)	1.0
  (0, 45)	1.0 

tweet 2:
  (0, 46)	1.0
  (0, 47)	1.0
  (0, 48)	1.0
  (0, 49)	1.0
  (0, 50)	1.0
  (0, 51)	1.0
  (0, 52)	1.0
  (0, 53)	1.0
  (0, 54)	1.0
  (0, 55)	1.0
  (0, 56)	1.0
  (0, 57)	1.0
  (0, 58)	1.0


**Efficient matrix vector product:**

In [168]:
# Compute z = X * \beta, where X is a CSR matrix.
import numpy as np
beta = np.ones(len(vocabulary))  # assume Beta = vector of 1s
z = np.zeros(len(tweets))
for i in range(len(tweets)):  # for each row.
    for j in range(X.indptr[i], X.indptr[i+1]): # for each col.
        colidx = X.indices[j]
        z[i] += beta[colidx] * X.data[j]
print 'X * beta for tweet 200=', z[200]
print 'which is the same as', X[200].sum()

X * beta for tweet 200= 34.0
which is the same as 34.0


**4.) Create a list of gender labels.**

In [169]:
# y is a 1d numpy array of gender labels.
# Let 1=Female, 0=Male.
import numpy as np

def get_gender(tweet, male_names, female_names):
    name = get_first_name(tweet)
    if name in female_names:
        return 1
    elif name in male_names:
        return 0
    else:
        return -1
    
y = np.array([get_gender(t, male_names, female_names) for t in tweets])
print 'gender labels:', Counter(y).items()

gender labels: [(0, 2509), (1, 2491)]


**5.) Fit a Logistic Regression classifier to predict gender from profile/tweet.**

In [170]:
# Do 5-fold cross-validation
# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

def do_cross_val(X, y, nfolds):
    """ Compute average cross-validation acccuracy."""
    cv = KFold(len(y), nfolds)
    accuracies = []
    for train_idx, test_idx in cv:
        clf = LogisticRegression()
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])
        acc = accuracy_score(y[test_idx], predicted)
        accuracies.append(acc)
    avg = np.mean(accuracies)
    return avg

In [173]:
print 'avg accuracy', do_cross_val(X, y, 5)

avg accuracy 0.7022


In [120]:
# Fitting model with CSR much, much faster than with LIL.
from timeit import timeit
print 'CSR TIME'
timeit("do_cross_val(X.tocsr(), y, 2)", number=5,
       setup="from __main__ import do_cross_val, X, y")

CSR TIME


0.4033999443054199

In [122]:
print 'LIL TIME'
timeit("do_cross_val(X.tolil(), y, 2)", number=5,
       setup="from __main__ import do_cross_val, X, y")

LIL TIME


214.12728118896484

In [123]:
# How does tokenization affect accuracy?
# Collapse urls and mentions; ignore description prefix.
def run_all(tweets, use_descr=True, lowercase=True,
            keep_punctuation=True, descr_prefix=None,
            collapse_urls=True, collapse_mentions=True):
    
    tokens_list = [tweet2tokens(t, use_descr, lowercase,
                            keep_punctuation, descr_prefix,
                            collapse_urls, collapse_mentions)
                  for t in tweets]
    vocabulary = make_vocabulary(tokens_list)
    X = make_feature_matrix(tokens_list, vocabulary)
    acc = do_cross_val(X, y, 5)
    print 'acc=', acc
    return acc

In [124]:
argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']
option_iter = product(use_descr_opts, lowercase_opts,
                       keep_punctuation_opts,
                       descr_prefix_opts, url_opts,
                       mention_opts)
results = []
for options in option_iter:
    option_str = '\t'.join('%s=%s' % (name, opt) for name, opt in zip(argnames, options))
    print option_str
    acc = run_all(tweets, *options)
    results.append((acc, options))
    print

use_descr=True	lower=True	punct=True	prefix=d=	url=True	mention=True
26723 unique terms in vocabulary
acc= 0.6994

use_descr=True	lower=True	punct=True	prefix=d=	url=True	mention=False
30104 unique terms in vocabulary
acc= 0.7018

use_descr=True	lower=True	punct=True	prefix=d=	url=False	mention=True
28450 unique terms in vocabulary
acc= 0.7002

use_descr=True	lower=True	punct=True	prefix=d=	url=False	mention=False
31831 unique terms in vocabulary
acc= 0.7022

use_descr=True	lower=True	punct=True	prefix=	url=True	mention=True
23498 unique terms in vocabulary
acc= 0.6918

use_descr=True	lower=True	punct=True	prefix=	url=True	mention=False
26851 unique terms in vocabulary
acc= 0.6958

use_descr=True	lower=True	punct=True	prefix=	url=False	mention=True
25226 unique terms in vocabulary
acc= 0.6924

use_descr=True	lower=True	punct=True	prefix=	url=False	mention=False
28579 unique terms in vocabulary
acc= 0.698

use_descr=True	lower=True	punct=False	prefix=d=	url=True	mention=True
18813 uniqu

In [137]:
for r in sorted(results, reverse=True):
    print '%.4f' % r[0], '  '.join('%s=%s' % (name, opt) for name, opt in zip(argnames, r[1]))

0.7090 use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=False
0.7078 use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=True
0.7068 use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=False
0.7066 use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=True
0.7054 use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=False
0.7050 use_descr=True  lower=False  punct=False  prefix=  url=True  mention=False
0.7048 use_descr=True  lower=False  punct=False  prefix=  url=False  mention=False
0.7034 use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=True
0.7032 use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=False
0.7032 use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=True
0.7030 use_descr=True  lower=False  punct=True  prefix=d=  url=True  mention=False
0.7022 use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=False
0

## Error Analysis

- Which ones do we get wrong?
- Are there obvious reasons?