# Blog post 4. Are you talking fashion? Building a fashion classifier for Twitter data

The explanation of this implementation can be found at: http://www.rosariomgomez.me/ <br><br>
__Index:__<br>
1. Collecting data<br>
2. Vectorize tweets<br>
3. Machine learning algorithms<br>
4. Miscalssifications with Logistic Regression<br>

# 1. Collecting the data

In [1]:
import pandas as p
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import pos_tag

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix as cm

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## 1.3. Building the data sets

In [2]:
train = p.read_csv('./sub_obj_data/train_new.csv', usecols=(['class', 'text'])).dropna()
test  = p.read_csv('./sub_obj_data/test_ds.csv', usecols=(['class', 'text'])).dropna()
train = train.reindex(np.random.permutation(train.index))

In [46]:
from sklearn.cross_validation import train_test_split

training_data, test_data, training_labels, test_labels = train_test_split(train['text'].values, train['class'].values, test_size=0.3, random_state=0) #70-30 split
print len(training_data), len(test_data)

4162 1784


In [4]:
#we will split the training data into 2 more subsets: development and evaluation in order to first estimate the pipeline parameters
#with the grid search and then evaluate the accuracy of the model with cross validation
dev_data, eval_data, dev_labels, eval_labels = train_test_split(train['text'].values, train['class'].values, test_size=0.5, random_state=0)
print len(dev_data), len(eval_data)

2973 2973


# 2-3. Vectorize tweets and Machine learning

In [5]:
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report

'''helper function for displaying best found features on grid_search'''
def print_grid_search_metrics(gs):
    print("Best score: %0.3f" % gs.best_score_)
    print("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
'''helper function for displaying the algorithm metrics'''
def print_metrics(model_name, y_labels, y_predicted):
    
    print "MODEL: " + model_name
    print 'Test Accuracy: ' + str(metrics.accuracy_score(y_labels, y_predicted))
    
    print '\nClassification report:'
    print classification_report(y_labels, y_predicted, target_names=['non-fashion tweets', 'fashion tweets'])
    
    print '\nConfusion matrix:'
    print metrics.confusion_matrix(y_labels, y_predicted)
    
'''helper to display the most informative features for each group'''
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_names = sorted(zip(clf.coef_[0], feature_names))
    top_features = zip(coefs_with_names[:n], coefs_with_names[:-(n + 1):-1])  #top features for both groups
    for (coef_1, fn_1), (coef_2, fn_2) in top_features:
        print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

#add some tweets specific stop words to the built-in english list
remove = ['amp', 'cc', 'did', 'don', 'rt', 'll', 'oh', 've', 'yes', 'let', 'going', 'via', 're', 'tweet' ]
stop = list(ENGLISH_STOP_WORDS) + remove

In [7]:
import re
from nltk.stem.snowball import *
stemmer = SnowballStemmer('english')

class NoUrls_TfidfVectorizer(TfidfVectorizer):
    def build_preprocessor(self):
        url_pattern = re.compile(r'http(s?)://[\w./]+')
        pic_pattern = re.compile(r'pic.twitter.com/[\w.]+')
        preprocessor = super(NoUrls_TfidfVectorizer, self).build_preprocessor()
        return lambda doc: (pic_pattern.sub('', url_pattern.sub('', preprocessor(doc)) ))
    
class NoUrls_Stemmed_TfidfVectorizer(TfidfVectorizer):
    def build_preprocessor(self):
        url_pattern = re.compile(r'http(s?)://[\w./]+')
        pic_pattern = re.compile(r'pic.twitter.com/[\w.]+')
        preprocessor = super(NoUrls_Stemmed_TfidfVectorizer, self).build_preprocessor()
        return lambda doc: (pic_pattern.sub('', url_pattern.sub('', preprocessor(doc)) ))
    
    def build_tokenizer(self):
        tokenizer = super(NoUrls_Stemmed_TfidfVectorizer, self).build_tokenizer()
        return lambda doc: (stemmer.stem(w) for w in tokenizer(doc))

In [8]:
#ngram_range: lower and upper boundary of the range of n-values for different n-grams to be extracted
#I use words and bi-grams (to consider for example "New York" as unique feature)
#min_df: ignore terms that have a term frequency strictly lower than the given threshold
#because tweets are very short, we consider min_df=1 (consider all)
tfidf = NoUrls_TfidfVectorizer(ngram_range=(1, 2), min_df=1, stop_words=stop, strip_accents='unicode')

In [9]:
#Example process with an specific tweet
tweet = u'rt @harpersbazaar The top 7 swimsuit trends of the season—which will you wear? http://hbazaar.co/60109eOj pic.twitter.com/7J2hR4auMc #pretty'
print 'Preprocess:', tfidf.build_preprocessor()(tweet)
print
print 'Analyze:', tfidf.build_analyzer()(tweet)
tfidf.fit_transform([tweet])
tfidf.vocabulary_

Preprocess: rt @harpersbazaar the top 7 swimsuit trends of the season—which will you wear?   #pretty

Analyze: [u'harpersbazaar', u'swimsuit', u'trends', u'season', u'wear', u'pretty', u'harpersbazaar swimsuit', u'swimsuit trends', u'trends season', u'season wear', u'wear pretty']


{u'harpersbazaar': 0,
 u'harpersbazaar swimsuit': 1,
 u'pretty': 2,
 u'season': 3,
 u'season wear': 4,
 u'swimsuit': 5,
 u'swimsuit trends': 6,
 u'trends': 7,
 u'trends season': 8,
 u'wear': 9,
 u'wear pretty': 10}

## 3.1. Bernoulli Naive Bayes classifier

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score

In [11]:
from sklearn.naive_bayes import BernoulliNB
Bern_classifier = BernoulliNB(binarize=None)
Bern_pipeline = Pipeline([('tfidf', tfidf), ('clf', Bern_classifier)])

In [12]:
Bern_classifier.get_params()

{'alpha': 1.0, 'binarize': None, 'class_prior': None, 'fit_prior': True}

### 3.1.1. Feature selection

In [13]:
#estimate the tfidf and classifier parameters by using grid search with a nested cross validation

parameters = {
    'tfidf__max_df': (0.8, 1.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.1, 0.5, 1)
}

bern_gs = GridSearchCV(Bern_pipeline, parameters, cv=5, verbose=1, refit=False)
bern_gs.fit(dev_data, dev_labels)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   15.5s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   38.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('tfidf', NoUrls_TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
            dtype=<type 'numpy.int64'>, encoding=u'utf-8',
            input=u'content', lowercase=True, max_df=1.0,
            max_features=None, min_df=1, ngram_range=(1, 2), norm=u'l2',
            pr...vocabulary=None)), ('clf', BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidf__max_df': (0.8, 1.0), 'tfidf__norm': ('l1', 'l2'), 'tfidf__ngram_range': ((1, 1), (1, 2)), 'clf__alpha': (0.1, 0.5, 1)},
       pre_dispatch='2*n_jobs', refit=False, scoring=None, verbose=1)

In [14]:
print_grid_search_metrics(bern_gs)

Best score: 0.901
Best parameters set:
	clf__alpha: 0.1
	tfidf__max_df: 0.8
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l2'


### 3.1.2. Model evaluation

In [15]:
#build the model with the best parameters set from the grid search
Bern_vect = NoUrls_TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=0.8, norm='l2', stop_words=stop, strip_accents='unicode')
Bern_classifier = BernoulliNB(alpha=0.5, binarize=None)
Bern_pipeline = Pipeline([('tfidf', Bern_vect), ('clf', Bern_classifier)])

In [16]:
#score: Array of scores of the estimator for each run of the cross validation
score = cross_val_score(Bern_pipeline, eval_data, eval_labels, cv=10)
print "10-fold cross validation accuracy: " + str(np.mean(score))

10-fold cross validation accuracy: 0.90177799062


### 3.1.3. Test metrics

In [17]:
#now we build the final model with all the training data we have and predict the class for the testing data
predictive_model = Bern_pipeline.fit(train['text'].values, train['class'].values)
y_Bern_predicted = Bern_pipeline.predict(test['text'].values)

In [18]:
print_metrics("Bernoulli Naive Bayes", test['class'].values, y_Bern_predicted)

MODEL: Bernoulli Naive Bayes
Test Accuracy: 0.586330935252

Classification report:
                    precision    recall  f1-score   support

non-fashion tweets       0.69      0.32      0.43       139
    fashion tweets       0.56      0.86      0.67       139

       avg / total       0.62      0.59      0.55       278


Confusion matrix:
[[ 44  95]
 [ 20 119]]


In [19]:
show_most_informative_features(Bern_vect, Bern_classifier)

	-8.6911	000000         		-4.0973	just           
	-8.6911	000758         		-4.1634	good           
	-8.6911	00pm           		-4.4203	quot           
	-8.6911	01             		-4.4733	work           
	-8.6911	0118704263     		-4.5813	love           
	-8.6911	02             		-4.5871	like           
	-8.6911	03             		-4.6023	sleep          
	-8.6911	039            		-4.6317	know           
	-8.6911	03am           		-4.6661	day            
	-8.6911	05             		-4.7297	lol            
	-8.6911	05ipoztuupq    		-4.7442	today          
	-8.6911	06             		-4.7522	time           
	-8.6911	07             		-4.7570	twitter        
	-8.6911	0900           		-4.7686	really         
	-8.6911	09109839513    		-4.7812	night          
	-8.6911	09am           		-4.8025	got            
	-8.6911	09left         		-4.8413	bed            
	-8.6911	0e6a80e8aea4   		-4.8630	sad            
	-8.6911	10037          		-4.8636	im             
	-8.6911	1039           		-4.9288	thanks         


## 3.2. Logistic regression classifier

In [20]:
from sklearn.linear_model import LogisticRegression
logistic_tfidf = NoUrls_TfidfVectorizer(min_df=1, stop_words=stop, strip_accents='unicode')
logistic_classifier = LogisticRegression()
logistic_pipeline = Pipeline([('tfidf', logistic_tfidf), ('clf', logistic_classifier)])

In [21]:
logistic_classifier.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### 3.2.1. Feature selection

In [22]:
parameters = {
    'tfidf__max_df': (0.8, 1.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1, 5, 7)
}

logistic_gs = GridSearchCV(logistic_pipeline, parameters, verbose=1, refit=False)
logistic_gs.fit(dev_data, dev_labels)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   17.8s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:   26.2s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tfidf', NoUrls_TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
            dtype=<type 'numpy.int64'>, encoding=u'utf-8',
            input=u'content', lowercase=True, max_df=1.0,
            max_features=None, min_df=1, ngram_range=(1, 1), norm=u'l2',
            pr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidf__max_df': (0.8, 1.0), 'clf__C': (1, 5, 7), 'tfidf__norm': ('l1', 'l2'), 'tfidf__ngram_range': ((1, 1), (1, 2))},
       pre_dispatch='2*n_jobs', refit=False, scoring=None, verbose=1)

In [23]:
print_grid_search_metrics(logistic_gs)

Best score: 0.905
Best parameters set:
	clf__C: 7
	tfidf__max_df: 0.8
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l2'


### 3.2.2. Model evaluation

In [41]:
#build the model with the best parameters set from the grid search
logistic_vect = NoUrls_TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, norm='l2', stop_words=stop, strip_accents='unicode')
logistic_classifier = LogisticRegression(C=7)
logistic_pipeline = Pipeline([('tfidf', logistic_vect), ('clf', logistic_classifier)])

In [42]:
#score: Array of scores of the estimator for each run of the cross validation
score = cross_val_score(logistic_pipeline, eval_data, eval_labels, cv=10)
print "10-fold cross validation accuracy: " + str(np.mean(score))

10-fold cross validation accuracy: 0.909176366727


### 3.2.3. Test metrics

In [44]:
#now we build the final model with all the training data we have and predict the class for the testing data
predictive_model = logistic_pipeline.fit(train['text'].values, train['class'].values)
y_logistic_predicted = logistic_pipeline.predict(test['text'].values)

In [45]:
print_metrics("Logistic Regression", test['class'].values, y_logistic_predicted)

MODEL: Logistic Regression
Test Accuracy: 0.604316546763

Classification report:
                    precision    recall  f1-score   support

non-fashion tweets       0.70      0.37      0.48       139
    fashion tweets       0.57      0.84      0.68       139

       avg / total       0.63      0.60      0.58       278


Confusion matrix:
[[ 51  88]
 [ 22 117]]


In [28]:
show_most_informative_features(logistic_vect, logistic_classifier)

	-9.4061	sunday         		5.3906	quot           
	-9.2219	tomorrow       		5.1342	good           
	-9.1279	saturday       		4.9169	sleep          
	-8.5340	1st            		4.7957	thanks         
	-7.5426	friday         		4.6266	sad            
	-7.0390	2nd            		4.4710	bed            
	-6.4407	monday         		4.2043	twitter        
	-6.2973	nov            		4.1683	nice           
	-5.9665	sun            		4.0886	happy          
	-5.7004	november       		4.0639	yay            
	-5.6654	thursday       		3.9524	sorry          
	-5.6539	4th            		3.8159	lol            
	-5.4749	3rd            		3.7865	thank          
	-5.3899	october        		3.7184	miss           
	-5.0321	tonight        		3.6278	excited        
	-4.8878	sat            		3.4840	ugh            
	-4.6580	january        		3.2006	glad           
	-4.6291	august         		3.1814	love           
	-4.4330	oct            		3.1715	bad            
	-4.2284	march          		3.1295	missed         


## 3.3. Linear SVM

In [29]:
from sklearn.svm import LinearSVC
SVM_tfidf = NoUrls_TfidfVectorizer(min_df=1, stop_words=stop, strip_accents='unicode')
SVM_classifier = LinearSVC()
SVM_pipeline = Pipeline([('tfidf', SVM_tfidf), ('clf', SVM_classifier)])

In [30]:
SVM_classifier.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

### 3.3.1. Feature selection

In [31]:
parameters = {
    'tfidf__max_df': (0.8, 1.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1, 5, 7)
}

SVM_gs = GridSearchCV(SVM_pipeline, parameters, verbose=1, refit=False)
SVM_gs.fit(dev_data, dev_labels)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   17.3s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:   28.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tfidf', NoUrls_TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
            dtype=<type 'numpy.int64'>, encoding=u'utf-8',
            input=u'content', lowercase=True, max_df=1.0,
            max_features=None, min_df=1, ngram_range=(1, 1), norm=u'l2',
            pr...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidf__max_df': (0.8, 1.0), 'clf__C': (1, 5, 7), 'tfidf__norm': ('l1', 'l2'), 'tfidf__ngram_range': ((1, 1), (1, 2))},
       pre_dispatch='2*n_jobs', refit=False, scoring=None, verbose=1)

In [32]:
print_grid_search_metrics(SVM_gs)

Best score: 0.910
Best parameters set:
	clf__C: 5
	tfidf__max_df: 0.8
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l2'


### 3.3.2. Model evaluation

In [33]:
#build the model with the best parameters set from the grid search
SVM_vect = NoUrls_TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.8, norm='l2', stop_words=stop, strip_accents='unicode')
SVM_classifier = LinearSVC(C=5)
SVM_pipeline = Pipeline([('tfidf', SVM_vect), ('clf', SVM_classifier)])

In [34]:
#score: Array of scores of the estimator for each run of the cross validation
score = cross_val_score(SVM_pipeline, eval_data, eval_labels, cv=10)
print "10-fold cross validation accuracy: " + str(np.mean(score))

10-fold cross validation accuracy: 0.919262696192


### 3.3.3. Test metrics

In [35]:
#now we build the final model with all the training data we have and predict the class for the testing data
predictive_model = SVM_pipeline.fit(train['text'].values, train['class'].values)
y_SVM_predicted = SVM_pipeline.predict(test['text'].values)

In [36]:
print_metrics("SVM", test['class'].values, y_SVM_predicted)

MODEL: SVM
Test Accuracy: 0.575539568345

Classification report:
                    precision    recall  f1-score   support

non-fashion tweets       0.64      0.34      0.44       139
    fashion tweets       0.55      0.81      0.66       139

       avg / total       0.60      0.58      0.55       278


Confusion matrix:
[[ 47  92]
 [ 26 113]]


In [37]:
show_most_informative_features(SVM_vect, SVM_classifier)

	-4.7361	tomorrow       		2.8286	night tomorrow 
	-4.5497	sunday         		2.1700	good           
	-4.4437	saturday       		2.1569	quot           
	-3.9741	1st            		2.1058	sleep          
	-3.7482	friday         		1.9974	bed            
	-3.4623	2nd            		1.9591	thanks         
	-3.2018	monday         		1.9496	nice           
	-3.0784	sun            		1.8708	sad            
	-2.9613	thursday       		1.6482	twitter        
	-2.9308	nov            		1.6408	excited        
	-2.8407	3rd            		1.6221	happy          
	-2.6749	4th            		1.6017	lol            
	-2.6586	november       		1.5790	miss           
	-2.5581	sat            		1.5524	sorry          
	-2.4470	tonight        		1.5495	ready weekend  
	-2.4395	october        		1.4798	yay            
	-2.3204	wednesday      		1.4698	thank          
	-2.1830	august         		1.4640	ugh            
	-2.1377	january        		1.4632	missed         
	-2.1304	march          		1.4218	love           


# 4. Wrongly classified tweets with logistic regression

In [38]:
wrong_classified = y_logistic_predicted != test['class'].values

In [39]:
wrong_classified

array([False, False,  True, False,  True,  True, False, False, False,
       False,  True, False,  True, False,  True,  True, False,  True,
       False, False, False, False,  True, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False,  True,  True, False, False,  True, False,
        True,  True, False,  True, False, False, False, False, False,
       False,  True, False, False,  True, False,  True, False,  True,
        True, False, False, False, False,  True,  True, False,  True,
       False, False, False, False, False,  True, False, False, False,
        True, False, False,  True,  True,  True, False,  True, False,
        True, False,  True, False, False,  True,  True, False,  True,
       False, False, False,  True, False,  True,  True,  True,  True,
       False,  True,  True, False, False, False,  True,  True, False,
        True, False, False,  True, False, False, False,  True,  True,
       False,  True,

In [47]:
wrong_data = test_data[wrong_classified == True]
wrong_labels = y_logistic_predicted[wrong_classified == True]

  if __name__ == '__main__':


In [None]:
false_positive = wrong_data[wrong_labels == 1] #labeled as 1 (fashion) when should be 0 (non-fashion)

In [None]:
len(false_positive)

In [None]:
#example tweet
false_positive[3]

In [None]:
false_negative = wrong_data[wrong_labels == 0]  #labeled as 0 (non-fashion) when they should belong to 1 (fashion)

In [None]:
len(false_negative)

In [None]:
#example tweet
false_negative[24]