In [40]:
#Importing required modules
from lxml import objectify
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale
from sklearn import grid_search
from sklearn.tree import DecisionTreeClassifier

In [2]:
#Opening the TASS 2015 CSV file
tassdb = pd.read_csv('TASS_tweet_database.csv', encoding='latin1')

In [3]:
tassdb.head()

Unnamed: 0,tweetid,user,content,date,lang,value,topic
0,1.42378e+17,jesusmarana,"Portada 'Público', viernes. Fabra al banquillo...",02/12/2011 0:03,es,N,política
1,1.42379e+17,EvaORegan,"Grande! RT @veronicacalderon ""El periodista es...",02/12/2011 0:06,es,NONE,política
2,1.42379e+17,LosadaPescador,Gonzalo Altozano tras la presentación de su li...,02/12/2011 0:06,es,P+,otros
3,1.4238e+17,mgilguerrero,"Mañana en Gaceta: TVE, la que pagamos tú y yo,...",02/12/2011 0:09,es,N,entretenimiento
4,1.42381e+17,pedroj_ramirez,Qué envidia @mfcastineiras: Pedro mañana x la...,02/12/2011 0:14,es,NONE,otros


In [4]:
#getting the root node from the XML file
tassdb.content[0]

u"Portada 'P\xfablico', viernes. Fabra al banquillo por 'orden' del Supremo; Wikileaks 'retrata' a 160 empresas esp\xedas. http://t.co/YtpRU0fd"

In [5]:
tassdb = (tassdb[tassdb.value != 'NONE'])

In [6]:
len(tassdb)

45396

In [7]:
tassdb.head()

Unnamed: 0,tweetid,user,content,date,lang,value,topic
0,1.42378e+17,jesusmarana,"Portada 'Público', viernes. Fabra al banquillo...",02/12/2011 0:03,es,N,política
2,1.42379e+17,LosadaPescador,Gonzalo Altozano tras la presentación de su li...,02/12/2011 0:06,es,P+,otros
3,1.4238e+17,mgilguerrero,"Mañana en Gaceta: TVE, la que pagamos tú y yo,...",02/12/2011 0:09,es,N,entretenimiento
5,1.42383e+17,mgilguerrero,Más mañana en Gaceta. Amaiur depende de Uxue B...,02/12/2011 0:20,es,N,política
6,1.42383e+17,SSantiagosegura,"Muy buenas noches followercetes, mañana va a s...",02/12/2011 0:20,es,P+,otros


In [8]:
tassdb.value.value_counts()

P+     23545
N      12839
N+      5575
P       1939
NEU     1498
Name: value, dtype: int64

In [9]:
values = {'P+': 'P', 'P': 'P', 'NEU': 'NEU', 'N': 'N', 'N+': 'N'}

In [10]:
tassdb.value.replace(values, inplace=True)

In [11]:
tassdb.head()

Unnamed: 0,tweetid,user,content,date,lang,value,topic
0,1.42378e+17,jesusmarana,"Portada 'Público', viernes. Fabra al banquillo...",02/12/2011 0:03,es,N,política
2,1.42379e+17,LosadaPescador,Gonzalo Altozano tras la presentación de su li...,02/12/2011 0:06,es,P,otros
3,1.4238e+17,mgilguerrero,"Mañana en Gaceta: TVE, la que pagamos tú y yo,...",02/12/2011 0:09,es,N,entretenimiento
5,1.42383e+17,mgilguerrero,Más mañana en Gaceta. Amaiur depende de Uxue B...,02/12/2011 0:20,es,N,política
6,1.42383e+17,SSantiagosegura,"Muy buenas noches followercetes, mañana va a s...",02/12/2011 0:20,es,P,otros


In [12]:
tassdb.value.value_counts()

P      25484
N      18414
NEU     1498
Name: value, dtype: int64

In [13]:
#First step to standarize the text data is to unifiy all text into lower cases
tassdb.content = tassdb.content.str.lower()

In [14]:
tassdb.content = tassdb.content.str.replace('/ ' , '/')

In [15]:
tassdb.content = tassdb.content.str.replace('w. ', 'w.')

In [16]:
tassdb = tassdb.reset_index()
del(tassdb['index']) 

In [17]:
tassdb.head()

Unnamed: 0,tweetid,user,content,date,lang,value,topic
0,1.42378e+17,jesusmarana,"portada 'público', viernes. fabra al banquillo...",02/12/2011 0:03,es,N,política
1,1.42379e+17,LosadaPescador,gonzalo altozano tras la presentación de su li...,02/12/2011 0:06,es,P,otros
2,1.4238e+17,mgilguerrero,"mañana en gaceta: tve, la que pagamos tú y yo,...",02/12/2011 0:09,es,N,entretenimiento
3,1.42383e+17,mgilguerrero,más mañana en gaceta. amaiur depende de uxue b...,02/12/2011 0:20,es,N,política
4,1.42383e+17,SSantiagosegura,"muy buenas noches followercetes, mañana va a s...",02/12/2011 0:20,es,P,otros


In [18]:
tassdb.content[0]

u"portada 'p\xfablico', viernes. fabra al banquillo por 'orden' del supremo; wikileaks 'retrata' a 160 empresas esp\xedas. http://t.co/ytpru0fd"

In [19]:
#Then from the strings library we import the punctuation list
#and from nklt the stop words list in this case for spanish
#I decided to add some aadditional terms that I want to remove in order to obatin a cleaner
#list of the most used words
punctuation = list(string.punctuation)
punctuation2 = punctuation + [u"'" + u'"' + u'\xa1', u'\xbf']
stop = stopwords.words('spanish') + punctuation2 + ['rt', 'via', u'', u'\u2026,', u'\u2026', u'...', u'...,']

In [20]:
#To obtain the actual text for each tweet we split each word
tasssplit = tassdb.content.str.split(' ')

In [21]:
tasstweetlist = []
for i in range (0, len(tasssplit)):
    a= [term.rstrip(''.join(punctuation)) for term in tasssplit[i]]
    tasstweetlist.append(a)

In [22]:
tasstweetlist1 = []
for i in range (0, len(tasstweetlist)):
    tasstweetlist1.append([term for term in tasstweetlist[i] if term not in stop if "@" not in term\
                           if '#' not in term if term.startswith('http:') == False])

In [23]:
tasstweetlist2 = []
for i in range (0, len(tasstweetlist1)):
    a= [term.lstrip(''.join(punctuation2)) for term in tasstweetlist1[i]]
    tasstweetlist2.append(a)

In [24]:
tasstweetlist2ws = []
for i in range (0, len(tasstweetlist2)):
    a = " ".join(tasstweetlist2[i])
    tasstweetlist2ws.append(a)

In [25]:
tassfinal = pd.concat([pd.DataFrame(tasstweetlist2ws), tassdb.value], axis=1)
tassfinal.columns = ['text', 'values']

In [67]:
tassfinal.to_csv('Tass_final.csv', encoding='utf-8')

In [57]:
tassfinal.text[28365]

u'miedo da si dieran goya fidel castro saliera agradecer premio unas breves palabras'

In [58]:
vectorizer = CountVectorizer(min_df=5, max_df = 0.5, ngram_range=(1,2))
vectors = vectorizer.fit_transform(tassfinal['text'].tolist())

In [59]:
len(tassfinal['values'])

45396

In [60]:
X_train, X_validation, y_train, y_validation = train_test_split(vectors, tassfinal['values'], \
                                                                test_size=0.25, random_state=42)

In [61]:
lin_svc = svm.LinearSVC(max_iter=4000, multi_class='ovr', random_state=None, penalty='l2', \
                       loss='squared_hinge', C=.1, tol=0.0001)
lin_svc.fit(X_train, y_train)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=4000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [62]:
lin_svc.score(X_validation, y_validation)

0.88307339853731603

In [54]:
Ran_for = RandomForestClassifier(n_estimators=300, criterion="entropy", max_depth=None, max_leaf_nodes=None, bootstrap=True, \
                                 n_jobs=5, random_state=None, max_features='sqrt')
Ran_for.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=5,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [53]:
Ran_for.score(X_validation, y_validation)

0.85672746497488761

In [63]:
dtclass = DecisionTreeClassifier(criterion= 'entropy', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, \
                                    min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=None, max_leaf_nodes=None)
dtclass.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [64]:
dtclass.score(X_validation, y_validation)

0.79866067494933479

In [135]:
parameters = {'C': (0.1 , 0.5, 0.7, 1),
              'loss':('hinge', 'squared_hinge'), 
              'max_iter': (500, 1000, 2000, 4000)}

In [143]:
svr = svm.LinearSVC(multi_class='ovr', random_state=None, penalty='l2', tol=0.0001)
clf = grid_search.GridSearchCV(estimator=svr, param_grid=parameters, scoring=None, n_jobs= 4)
clf.fit(vectors, tassfinal['values'])

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'loss': ('hinge', 'squared_hinge'), 'C': (0.1, 0.5, 0.7, 1), 'max_iter': (500, 1000, 2000, 4000)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [144]:
clf.best_estimator_ 

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [145]:
clf.best_score_ 

0.85150674068199839