## Importing Utilities

In [1]:
import pandas as pd
import numpy as np
import os
import re
import spacy
nlp = spacy.load('es_core_news_md')

## Reading Data

In [2]:
train = pd.read_csv('Train_twitter_utf8_csv.csv')
test = pd.read_csv('Test_twitter_utf8_csv.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,tweetid,user,content,date,lang,polarity
0,"0,768213876278165504,OnceBukowski,""-Me caes mu...",,,,,,
1,#¿NOMBRE?,,,,,,
2,-Por qué tan Otako,deja de ser otako,,,,,
3,"-Haber si me muero"",2016-08-23 22:30:35,es,NONE",,,,,,
4,"1,768213567418036224,anahorxn,""@myendlesshazza...",,,,,,


In [4]:
test.head()

Unnamed: 0.1,Unnamed: 0,tweetid,user,content,date,lang,polarity
0,0,7.709766391739514e+17,noseashetero,@noseashetero 1000/10 de verdad a ti que voy a...,2016-08-31 13:28:49,es,P
1,1,7.710924218663895e+17,Templelx,@piscolabisaereo @HistoriaNG @SPosteguillo las...,2016-08-31 21:08:54,es,P
2,"2,771092111429083136,esskuu94,""Al final han si...",,,,,,
3,3,7.710920705724499e+17,__ariadna9,@Jorge_Ruiz14 yo no tengo tiempo para esas cos...,2016-08-31 21:07:30,es,N
4,4,7.710941925086003e+17,_cristtina15_,@_MissChaotic_ ves ese brillo? es un coso que ...,2016-08-31 21:15:56,es,N


## Taking only Content and Polarity

In [5]:
train_subset = train[['content','polarity']][train.polarity.isin(['P','N'])]
test_subset = test[['content','polarity']][test.polarity.isin(['P','N'])]

In [6]:
train_subset.head()

Unnamed: 0,content,polarity
7,@estherct209 jajajaja la tuya y la d mucha gen...,N
8,Quiero mogollón a @AlbaBenito99 pero sobretodo...,P
9,Vale he visto la tia bebiendose su regla y me ...,N
10,@Yulian_Poe @guillermoterry1 Ah. mucho más por...,P
13,11. siiii fue super gracioso teniamos que habe...,P


## Normalize the case

In [7]:
train_subset['content_norm'] = train_subset.apply(lambda row:row['content'].lower(),axis=1)
test_subset['content_norm'] = test_subset.apply(lambda row:row['content'].lower(),axis=1)

In [8]:
train_subset.head()

Unnamed: 0,content,polarity,content_norm
7,@estherct209 jajajaja la tuya y la d mucha gen...,N,@estherct209 jajajaja la tuya y la d mucha gen...
8,Quiero mogollón a @AlbaBenito99 pero sobretodo...,P,quiero mogollón a @albabenito99 pero sobretodo...
9,Vale he visto la tia bebiendose su regla y me ...,N,vale he visto la tia bebiendose su regla y me ...
10,@Yulian_Poe @guillermoterry1 Ah. mucho más por...,P,@yulian_poe @guillermoterry1 ah. mucho más por...
13,11. siiii fue super gracioso teniamos que habe...,P,11. siiii fue super gracioso teniamos que habe...


## Remove stop words and punctuation

In [9]:
from nltk.corpus import stopwords
import string
stop_words = stopwords.words('spanish') + list(string.punctuation)

## Assemble into a tokenize function

In [10]:
def tokenize(text):
    doc = nlp(text)
    pos_lemma = [token.lemma_ if token.pos_ in ['NOUN','ADJ'] else token.text for token in doc] ##lemmatize only Noun,Adjective
    tokens = [word for word in pos_lemma if word not in stop_words] #Remove stop words and punctuation
    return tokens

In [11]:
from sklearn.feature_extraction.text import CountVectorizer  

In [12]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = False,
    max_features = 100 # we are only considering first 100 features for sentiment classification
)

In [13]:
corpus_data_features = vectorizer.fit_transform(train_subset.content_norm.tolist())
corpus_data_features_nd = corpus_data_features.toarray()

In [14]:
corpus_data_features_nd

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
corpus_data_features_nd.shape

(410, 100)

## Importing Machine Learning Algorithm Logistic Regression for classification task

In [16]:
from sklearn.linear_model import LogisticRegression

## Splitting the reviews into train and val sets in the ratio of 80:20
## train set will be used for training the model to understand the data
## val set will be used to check the performance of our model

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(corpus_data_features_nd[0:len(train_subset)],train_subset.polarity,random_state = 42,train_size = 0.8)



## Training the model on train set

In [18]:
log_model = LogisticRegression()
log_model = log_model.fit(X = X_train, y = y_train)



## Predicting on the val set

In [19]:
y_pred = log_model.predict(X_test)

## Checking the accuracy on validation set in %

In [20]:
from sklearn.metrics import classification_report,accuracy_score

acc = accuracy_score(y_test,y_pred)
np.round(acc*100,2)

68.29

## Testing on Test Samples

In [21]:
corpus_data_features_test = vectorizer.fit_transform(test_subset.content_norm.tolist())
corpus_data_features_test_nd = corpus_data_features_test.toarray()

## Predicting on Test set

In [22]:
y_test_pred = log_model.predict(corpus_data_features_test_nd)

## Checking accuracy on test set in %

In [23]:
y_test = test_subset.polarity.values
acc = accuracy_score(y_test,y_test_pred)
np.round(acc*100,2)

58.05

## Checking output

In [24]:
test_subset['pred'] = y_test_pred

In [25]:
test_subset[['content','polarity','pred']].head()

Unnamed: 0,content,polarity,pred
0,@noseashetero 1000/10 de verdad a ti que voy a...,P,P
1,@piscolabisaereo @HistoriaNG @SPosteguillo las...,P,N
3,@Jorge_Ruiz14 yo no tengo tiempo para esas cos...,N,P
4,@_MissChaotic_ ves ese brillo? es un coso que ...,N,N
5,Tengo una perrina adorable... Sabéis que me ac...,P,P
